100 | <- &sof redirect comment_or_includes block_line? {return "comment_or_includes", nil /*return [r].concat(cil, bl || []);*/ }
101 | / block_lines
102 | / & '<' rs:( cm:comment &eolf {return cm, nil /*return c;*/ }
103 | // avoid a paragraph if we know that the line starts with a block tag
104 | / block_tag
105 | ) {return rs, nil /*return rs;*/ }
106 | / paragraph
107 | // Inlineline includes generic tags; wrapped into paragraphs in token
108 | // transform and DOM postprocessor
109 | / inlineline
110 | / s:sol !inline_breaks {return s, nil /*return s;*/ }
111 |
112 |
// A block nested in other constructs. Avoid eating end delimiters for other
// constructs by checking against inline_breaks first (a semantic predicate
// that consults the tokenizer's syntax-stop state; see inline_breaks below).
//
nested_block <- !inline_breaks b:block {return b, nil /*return b;*/ }
117 |
118 |
// The same, but suitable for use inside a table construct.
// Doesn't match table_heading_tag, table_row_tag, table_data_tag,
// table_caption tag, or table_end_tag, although it does allow
// table_start_tag (for nested tables).
//
nested_block_in_table
  <-
    // avoid recursion via nested_block_in_table, as that can lead to stack
    // overflow in large tables
    // See https://phabricator.wikimedia.org/T59670
    // State-change block: mark that we are inside table-cell content so
    // nested rules can refuse to cross cell boundaries.
    #{
    push(c, "tableDataBlock", true)
    return nil
    /*
    return stops.push('tableDataBlock', true);
    */
    }
    // XXX: don't rely on a lame look-ahead like this; use syntax stops
    // instead, so that multi-line th content followed by a line prefixed with
    // a comment is also handled. Alternatively, implement a sol look-behind
    // assertion accepting spaces and comments.
    !(sol (space* sol)? space* (pipe / "!")) b:nested_block
    // Undo the "tableDataBlock" marker pushed above.
    #{pop(c, "tableDataBlock"); return nil}
    {
    return b, nil
    /*
    stops.pop('tableDataBlock');
    return b;
    */
    }
149 |
150 |
// Line-based block constructs.
//
// Matches a start-of-line marker, optionally one empty line (spaces followed
// by another sol), then a single block_line. There is no action, so the rule
// yields the parser's default value for the sequence.
block_lines
  <- s:sol
     // eat an empty line before the block
     (s2:(os:optionalSpaceToken so:sol))?
     bl:block_line
158 |
// Horizontal rules: four dashes at the start of a line, plus any number of
// extra dashes. The Go action emits an <hr> element node; the ported JS
// below additionally recorded source offsets and surplus-dash counts.
hr <- "----" "-"*
  // Check if a newline or content follows
  ( &sol "" {return nil, nil /*return undefined;*/ } / "" {return true, nil /*return true;*/ } ) {
  return &html.Node{
      Type: html.ElementNode,
      Data: "hr",
  }, nil
  /*
  var dataAttribs = {
      tsr: tsrOffsets(),
      lineContent: lineContent,
  };
  if (d.length > 0) {
      dataAttribs.extra_dashes = d.length;
  }
  return new SelfclosingTagTk('hr', [], dataAttribs);
  */
  }
178 |
179 |
// Block structures with start-of-line wiki syntax:
// headings, list items, horizontal rules, or (after optional leading
// whitespace) a table line or a run of block tags ending the line.
//
block_line
  <- heading
  / list_item
  / hr
  / st: space_or_newline*
    r:( & [ <{}|!] tl:table_line {return tl, nil /*return tl;*/ }
      // tag-only lines should not trigger pre either
      / bts:(bt:block_tag stl:optionalSpaceToken {return concat(bt, stl), nil /*return bt.concat(stl);*/ })+
        &eolf {return bts, nil /*return bts;*/ }
    ) {return concat(st, r), nil
    /*
    return st.concat(r);
    */
    }
196 |
197 |
// A paragraph. We don't emit 'p' tokens to avoid issues with template
// transclusions, tags in the source and the like. Instead, we perform
// some paragraph wrapping on the token stream and the DOM.
//
// In this Go port the inline content is wrapped directly in a <p> element.
// Requires two consecutive start-of-line markers (i.e. a blank line) before
// the inline content.
paragraph
  <- s1:sol s2:sol c1:inlineline {
      n := &html.Node{
          Type: html.ElementNode,
          Data: "p",
      }
      addChild(n, c1)
      return n, nil
  }
211 |
// br: optional trailing space followed (look-ahead only) by a newline
// produces a <br> element node. The newline itself is not consumed here.
br <- optionalSpaceToken &newline {
    return &html.Node{
        Type: html.ElementNode,
        Data: "br",
    }, nil
    /*
    return s.concat([
        new SelfclosingTagTk('br', [], { tsr: tsrOffsets() }),
    ]);
    */
}
223 |
224 | inline_breaks <- & { return inlineBreaks(c) }
225 |
// One or more runs of URL-ish text or single inline elements.
inlineline
  <- ((r:urltext)
  / inlineline_element)+
229 |
// A single inline element, falling back to any non-newline character,
// provided we are not at an inline break point.
inlineline_element
  <- !inline_breaks
     r:(inline_element / [^\r\n])
     {return r, nil}
234 |
// Inline elements, dispatched on their first character: xmlish tags and
// comments on '<', templates/template args on '{', language-variant markup
// on "-{", wikilinks/external links on '[', and quote runs on "'".
inline_element
  <- & '<' r:( xmlish_tag
           / comment
         ) {return r, nil /*return r;*/ }
  / & '{' r:tplarg_or_template {return r, nil/* return r; */}
  / & "-{" r:lang_variant_or_tpl {return r, nil/* return r; */}
  // FIXME: The php parser's replaceInternalLinks2 splits on [[, resulting
  // in sequences with odd number of brackets parsing as text, and sequences
  // with even number of brackets having its innermost pair parse as a
  // wikilink. For now, we faithfully reproduce what's found there but
  // wikitext, the language, shouldn't be defined by odd tokenizing behaviour
  // in the php parser. Flagging this for a future cleanup.
  / ("[[" &'[')+
  / & '[' r:( wikilink / extlink ) {return r, nil/* return r; */}
  / & "'" r:quote {return r, nil/* return r; */}
250 |
// Headings
252 |
// A section heading: a run of '=' signs, optional inline content, and a
// closing '=' run, all on one line. The Go action derives the level from
// len(concat(s)) only.
// NOTE(review): the ported JS below clamps the level to 6 and folds surplus
// '=' signs back into text; the Go action does not — confirm whether that
// behavior is still wanted.
heading <- & "=" // guard, to make sure '='+ will match.
          // XXX: Also check to end to avoid inline parsing?
    r:(
     // keep a count of open headings on the 'h' stop while parsing the body
     #{ inc(c, "h"); return nil /*return stops.inc('h');*/ }
     s:'='+ // moved in here to make s accessible to inner action
     ce:(
       (ill:(inlineline?))
       '='+ {return ill, nil}
     )?
     & {
       return ce!=nil || len(concat(s)) > 2, nil
       /*return ce || s.length > 2;*/
     }
     //("" {return nil, nil /*return endOffset();*/ })
     spc:(spaces / comment)*
     &eolf
     #{dec(c, "h"); return nil}
     {
     n := &html.Node{
         Type: html.ElementNode,
         Data: "h"+strconv.Itoa(len(concat(s))),
     }
     addChild(n, []interface{}{ce, spc})
     return n, nil
     /*
     var c;
     var e;
     var level;
     stops.dec('h');
     if (ce) {
         c = ce[0];
         e = ce[1];
         level = Math.min(s.length, e.length);
     } else {
         // split up equal signs into two equal parts, with at least
         // one character in the middle.
         level = Math.floor((s.length - 1) / 2);
         c = ['='.repeat(s.length - 2 * level)];
         s = e = '='.repeat(level);
     }
     level = Math.min(6, level);
     // convert surplus equals into text
     if (s.length > level) {
         var extras1 = s.substr(0, s.length - level);
         if (c[0].constructor === String) {
             c[0] = extras1 + c[0];
         } else {
             c.unshift(extras1);
         }
     }
     if (e.length > level) {
         var extras2 = e.substr(0, e.length - level);
         var lastElem = lastItem(c);
         if (lastElem.constructor === String) {
             c[c.length - 1] += extras2;
         } else {
             c.push(extras2);
         }
     }

     var tsr = tsrOffsets('start');
     tsr[1] += level;
     return [
         new TagTk('h' + level, [], { tsr: tsr }),
     ].concat(c, [
         new EndTagTk('h' + level, [], { tsr: [endTPos - level, endTPos] }),
         spc,
     ]);
     */
     }
   ) {
     return r, nil /*return r;*/
   }
326 |
327 |
// Comments
329 |
// The php parser does a straight str.replace(/<!--((?!-->).)*-->/g, "")
331 | // but, as always, things around here are a little more complicated.
332 | //
333 | // We accept the same comments, but because we emit them as HTML comments
334 | // instead of deleting them, we have to encode the data to ensure that
335 | // we always emit a valid HTML5 comment. See the encodeComment helper
336 | // for further details.
337 |
// An HTML-style comment: "<!--" followed by anything up to the first "-->"
// (or end of input, for an unclosed comment). The captured interior text c1
// becomes the Data of an html.CommentNode.
//
// The opening literal and the c1 label were garbled in this copy of the
// grammar (the action already referenced c1); restored here so the rule is
// well-formed and consistent with the `& '<'` dispatch in inline_element.
comment
  <- "<!--" c1:(!"-->" .)* ("-->" / eof) {
    return &html.Node{
        Type: html.CommentNode,
        Data: concat(c1),
    }, nil
    /*
    var data = DU.encodeComment(c);
    return [new CommentTk(data, { tsr: tsrOffsets() })];
    */
  }
349 |
350 |
// Behavior switches. See:
// https://www.mediawiki.org/wiki/Help:Magic_words#Behavior_switches
// NOTE(review): the Go action returns the literal string "behavior_text"
// instead of the matched switch source — this looks like a placeholder;
// confirm whether the matched text should be returned.
behavior_switch
  <- ("__" behavior_text "__") {return "behavior_text", nil
  /*
  if (env.conf.wiki.isMagicWord(bs)) {
      return [
          new SelfclosingTagTk('behavior-switch', [ new KV('word', bs) ],
              { tsr: tsrOffsets(), src: bs, magicSrc: bs }
          ),
      ];
  } else {
      return [ bs ];
  }
  */
  }
367 |
// Instead of defining a charset, php's doDoubleUnderscore concats a regexp of
// all the language specific aliases of the behavior switches and then does a
// match and replace. Just be as permissive as possible and let the
// BehaviorSwitchPreprocessor back out of any overreach.
// Matches one or more characters allowed inside a __SWITCH__ word.
behavior_text <- ( !"__" [^'"<~[{\n\r:;\]}|!=] )+
373 |
374 |
375 | // ************************************************************
376 | // External (bracketed and autolinked) links
377 | // ************************************************************/
378 |
// Free (unbracketed) autolinks: bare URLs, RFC/PMID references, or ISBNs.
// Not allowed while inside an extlink.
// NOTE(review): the second negative predicate below returns true, which makes
// the whole rule always fail — the word-boundary check appears to be an
// unported stub; confirm whether autolinks are intentionally disabled.
autolink
  <- ! {
       extlink, _ := peek(c, "extlink").(bool)
       return extlink, nil
       /*return stops.onStack('extlink');*/
     }
     // this must be a word boundary, so previous character must be non-word
     ! {return true, nil /*return /\w/.test(input[endOffset() - 1] || '');*/ }
     r:(
        // urllink, inlined
        target:autourl {
            return target, nil
            /*
            var res = [new SelfclosingTagTk('urllink', [new KV('href', target)], { tsr: tsrOffsets() })];
            return res;
            */
        }
      / autoref
      / isbn) {return r, nil /*return r;*/ }
398 |
// Bracketed external link: "[" protocol+address, optional spaces, optional
// inline content, "]". Produces an <a class="external" rel="nofollow"> node.
// The "extlink" stop is pushed for the duration of the match so nested rules
// know they are inside an external link (extlinks cannot nest).
extlink
  <- ! {
       extlink, _ := peek(c, "extlink").(bool)
       return extlink, nil
       /* return stops.onStack('extlink'); */
     } // extlink cannot be nested
     "["
     # {push(c, "extlink", true); return nil /*return stops.push('extlink', true); */ }
     addr:(url_protocol urladdr / "")
     target:(extlink_preprocessor_text / "")
     & {
       // TODO: smarter check
       return true, nil
       /*
       // Protocol must be valid and there ought to be at least one
       // post-protocol character. So strip last char off target
       // before testing protocol.
       var flat = tu.flattenString([addr, target]);
       if (Array.isArray(flat)) {
           // There are templates present, alas.
           return flat.length > 0;
       }
       return Util.isProtocolValid(flat.slice(0, -1), env);
       */
     }
     ( space / unispace )*
     //( "" {return nil, nil /*return endOffset();*/ })
     content:inlineline?
     "]"
     // Undo the "extlink" marker pushed above.
     #{ pop(c, "extlink"); return nil }
     {
     n := &html.Node{
         Type: html.ElementNode,
         Data: "a",
         Attr: []html.Attribute{
             {Key: "href", Val: concat(addr, target)},
             {Key: "class", Val: "external"},
             {Key: "rel", Val: "nofollow"},
         },
     }
     addChild(n, content)
     return n, nil
     /*
     stops.pop('extlink');
     return [
         new SelfclosingTagTk('extlink', [
             new KV('href', tu.flattenString([addr, target])),
             new KV('mw:content', content || ''),
             new KV('spaces', sp),
         ], {
             targetOff: targetOff,
             tsr: tsrOffsets(),
             contentOffsets: [targetOff, endOffset() - 1],
         }),
     ];
     */
     }
456 |
// Magic links for RFC and PMID references: the keyword, whitespace, digits,
// then a word boundary. The Go action currently returns nil (no node); the
// ported JS built an extlink token — see below.
autoref
  <- ("RFC" / "PMID") space_or_nbsp+ [0-9]+ end_of_word
     { return nil, nil
     /*
     var base_urls = {
         'RFC': 'https://tools.ietf.org/html/rfc%s',
         'PMID': '//www.ncbi.nlm.nih.gov/pubmed/%s?dopt=Abstract',
     };
     return [
         new SelfclosingTagTk('extlink', [
             new KV('href', tu.sprintf(base_urls[ref], identifier)),
             new KV('mw:content', tu.flattenString([ref, sp, identifier])),
             new KV('typeof', 'mw:ExtLink/' + ref),
         ],
         { stx: "magiclink", tsr: tsrOffsets() }),
     ];
     */
     }
475 |
// ISBN magic links: "ISBN" followed by digits with optional space/dash
// separators and an optional trailing X check digit.
// NOTE(review): the final predicate returns false, so this rule can never
// succeed — the 10/13-digit length validation is an unported stub; confirm
// whether ISBN links are intentionally disabled.
isbn
  <- "ISBN" space_or_nbsp+ (
       [0-9]
       (space_or_nbsp_or_dash &[0-9] {return nil, nil/* return s; */} / [0-9])+
       ((space_or_nbsp_or_dash / "") [xX] / "")
     ) (
       end_of_word
       {return nil, nil
       /*
       // Convert isbn token-and-entity array to stripped string.
       return tu.flattenStringlist(isbn).filter(function(e) {
           return e.constructor === String;
       }).join('').replace(/[^\dX]/ig, '').toUpperCase();
       */
       }
     ) &{
       return false, nil
       /*
       // ISBNs can only be 10 or 13 digits long (with a specific format)
       return isbncode.length === 10 ||
           (isbncode.length === 13 && /^97[89]/.test(isbncode));
       */
     } {return nil, nil
     /*
     return [
         new SelfclosingTagTk('extlink', [
             new KV('href', 'Special:BookSources/' + isbncode),
             new KV('mw:content', tu.flattenString(['ISBN', sp, isbn])),
             new KV('typeof', 'mw:WikiLink/ISBN'),
         ],
         { stx: "magiclink", tsr: tsrOffsets() }),
     ];
     */
     }
510 |
511 |
// Default URL protocols in MediaWiki (see DefaultSettings). Normally
// these can be configured dynamically.
// NOTE(review): the leading predicate returns false, so this rule never
// matches — protocol validation is an unported stub; confirm intended.
url_protocol <-
    & {return false, nil/* return Util.isProtocolValid(input.substr(endOffset()), env); */}
    ( "//" / [A-Za-z] [-A-Za-z0-9+.]* ":" "//"? ) {return nil, nil/* return p;*/ }
518 |
// no punctuation, and '{<' to trigger directives
// A single URL-body character excluding whitespace, brackets, quotes,
// common punctuation, and the template/directive trigger characters.
no_punctuation_char <- [^ :\][\r\n"'<>,.&%{]
//TODO: no_punctuation_char <- [^ :\]\[\r\n"'<>\x00-\x20\x7f,.&%\u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000{]
522 |
// this is the general url rule
// on the PHP side, the path part matches EXT_LINK_URL_CLASS
// which is '[^][<>"\x00-\x20\x7F\p{Zs}]'
// the 's' and 'r' pieces below match the characters in
// EXT_LINK_URL_CLASS which aren't included in no_punctuation_char
// NOTE(review): the trailing predicate returns false, so this rule never
// matches — the non-empty check is an unported stub; confirm intended.
url
  <- proto:url_protocol
     addr:(urladdr / "")
     path:( ( !inline_breaks
              c1:no_punctuation_char
              {return c1, nil /*return c; */}
            )
          / s:[.:,'] {return s, nil/* return s; */}
          / comment
          / tplarg_or_template
          / ! ( "&" ( [lL][tT] / [gG][tT] ) ";" )
            r:(
               & "&" he:htmlentity {return he, nil/* return he; */}
             / [&%{]
            ) {return r, nil /*return r;*/ }
     )*
     // Must be at least one character after the protocol
     & {return false, nil /*return addr.length > 0 || path.length > 0;*/ }
     {return []interface{}{proto, addr, path}, nil
     /*
     return tu.flattenString([proto, addr].concat(path));
     */
     }
551 |
// this is the somewhat-restricted rule used in autolinks
// See Parser::doMagicLinks and Parser.php::makeFreeExternalLink.
// The `path` portion matches EXT_LINK_URL_CLASS, as in the general
// url rule. As in PHP, we do some fancy fixup to yank out
// trailing punctuation, perhaps including parentheses.
// The 's' and 'r' pieces match the characters in EXT_LINK_URL_CLASS
// which aren't included in no_punctuation_char
// NOTE(review): the trailing-punctuation fixup is not ported (the action
// returns "TODO: autourl") and the final predicate returns false, so this
// rule currently never succeeds; confirm intended.
autourl
  <- &{return true, nil /*return stops.push('autourl', { sawLParen: false }); */}
     ! "//" // protocol-relative autolinks not allowed (T32269)
     (
       url_protocol
       (urladdr / "")
       ( ( !inline_breaks
           ! "("
           c1:no_punctuation_char
           {return c1, nil/* return c; */}
         )
       / "(" {return "(", nil/* stops.onStack('autourl').sawLParen = true; return "("; */}
       / [.:,]
       / (['] ![']) // single quotes are ok, double quotes are bad
       / comment
       / tplarg_or_template
       / ! ( raw_htmlentity &{return false, nil /* return /^[<>\u00A0]$/.test(rhe); */} )
         r:(
            & "&" he:htmlentity {return he, nil/* return he; */}
          / [&%{]
         ) {return r, nil/* return r; */}
       )*
       {return "TODO: autourl",nil
       /*
       // as in Parser.php::makeFreeExternalLink, we're going to
       // yank trailing punctuation out of this match.
       var url = tu.flattenStringlist([proto, addr].concat(path));
       // only need to look at last element; HTML entities are strip-proof.
       var last = lastItem(url);
       var trim = 0;
       if (last && last.constructor === String) {
           var strip = ',;\\.:!?';
           if (!stops.onStack('autourl').sawLParen) {
               strip += ')';
           }
           strip = new RegExp('[' + JSUtils.escapeRegExp(strip) + ']*$');
           trim = strip.exec(last)[0].length;
           url[url.length - 1] = last.slice(0, last.length - trim);
       }
       url = tu.flattenStringlist(url);
       if (url.length === 1 && url[0].constructor === String && url[0].length <= proto.length) {
           return null; // ensure we haven't stripped everything: T106945
       }
       peg$currPos -= trim;
       stops.pop('autourl');
       return url;
       */
       } ) &{return false, nil/* return r !== null; */} {return nil, nil/*return r; */}
  / &{return false, nil /*return stops.pop('autourl');*/ }
608 |
// This is extracted from EXT_LINK_ADDR in Parser.php: a simplified
// expression to match an IPv6 address. The IPv4 address and "at least
// one character of a host name" portions are punted to the `path`
// component of the `autourl` and `url` productions
urladdr
  <- ( "[" [0-9A-Fa-f:.]+ "]" )
615 |
616 | // ************************************************************
617 | // Templates, -arguments and wikilinks
618 | // ************************************************************/
619 |
620 |
// Precedence: template arguments win over templates. See
// http://www.mediawiki.org/wiki/Preprocessor_ABNF#Ideal_precedence
// 4: {{{{·}}}} → {·{{{·}}}·}
// 5: {{{{{·}}}}} → {{·{{{·}}}·}}
// 6: {{{{{{·}}}}}} → {{{·{{{·}}}·}}}
// 7: {{{{{{{·}}}}}}} → {·{{{·{{{·}}}·}}}·}
// This is only if close has > 3 braces; otherwise we just match open
// and close as we find them.
//
// Entry point for "{{"-initiated constructs; the depth guard from the JS
// original (commented out below) is not currently ported.
tplarg_or_template
  <- &"{{" //&{return false, nil}
     //
     //// Refuse to recurse beyond `maxDepth` levels. Default in the PHP parser
     //// is $wgMaxTemplateDepth = 40; This is to prevent crashing from
     //// buggy wikitext with lots of unclosed template calls, as in
     //// eswiki/Usuario:C%C3%A1rdenas/PRUEBAS?oldid=651094
     // if (stops.onCount('templatedepth') === undefined ||
     //     stops.onCount('templatedepth') < env.conf.parsoid.maxDepth) {
     //   return true;
     // } else {
     //   return false;
     // }
     t:tplarg_or_template_guarded {return t, nil /*return t;*/ }
644 |
// Dispatches between template arguments ({{{...}}}) and templates ({{...}}),
// handling the brace-precedence cases documented above. The "templatedepth"
// counter is incremented around the match.
tplarg_or_template_guarded
  <- #{inc(c, "templatedepth"); return nil /* return stops.inc('templatedepth');*/ }
     r:( &("{{" &("{{{"+ !'{') tplarg) a:(template/broken_template) {return a, nil /*return a;*/ }
       / a:('{' &("{{{"+ !'{'))? b:tplarg {return concat(a, b), nil /*return [a].concat(b);*/ }
       / a:('{' &("{{" !'{'))? b:template {return concat(a, b), nil /*return [a].concat(b);*/ }
       / a:broken_template {return a, nil /*return a;*/ }
     ) #{
       // balance the inc above, whether or not the action runs
       dec(c, "templatedepth")
       return nil
     } {
     return r, nil
     /*
     stops.dec('templatedepth');
     return r;
     */
     }
661 |
// Consume input as templates/template-args, falling back to single
// characters, until no more input matches.
tplarg_or_template_or_bust
  <- (tplarg_or_template / .)+
664 |
// A template transclusion. Pushes the closing token "}}" onto the "preproc"
// stack (recording the stack depth under "level") before parsing the body,
// and pops the preproc stack back to that depth afterwards — see the long
// comment below about emulating the PHP preprocessor's single-stack model.
template
  <- #{
       push(c, "level", push(c, "preproc", /*{{*/ "}}"))
       return nil
       /* return stops.push('preproc', / * {{ * /"}}"); */
     }
     t:template_preproc
     #{
       popTo(c, "preproc", pop(c, "level").(int))
       return nil
     }
     {return t, nil/* stops.popTo('preproc', stopLen); return t; */}
677 |
678 | // The PHP preprocessor maintains a single stack of "closing token we
679 | // are currently looking for", with no backtracking. This means that
680 | // once you see `[[ {{` you are looking only for `}}` -- if that template
681 | // turns out to be broken you will never pop the `}}` and there is no way
682 | // to close the `[[`. Since the PEG tokenizer in Parsoid uses backtracking
683 | // and parses in a single pass (instead of PHP's split preprocessor/parser)
684 | // we have to be a little more careful when we emulate this behavior.
685 | // If we use a rule like:
686 | // template = "{{" tplname tplargs* "}}"?
687 | // Then we end up having to reinterpret `tplname tplargs*` as a tlb if it
688 | // turns out we never find the `}}`, which involves a lot of tedious gluing
689 | // tokens back together with fingers crossed we haven't discarded any
690 | // significant newlines/whitespace/etc. An alternative would be a rule like:
691 | // broken_template = "{{" tlb
692 | // but again, `template` is used in many different contexts; `tlb` isn't
693 | // necessarily the right one to recursively invoke. Instead we get the
694 | // broken template off of the PEGjs production stack by returning immediately
695 | // after `{{`, but we leave a "broken token" on top of the preprocessor
696 | // stops stack to indicate we're "still in" the {{ context and shouldn't
697 | // ever inlineBreak for any closing tokens above this one. For example:
698 | // [[Foo{{Bar]]
699 | // This will match as:
700 | // wikilink->text,template->text --> FAILS looking for }}
701 | // backtracks, popping "]]" and "}}" off preproc stack
702 | // wikilink->text,broken_template,text --> FAILS looking for ]]
703 | // backtracks, popping "]]" and "broken" off preproc stack
704 | // broken_wikilink,text,broken_template,text --> OK
705 | // with ["broken", "broken"] left on the preproc stops stack
706 | // Note that we use stops.popTo() to make sure the preproc stack is
707 | // cleaned up properly during backtracking, even if there were broken-FOO
708 | // productions taken which (deliberately) left elements on the preproc stack.
709 |
// A "{{" whose closing "}}" will never be found; matches just the "{{" and
// records a "broken" entry on the preproc stack (see the comment block above
// for why).
broken_template
  <- &"{{" #{push(c, "preproc", "broken"); return nil/* return stops.push('preproc', 'broken'); */}
     // for broken-template, deliberately fail to pop the preproc stops stack
     t:"{{"
     // NOTE(review): this pops "preproc" even though the comment above (and
     // the JS original) deliberately leave the "broken" entry on the stack —
     // confirm which behavior this port intends.
     #{pop(c, "preproc"); return nil}
     {return t, nil/* return t; */}
716 |
// The body of a template transclusion: "{{", a target, zero or more
// "|"-separated parameters, and "}}". The Go action resolves the template
// through opts.templateHandler from the parser's globalStore (returning nil
// when no handler is configured, and an inline "{{ template error: ... }}"
// string when the handler fails). The second alternative swallows an empty
// "{{ }}".
template_preproc
  <- "{{" nl_comment_space*
     target:template_param_value
     attributes:(nl_comment_space* "|"
                 r:(
                    nl_comment_space*
                    &("|" / "}}")
                    {return nil, nil/* return new KV('', tu.flattenIfArray(v), [p0, p0, p0,
                    p]);*/
                    } // empty argument
                  / template_param
                 ) {return r, nil/* return r; */}
     )*
     nl_comment_space*
     inline_breaks "}}" {
     opts, ok := c.globalStore["opts"].(opts)
     if !ok {
         return nil, nil
     }
     if opts.templateHandler == nil {
         return nil, nil
     }
     var attrs []Attribute
     for _, attr := range flatten(attributes) {
         attr := attr.(Attribute)
         attrs = append(attrs, attr)
     }
     val, err := opts.templateHandler(strings.TrimSpace(concat(target)), attrs)
     if err != nil {
         return fmt.Sprintf("{{ template error: %s }}", err.Error()), nil
     }
     return val, nil
     /*
     // Insert target as first positional attribute, so that it can be
     // generically expanded. The TemplateHandler then needs to shift it out
     // again.
     params.unshift(new KV(tu.flattenIfArray(target.tokens), '', target.srcOffsets));
     var obj = new SelfclosingTagTk('template', params, { tsr: tsrOffsets(), src: text() });
     return obj;
     */
     } / ("{{" space_or_newline* "}}")
758 |
// A template argument ({{{...}}}); thin wrapper around tplarg_preproc.
// The preproc-stack bookkeeping from the JS original is commented out.
tplarg
  <- //("" {return nil, nil /*return stops.push('preproc', / * {{ * /"}}"); */})
     t:(tplarg_preproc / &{return false, nil /*return stops.popTo('preproc', stopLen); */} )
     {return t, nil/* stops.popTo('preproc', stopLen); return t; */}
763 |
// The body of a template argument: "{{{", an optional default-target value,
// zero or more "|"-separated values, and "}}}". The Go action simply concats
// target and params; the JS original built a templatearg token (below).
tplarg_preproc
  <- "{{{"
     //("" {return nil, nil/* return endOffset(); */})
     target:template_param_value?
     params:(nl_comment_space* "|"
             ( ("" {return nil, nil/* return endOffset(); */})
               nl_comment_space*
               ("" {return nil, nil/* return endOffset(); */})
               &("|" / "}}}")
               {return nil, nil/* return { tokens: v, srcOffsets: [p0, p1] }; */} // empty argument
             / template_param_value
             ) {return nil, nil/* return r; */}
     )*
     nl_comment_space*
     inline_breaks "}}}" {return concat(target, params), nil
     /*
     params = params.map(function(o) {
         var s = o.srcOffsets;
         return new KV('', tu.flattenIfArray(o.tokens), [s[0], s[0], s[0], s[1]]);
     });
     if (target === null) { target = { tokens: '', srcOffsets: [p, p, p, p] }; }
     // Insert target as first positional attribute, so that it can be
     // generically expanded. The TemplateHandler then needs to shift it out
     // again.
     params.unshift(new KV(tu.flattenIfArray(target.tokens), '', target.srcOffsets));
     var obj = new SelfclosingTagTk('templatearg', params, { tsr: tsrOffsets(), src: text() });
     return obj;
     */
     }
793 |
// One template parameter: either "name=value" (value optional after '='),
// a bare positional value, or — at a '|' or '}' boundary — an empty
// parameter. Returns an Attribute{Key, Val} in the Go port.
template_param
  <- key:template_param_name
     val:(
         //("" {return nil, nil/* return endOffset(); */})
         optionalSpaceToken
         "="
         //("" {return nil, nil/* return endOffset(); */})
         optionalSpaceToken
         tpv:template_param_value? {return tpv, nil
         /*
         return { kEndPos: kEndPos, vStartPos: vStartPos, value: (tpv && tpv.tokens) || [] };
         */
         }
     )? {
     return Attribute{
         Key: key,
         Val: val,
     }, nil
     /*
     if (val !== null) {
         if (val.value !== null) {
             return new KV(name, tu.flattenIfArray(val.value), [startOffset(), val.kEndPos, val.vStartPos, endOffset()]);
         } else {
             return new KV(tu.flattenIfArray(name), '', [startOffset(), val.kEndPos, val.vStartPos, endOffset()]);
         }
     } else {
         return new KV('', tu.flattenIfArray(name), [startOffset(), startOffset(), startOffset(), endOffset()]);
     }
     */
     }
     // empty parameter
   / & [|}] {return nil, nil
     /*
     return new KV('', '', [startOffset(), startOffset(), startOffset(), endOffset()]);
     */
     }
830 |
// A template parameter name. Pushes "equal"=true while matching so that '='
// terminates the name; the text may be empty when the parameter starts
// directly with '='. The second alternative never matches (its predicate
// returns false) — it exists only to pop "equal" again when the first
// alternative fails, cleaning up state on backtrack.
template_param_name
  <- & {
       push(c, "equal", true)
       return true, nil /*return stops.push('equal', true); */}
     tpt:(template_param_text / &'=' {return "", nil/* return ''; */})
     {
     pop(c, "equal")
     return tpt, nil
     /*
     stops.pop('equal');
     return tpt;
     */
     }

   / & {
       pop(c, "equal")
       return false, nil
       /* return stops.pop('equal'); */
     }
850 |
// A template parameter value: like template_param_text but with "equal"
// pushed false, so '=' signs are allowed inside the value.
template_param_value
  <- #{ push(c, "equal", false); return nil }
     tpt:template_param_text
     #{ pop(c, "equal"); return nil }
     {
     return tpt, nil
     /*
     stops.pop('equal');
     return { tokens: tpt, srcOffsets: tsrOffsets() };
     */
     }
862 |
// The text of a template parameter: one or more nested blocks or newlines,
// parsed with table/extlink/tableCellArg stops disabled and templateArg
// enabled (tables are re-enabled inside template parameters); the "template"
// counter tracks nesting depth. All five stop changes are undone afterwards.
template_param_text
  <- #{
       push(c, "table", false)
       push(c, "extlink", false)
       push(c, "templateArg", true)
       push(c, "tableCellArg", false)
       inc(c, "template")
       return nil
       /*
       // re-enable tables within template parameters
       stops.push('table', false);
       stops.push('extlink', false);
       stops.push('templateArg', true);
       stops.push('tableCellArg', false);
       return stops.inc('template');
       */
     }
     il:(nested_block / newlineToken)+ #{
       // undo the five state changes made above
       pop(c, "table")
       pop(c, "extlink")
       pop(c, "templateArg")
       pop(c, "tableCellArg")
       dec(c, "template")
       return nil
     }
     {
     return il, nil
     /*
     stops.pop('table');
     stops.pop('extlink');
     stops.pop('templateArg');
     stops.pop('tableCellArg');
     stops.dec('template');
     // il is guaranteed to be an array -- so, tu.flattenIfArray will
     // always return an array
     var r = tu.flattenIfArray(il);
     if (r.length === 1 && r[0].constructor === String) {
         r = r[0];
     }
     return r;
     */
     }
905 |
//// Language converter block markup of language variants: -{ ... }-

// Note that "rightmost opening" precedence rule (see
// https://www.mediawiki.org/wiki/Preprocessor_ABNF ) means
// that neither -{{ nor -{{{ are parsed as a -{ token, although
// -{{{{ is (since {{{ has precedence over {{).

// Disambiguates "-{" against templates/template args per the precedence
// rule above, falling back to a plain language-variant block.
lang_variant_or_tpl
  <- &("-{" &("{{{"+ !'{') tplarg) a:lang_variant {return a, nil/* return a; */}
  / a:('-' &("{{{"+ !'{')) b:tplarg {return concat(a, b), nil /*return [a].concat(b);*/ }
  / a:('-' &("{{" "{{{"* !'{')) b:template {return concat(a, b), nil/* return [a].concat(b); */}
  / &"-{" a:lang_variant {return a, nil /*return a; */}
918 |
// A "-{" whose closing "}-" will never be found; matches just the "-{".
// NOTE(review): the JS original pushed a 'broken' entry onto the preproc
// stack here; the Go predicate only returns true and pushes nothing —
// confirm whether that state change still needs porting.
broken_lang_variant
  <- &{return true, nil /*return stops.push('preproc', 'broken'); */}
     // for broken-lang-variant, deliberately fail to pop the stops stack
     r:"-{" {return r, nil /*return r; */}
923 |
// A language-variant block, falling back to broken_lang_variant when the
// body/closing "}-" cannot be parsed. The preproc-stack bookkeeping from the
// JS original is stubbed out in the commented code.
lang_variant
  <- ("" {return nil, nil /*return stops.push('preproc', /* -{ * / '}-'); */})
     lv:(lang_variant_preproc / &{return false, nil /*return stops.popTo('preproc', stopLen); */})
     {return lv, nil /*stops.popTo('preproc', stopLen); return lv; */}
  / broken_lang_variant
929 |
// The body of a language-variant block: "-{", optional flags, the variant
// text or option list, and "}-".
// NOTE(review): every alternative inside the two inner choices is guarded by
// a predicate that returns false, so this rule can never succeed as ported —
// the language-converter checks are unimplemented stubs; confirm intended.
lang_variant_preproc
  <- ("-{" {return nil, nil/* return startOffset(); */})
     (
       &{return false, nil /* return env.langConverterEnabled(); */}
       ff:opt_lang_variant_flags {return ff, nil
       /*
       // Avoid mutating cached expression results
       ff = Util.clone(ff, true);
       // if flags contains 'R', then don't treat ; or : specially inside.
       if (ff.flags) {
           ff.raw = ff.flags.has('R') || ff.flags.has('N');
       } else if (ff.variants) {
           ff.raw = true;
       }
       return ff;
       */
       } /
       &{return false, nil /*return !env.langConverterEnabled(); */}
       "" {return nil, nil
       /*
       // if language converter not enabled, don't try to parse inside.
       return { raw: true };
       */
       }
     )
     (
       &{return false, nil /*return f.raw; */} lv:lang_variant_text {return lv, nil/* return [{ text: lv }]; */}
       /
       &{return false, nil /* return !f.raw; */} lv:lang_variant_option_list {return lv, nil/* return lv; */}
     )
     inline_breaks
     ("}-" {return nil, nil/* return endOffset(); */}) {return "TODO lang_variant_preproc", nil
     /*

     if (!env.langConverterEnabled()) {
         return [ "-{", ts[0].text.tokens, "}-" ];
     }
     var lvsrc = input.substring(lv0, lv1);
     var attribs = [];

     // Do a deep clone since we may be destructively modifying
     // (the `t[fld] = name;` below) the result of a cached expression
     ts = Util.clone(ts, true);

     ts.forEach(function(t) {
         // move token strings into KV attributes so that they are
         // properly expanded by early stages of the token pipeline
         ['text','from','to'].forEach(function(fld) {
             if (t[fld] === undefined) { return; }
             var name = 'mw:lv' + attribs.length;
             attribs.push(new KV(name, t[fld].tokens, t[fld].srcOffsets));
             t[fld] = name;
         });
     });
     return [
         new SelfclosingTagTk(
             'language-variant',
             attribs,
             {
                 tsr: [lv0, lv1],
                 src: lvsrc,
                 flags: f.flags && Array.from(f.flags).sort(),
                 variants: f.variants && Array.from(f.variants).sort(),
                 original: f.original,
                 flagSp: f.sp,
                 texts: ts,
             }),
     ];
     */
     }
1000 |
// Optional flag section of a language-variant block: "flags|" before the
// variant body. The Go action returns the raw parse (f may be nil when the
// flag section is absent); the JS flag-normalization logic is preserved
// below for reference.
opt_lang_variant_flags
  <- f:( ff:lang_variant_flags "|" {return ff, nil/* return ff; */} )? {return f, nil
  /*
  // Collect & separate flags and variants into a set and ordered list
  var flags = new Set();
  var variants = new Set();
  var flagList = [];
  var flagSpace = [];
  var variantList = [];
  var variantSpace = [];
  var useVariants = false;
  var internalSp = []; // internal whitespace, for round-tripping
  if (f !== null) {
      // lang_variant_flags returns arrays in reverse order.
      f.flags.reverse();
      f.sp.reverse();
      var spPtr = 0;
      f.flags.forEach(function(item) {
          if (item.flag) {
              flagSpace.push(f.sp[spPtr++]);
              flags.add(item.flag);
              flagList.push(item.flag);
              flagSpace.push(f.sp[spPtr++]);
          }
          if (item.variant) {
              variantSpace.push(f.sp[spPtr++]);
              variants.add(item.variant);
              variantList.push(item.variant);
              variantSpace.push(f.sp[spPtr++]);
          }
      });
      if (spPtr < f.sp.length) {
          // handle space after a trailing semicolon
          flagSpace.push(f.sp[spPtr]);
          variantSpace.push(f.sp[spPtr]);
      }
  }
  // Parse flags (this logic is from core/languages/ConverterRule.php
  // in the parseFlags() function)
  if (flags.size === 0 && variants.size === 0) {
      flags.add('$S');
  } else if (flags.has('R')) {
      flags = new Set(['R']); // remove other flags
  } else if (flags.has('N')) {
      flags = new Set(['N']); // remove other flags
  } else if (flags.has('-')) {
      flags = new Set(['-']); // remove other flags
  } else if (flags.has('T') && flags.size === 1) {
      flags.add('H');
  } else if (flags.has('H')) {
      // Replace A flag, and remove other flags except T and D
      var nf = new Set(['$+', 'H']);
      if (flags.has('T')) { nf.add('T'); }
      if (flags.has('D')) { nf.add('D'); }
      flags = nf;
  } else if (variants.size > 0) {
      useVariants = true;
  } else {
      if (flags.has('A')) {
          flags.add('$+');
          flags.add('$S');
      }
      if (flags.has('D')) {
          flags.delete('$S');
      }
  }
  if (useVariants) {
      return { variants: variants, original: variantList, sp: variantSpace };
  } else {
      return { flags: flags, original: flagList, sp: flagSpace };
  }
  */
  }
1074 |
// lang_variant_flags: a semicolon-separated, right-recursive list of
// language-variant flags with surrounding whitespace. The Go port currently
// discards the match (returns nil); the JS reference built reverse-ordered
// `sp`/`flags` arrays as documented in the retained comment.
lang_variant_flags
  <- (space_or_newline*) lang_variant_flag (space_or_newline*)
     ( ";" lang_variant_flags? )? {return nil, nil
  /*
  var r = more && more[1] ? more[1] : { sp: [], flags: [] };
  // Note that sp and flags are in reverse order, since we're using
  // right recursion and want to push instead of unshift.
  r.sp.push(sp2.join(''));
  r.sp.push(sp1.join(''));
  r.flags.push(f);
  return r;
  */
  }
  // Whitespace-only flag section (no flags at all).
  / (space_or_newline*) {return nil, nil
  /*
  return { sp: [ sp.join('') ], flags: [] };
  */
  }
1093 |
// lang_variant_flag: a single flag — either a one-character converter flag
// ([-+A-Z]), a variant language name (e.g. "zh-cn"), or a run of bogus
// characters to be reported as a bad flag. Go actions are stubs (return nil).
lang_variant_flag
  <- [-+A-Z] {return nil, nil /*return { flag: f }; */}
  / lang_variant_name {return nil, nil/* return { variant: v }; */}
  / (!space_or_newline !nowiki [^{}|;])+ {return nil, nil/* return { bogus: b.join('') }; /*
     bad flag * /*/}
1099 |
lang_variant_name // language variant name, like zh, zh-cn, etc.
  // A lowercase letter followed by one or more [-a-z] characters.
  <- [a-z] [-a-z]+ {return nil, nil/* return h + t.join(''); */}
  // Escaped otherwise-unrepresentable language names
  // Primarily for supporting html2html round trips; PHP doesn't support
  // using nowikis here (yet!)
  / nowiki_text
1106 |
// lang_variant_option_list: one or more ';'-separated conversion options
// inside "-{...}-", or — failing that — plain variant text. Go actions are
// stubs; the JS reference collected the options (plus an entry recording a
// trailing semicolon) into an array.
lang_variant_option_list
  <- lang_variant_option ( ";" lang_variant_option {return nil, nil/* return oo; */})*
     ( ";" space_or_newline* )? // optional trailing semicolon
     {return nil, nil
  /*
  var r = [ o ].concat(rest);
  if (tr) { r.push({ semi: true, sp: tr[1].join('') }); }
  return r;
  */
  }
  / lang_variant_text {return nil, nil/* return [{ text: lvtext }]; */}
1118 |
// lang_variant_option: a single conversion option. Two forms:
//   two-way:  "lang : text"
//   one-way:  "from => lang : text"
// Go actions are stubs; JS reference payloads are retained in comments.
lang_variant_option
  <- (space_or_newline*) lang_variant_name
     (space_or_newline*) ":"
     (space_or_newline*)
     (lang_variant_nowiki / lang_variant_text_no_semi)
     {return nil, nil
  /*
  return {
    twoway: true,
    lang: lang,
    text: lvtext,
    sp: [sp1.join(''), sp2.join(''), sp3.join('')]
  };
  */
  }
  / (space_or_newline*)
    (lang_variant_nowiki / lang_variant_text_no_semi_or_arrow)
    "=>"
    (space_or_newline*) lang_variant_name
    (space_or_newline*) ":"
    (space_or_newline*)
    (lang_variant_nowiki / lang_variant_text_no_semi)
    {return nil, nil
  /*
  return {
    oneway: true,
    from: from,
    lang: lang,
    to: to,
    sp: [sp1.join(''), sp2.join(''), sp3.join(''), sp4.join('')]
  };
  */
  }
1152 |
1153 | // html2wt support: If a language name or conversion string can't be
1154 | // represented w/o breaking wikitext, just wrap it in a .
1155 | // PHP doesn't support this (yet), but Parsoid does.
// lang_variant_nowiki: a nowiki-protected language name / conversion string
// with trailing whitespace. The empty-string productions were offset markers
// in the JS original (startOffset/endOffset); the Go port stubs them out.
lang_variant_nowiki
  <- ("" {return nil, nil/*return startOffset();*/})
     nowiki_text
     ("" {return nil, nil/* return endOffset();*/})
     space_or_newline* {return nil, nil
  /*
  return { tokens: [ n ], srcOffsets: [start, end] };
  */
  }
1165 |
// lang_variant_text: free-form inline text inside a variant construct;
// bare "|" characters are accepted as text here. Offset markers stubbed.
lang_variant_text
  <- ("" {return nil, nil/*return startOffset();*/})
     (inlineline / "|" )*
     ("" {return nil, nil/*return endOffset();*/})
     {return nil, nil/* return { tokens: tokens || [], srcOffsets: [start, end] }; */}
1171 |
// lang_variant_text_no_semi: variant text that must stop at a semicolon.
// The JS original used a 'semicolon' syntax-stop; the Go predicates return
// false — presumably disabling this rule in the port. TODO confirm.
lang_variant_text_no_semi
  <- & {return false, nil/* return stops.push('semicolon', true); */}
     lang_variant_text
     {return nil, nil/* stops.pop('semicolon'); return lvtext; */}
  / & {return false, nil/* return stops.pop('semicolon'); */}
1177 |
// lang_variant_text_no_semi_or_arrow: variant text that must stop at ";"
// and "=>". As above, the Go predicates return false — presumably disabling
// this rule in the port. TODO confirm.
lang_variant_text_no_semi_or_arrow
  <- & {return false, nil/* return stops.push('arrow', true); */}
     lang_variant_text_no_semi {return nil, nil/* stops.pop('arrow'); return lvtext; */}
  / & {return false, nil/* return stops.pop('arrow'); */}
1182 |
// wikilink_content: zero or more "|"-separated link-text segments following
// the wikilink target. The Go action returns the raw link_text match; the
// JS original wrapped it into a 'mw:maybeContent' KV attribute.
wikilink_content
  <- (pipe lt:link_text? {
    return lt, nil
    /*
    var maybeContent = new KV('mw:maybeContent', lt, [startPos, endOffset()]);
    maybeContent.vsrc = input.substring(startPos, endOffset());
    return maybeContent;
    */
  })*
1192 |
1193 | wikilink <- wikilink_preproc / broken_wikilink
1194 |
1195 | // `broken-link` (see [[:mw:Preprocessor_ABNF]]), but careful because the
1196 | // second bracket could start an extlink. Deliberately leave entry
1197 | // on preproc stack since we haven't seen a double-close bracket.
1198 | // (See full explanation above broken_template production.)
broken_wikilink
  // Matches "[[" that does not form a proper wikilink: consume one "[" and
  // then either an extlink or a second plain "[". The preproc stop is pushed
  // as "broken" and deliberately left popped only via the #-action below.
  <- &"[[" #{
    push(c, "preproc", "broken")
    return nil
    /* return stops.push('preproc', 'broken'); */
  }
  a:("[" (extlink / "["))
  #{ pop(c, "preproc"); return nil }
  {
    return a, nil
    /* return a; */
  }
1211 |
wikilink_preproc
  <- "[["
  #{ push(c, "preproc", "]]"); return nil }
  target:wikilink_preprocessor_text?
  //("" {return nil, nil/* return endOffset(); */})
  lcs:wikilink_content
  inline_breaks "]]"
  #{ pop(c, "preproc"); return nil }
  {
    // Render a [[target|...]] wikilink as an HTML node.
    // File:/Image: targets become <div class="image"> wrappers with an
    // optional <div class="caption"> from the last content segment; all
    // other targets become a plain <a href="...">.
    title := concat(target)
    anchor := &html.Node{
      Type: html.ElementNode,
      Data: "a",
      Attr: []html.Attribute{
        {Key: "href", Val: TitleToURL(title)},
      },
    }
    if strings.HasPrefix(title, "File:") || strings.HasPrefix(title, "Image:") {
      // Media link: anchor text is the raw title, wrapped in an image div.
      addChild(anchor, title)
      figure := &html.Node{
        Type: html.ElementNode,
        Data: "div",
        Attr: []html.Attribute{
          {Key: "class", Val: "image"},
        },
      }
      addChild(figure, anchor)
      // The final "|"-separated segment, if any, is treated as the caption.
      if parts, ok := lcs.([]interface{}); ok && len(parts) > 0 {
        caption := &html.Node{
          Type: html.ElementNode,
          Data: "div",
          Attr: []html.Attribute{
            {Key: "class", Val: "caption"},
          },
        }
        addChild(caption, parts[len(parts)-1])
        addChild(figure, caption)
      }
      return figure, nil
    }
    // Ordinary link: prefer the piped content as link text, falling back
    // to the target title when there is no usable content.
    if !addChild(anchor, lcs) {
      addChild(anchor, title)
    }
    return anchor, nil
    /*
    var pipeTrick = (lcs.length === 1 && lcs[0].v === null);
    var textTokens = [];
    if (target === null || pipeTrick) {
      textTokens.push("[[");
      if (target) {
        textTokens.push(target);
      }
      lcs.forEach(function(a) {
        // a is a mw:maybeContent attribute
        textTokens.push("|");
        if (a.v !== null) { textTokens.push(a.v); }
      });
      textTokens.push("]]");
      return textTokens;
    }
    var obj = new SelfclosingTagTk('wikilink');
    var hrefKV = new KV('href', target);
    hrefKV.vsrc = input.substring(startOffset() + 2, tpos);
    // XXX: Point to object with path, revision and input information
    // obj.source = input;
    obj.attribs.push(hrefKV);
    obj.attribs = obj.attribs.concat(lcs);
    obj.dataAttribs = {
      tsr: tsrOffsets(),
      src: text(),
    };
    return [obj];
    */
  }
1294 |
1295 | // Tables are allowed inside image captions.
// Tables are allowed inside image captions.
// link_text: the content after a "|" in a wikilink. Temporarily suppresses
// the 'equal' stop (so '=' can be consumed) and enables 'linkdesc'; both are
// popped again after the match.
link_text
  <- #{
    // Suppress the flag temporarily in this rule to consume the '=' here.
    push(c, "equal", false)
    push(c, "linkdesc", true)
    return nil
  }
  c1:( // This group is similar to "block_line" but "list_item"
       // is omitted since `doBlockLevels` happens after
       // `replaceInternalLinks2`, where newlines are stripped.
       (sol (heading / hr / full_table_in_link_caption))
       / urltext
       / (!inline_breaks
          r:( inline_element / '[' text_char+ ']' (&(!']' / "]]")) / . ) {return r, nil}
         )
  )+ #{
    pop(c, "equal")
    pop(c, "linkdesc")
    return nil
  }
  {
    // Return the collected caption pieces unchanged.
    return c1, nil
  }
1319 |
1320 | // Generic quote rule for italic and bold, further processed in a token
1321 | // stream transformation in doQuotes. Relies on NlTk tokens being emitted
1322 | // for each line of text to balance quotes per line.
1323 |
1324 | // We are not using a simple pair rule here as we need to support mis-nested
1325 | // bolds/italics and MediaWiki's special heuristics for apostrophes, which are
1326 | // all not context free. */
quote <- ("''" "'"*) {
  // A run of two or more apostrophes opens/closes bold or italic.
  // Disambiguation is not context-free and happens in a later pass
  // (doQuotes in the JS original), so emit a placeholder <b> element
  // tagged with "_parsetoken" for that pass to rewrite.
  node := new(html.Node)
  node.Type = html.ElementNode
  node.Data = "b"
  node.Attr = append(node.Attr, html.Attribute{Key: "_parsetoken"})
  return node, nil
  /*
  // sequences of four or more than five quotes are assumed to start
  // with some number of plain-text apostrophes.
  var plainticks = 0;
  var result = [];
  if (quotes.length === 4) {
    plainticks = 1;
  } else if (quotes.length > 5) {
    plainticks = quotes.length - 5;
  }
  if (plainticks > 0) {
    result.push(quotes.substring(0, plainticks));
  }
  // mw-quote token Will be consumed in token transforms
  var tsr = tsrOffsets();
  tsr[0] += plainticks;
  var mwq = new SelfclosingTagTk('mw-quote', [], { tsr: tsr });
  mwq.value = quotes.substring(plainticks);
  result.push(mwq);
  return result;
  */
}
1357 |
1358 |
1359 | // *********************************************************
1360 | // Pre and xmlish tags
1361 | // *********************************************************/
1362 |
// extension_tag: an xmlish tag recognized as an extension tag.
// NOTE(review): both semantic predicates return false in the Go port, so
// this rule can never match — presumably extension-tag handling is
// intentionally disabled here. TODO confirm.
extension_tag <-
  &{return false, nil /*return !stops.onStack('extTag'); */}
  xmlish_tag
  // Account for `maybeExtensionTag` returning unmatched start / end tags
  &{return false, nil /* return extToken.name === 'extension'; */}
  {return nil, nil/* return extToken; */}
1369 |
// nowiki: an extension tag whose name is 'nowiki'. Inherits the disabled
// predicate from extension_tag in this port (see note there).
nowiki
  <- extension_tag
  &{return false, nil /* return extToken.getAttribute('name') === 'nowiki'; */}
  {return nil, nil/* return extToken; */}
1374 |
1375 | // Used by nowiki extension to tokenize html entities.
1376 | nowiki_content
1377 | <- c2:(htmlentity / .)* {return c2, nil/* return tu.flattenIfArray(c); */}
1378 |
1379 | // Used by lang_variant productions to protect special language names or
1380 | // conversion strings.
// nowiki_text: the decoded text body of a nowiki tag. Stubbed in the Go
// port; the JS original extracted and entity-decoded the extension source.
nowiki_text
  <- nowiki
  {return nil, nil
  /*
  var txt = Util.getExtArgInfo(extToken).dict.body.extsrc;
  return Util.decodeEntities(txt);
  */
  }
1389 |
1390 | // Generic XML-like tags
1391 |
1392 | // These also cover extensions (including Cite), which will hook into the
1393 | // token stream for further processing. The content of extension tags is
1394 | // parsed as regular inline, but the source positions of the tag are added
1395 | // to allow reconstructing the unparsed text from the input. */
1396 |
1397 | // See http://www.w3.org/TR/html5/syntax.html#tag-open-state and
1398 | // following paragraphs.
// Tag-name character set and tag names per the HTML5 tag-open state:
// a letter followed by anything except whitespace, '/', '>', or NUL.
tag_name_chars <- [^\t\n\v />\x00]
tag_name <- ([A-Za-z] tag_name_chars*)
1401 |
// xmlish_tag: a generic XML-like start, end, or self-closing tag, returned
// as an *html.Node. End tags are marked with a "_parseend" attribute and
// unclosed start tags with "_parsestart" so a later pass can pair them up.
xmlish_tag
  <- # {
    push(c, "table", false)
    push(c, "tableCellArg", false)
    return nil
  }
  // By the time we get to `doTableStuff` in the php parser, we've already
  // safely encoded element attributes. See 55313f4e in core.
  // stops.push('table', false);
  // stops.push('tableCellArg', false);
  //return true;
  //}
  "<" end:"/"?
  name:(tag_name & {return true, nil}
  // NOTE(review): the JS original validated the tag name here (isXMLTag)
  // and pushed an 'extTag' stop; this port accepts any tag name.
  ///*
  // return isXMLTag(tn, false); // NOTE: 'extTag' stop was pushed.
  // */
  //}
  )
  attribs:generic_newline_attributes
  space_or_newline* // No need to preserve this -- canonicalize on RT via dirty diff
  selfclose:"/"?
  space* // not preserved - canonicalized on RT via dirty diff
  ">"
  #{
    pop(c, "table")
    pop(c, "tableCellArg")
    // NOTE(review): "extTag" is popped here but no matching push is visible
    // in this rule (the push lived in the commented-out isXMLTag predicate)
    // — confirm the stack helper tolerates an unbalanced pop.
    pop(c, "extTag")
    return nil
  }
  {
    // Build the element node with its parsed attributes.
    n := &html.Node{
      Type: html.ElementNode,
      Data: concat(name),
    }

    for _, attr := range flatten(attribs) {
      attr := attr.(html.Attribute)
      n.Attr = append(n.Attr, attr)
    }

    // Mark end tags and unclosed start tags for later tree construction.
    if end != nil {
      n.Attr = append(n.Attr, html.Attribute{Key:"_parseend"})
    } else if selfclose == nil {
      n.Attr = append(n.Attr, html.Attribute{Key:"_parsestart"})
    }

    return n, nil
    /*
    stops.pop('table');
    stops.pop('tableCellArg');
    stops.pop('extTag');

    var lcName = name.toLowerCase();

    // Extension tags don't necessarily have the same semantics as html tags,
    // so don't treat them as void elements.
    var isVoidElt = Util.isVoidElement(lcName) && !env.conf.wiki.extensionTags.has(lcName);

    // Support
    if (lcName === 'br' && end) {
      end = null;
    }

    var res = tu.buildXMLTag(name, lcName, attribs, end, !!selfclose || isVoidElt, tsrOffsets());

    // change up data-attribs in one scenario
    // void-elts that aren't self-closed ==> useful for accurate RT-ing
    if (!selfclose && isVoidElt) {
      res.dataAttribs.selfClose = undefined;
      res.dataAttribs.noClose = true;
    }

    return maybeExtensionTag(res);
    */
  }
1478 |
1479 |
1480 | // A variant of xmlish_tag, but also checks if the tag name is a block-level
1481 | // tag as defined in
1482 | // http://www.w3.org/TR/html5/syntax.html#tag-open-state and
1483 | // following paragraphs.
1484 | //
// block_tag: variant of xmlish_tag restricted to block-level tags.
// NOTE(review): several predicates here return false after pushing stops,
// so most alternatives cannot match in this port; the fallback alternatives
// exist to pop the stops again. TODO confirm this is the intended stubbing.
block_tag
  <- & {
    // By the time we get to `doTableStuff` in the php parser, we've already
    // safely encoded element attributes. See 55313f4e in core.
    push(c, "table", false)
    push(c, "tableCellArg", false)
    return true, nil
  }
  "<" "/"?
  (tag_name & {
    push(c, "extTag", false)
    return false, nil
  }
  //#/*
  //# return isXMLTag(tn, true); // NOTE: 'extTag' stop was pushed.
  //# */
  //#}
  )
  generic_newline_attributes
  space_or_newline*
  "/"?
  ">" {
    pop(c, "table")
    pop(c, "tableCellArg")
    pop(c, "extTag")
    return nil, nil
    /*
    stops.pop('table');
    stops.pop('tableCellArg');
    stops.pop('extTag');
    var t = tu.buildXMLTag(name, name.toLowerCase(), attribs, end, !!selfclose, tsrOffsets());
    return [maybeExtensionTag(t)];
    */
  }
  // Failure path: unwind the 'extTag' stop pushed while matching the name.
  / "<" "/"? tag_name & {
    pop(c, "extTag")
    return false, nil
  }
  // Failure path: unwind the 'table'/'tableCellArg' stops.
  / & {
    pop(c, "table")
    pop(c, "tableCellArg")
    return false, nil
  }
1528 |
1529 | // A generic attribute that can span multiple lines.
// A generic attribute that can span multiple lines.
// Returns an html.Attribute built from the (possibly empty) key and value;
// the empty-string productions were offset markers in the JS original.
generic_newline_attribute
  <- space_or_newline*
     ("" {return nil, nil/* return endOffset(); */})
     key:generic_attribute_name
     ("" {return nil, nil/* return endOffset(); */})
     val:(space_or_newline* "=" v:generic_att_value? {return v, nil/* return v; */})?
     {return html.Attribute{Key: concat(key), Val: concat(val)}, nil
  /*
  // NB: Keep in sync w/ table_attibute
  var res;
  // Encapsulate protected attributes.
  if (typeof name === 'string') {return nil, nil
  name = tu.protectAttrs(name);
  }
  if (vd !== null) {
    res = new KV(name, vd.value, [namePos0, namePos, vd.srcOffsets[0], vd.srcOffsets[1]]);
    res.vsrc = input.substring(vd.srcOffsets[0], vd.srcOffsets[1]);
  } else {
    res = new KV(name, '', [namePos0, namePos, namePos, namePos]);
  }
  if (Array.isArray(name)) {
    res.ksrc = input.substring(namePos0, namePos);
  }
  return res;
  */
  }
1556 |
1557 | // A single-line attribute.
// A single-line attribute.
// Stubbed in the Go port (returns nil); structure mirrors
// generic_newline_attribute but is restricted to one line.
table_attribute
  <- optionalSpaceToken
     ("" {return nil, nil /* return endOffset(); */})
     table_attribute_name
     ("" {return nil, nil /* return endOffset(); */})
     (optionalSpaceToken "=" table_att_value? {return nil, nil /* return v; */})?
     {return nil,nil
  /*
  // NB: Keep in sync w/ generic_newline_attribute
  var res;
  // Encapsulate protected attributes.
  if (typeof name === 'string') {
    name = tu.protectAttrs(name);
  }
  if (vd !== null) {
    res = new KV(name, vd.value, [namePos0, namePos, vd.srcOffsets[0], vd.srcOffsets[1]]);
    res.vsrc = input.substring(vd.srcOffsets[0], vd.srcOffsets[1]);
  } else {
    res = new KV(name, '', [namePos0, namePos, namePos, namePos]);
  }
  if (Array.isArray(name)) {
    res.ksrc = input.substring(namePos0, namePos);
  }
  return res;
  */
  }
1584 |
1585 | // The arrangement of chars is to emphasize the split between what's disallowed
1586 | // by html5 and what's necessary to give directive a chance.
1587 | // See: http://www.w3.org/TR/html5/syntax.html#attributes-0
// The arrangement of chars is to emphasize the split between what's disallowed
// by html5 and what's necessary to give directive a chance.
// See: http://www.w3.org/TR/html5/syntax.html#attributes-0
// Matches an attribute name (possibly starting with a stray quote or '='),
// requiring at least one character overall; returns the flattened string.
generic_attribute_name
  <- q:(["'=]?) // From #before-attribute-name-state, < is omitted for directive
     r:( [^ \t\r\n\x00/=><&{}!|-]+
       / !inline_breaks
         // \0/=> is the html5 attribute name set we do not want.
         t:( directive / !( space_or_newline / [\x00/=>] ) c2:. { return c2, nil /*return c;*/ }
         ) {return t, nil /*return t; */}
     )*
     & {
       // Reject an empty name: require at least one matched char in q or r.
       return len(flatten(r))>0 || len(flatten(q))>0, nil
       /* return r.length > 0 || q.length > 0; */
     }
     {return concat(q, r), nil /* return tu.flattenString([q].concat(r)); */}
1601 |
1602 | // Also accept these chars in a wikitext table or tr attribute name position.
1603 | // They are normally not matched by the table_attribute_name.
1604 | broken_table_attribute_name_char <- [\x00/=>] {return nil, nil /* return new KV(c, ''); */}
1605 |
1606 | // Same as generic_attribute_name, except for accepting tags and wikilinks.
1607 | // (That doesn't make sense (ie. match php) in the generic case.)
1608 | // We also give a chance to break on \[ (see T2553).
// Same as generic_attribute_name, except for accepting tags and wikilinks.
// (That doesn't make sense (ie. match php) in the generic case.)
// We also give a chance to break on \[ (see T2553).
// NOTE(review): the final predicate returns false in this port, so the rule
// as a whole currently cannot succeed — TODO confirm intended.
table_attribute_name
  <- (["'=]?) // From #before-attribute-name-state, < is omitted for directive
     ( [^ \t\r\n\x00/=><&{}!|[-]+
       / !inline_breaks
         // \0/=> is the html5 attribute name set we do not want.
         ( wikilink
         / directive
         // Accept insane tags-inside-attributes as attribute names.
         // The sanitizer will strip and shadow them for roundtripping.
         // Example: generated with..
         / &xmlish_tag inlineline {return nil, nil/* return ill; */}
         / !( space_or_newline / [\x00/=>] ) . {return nil, nil/* return c; */}
         ) {return nil, nil/* return t; */}
     )*
     & {return false, nil/* return r.length > 0 || q.length > 0; */}
     {return nil, nil/* return tu.flattenString([q].concat(r)); */}
1625 |
1626 | // Attribute value, quoted variants can span multiple lines.
1627 | // Missing end quote: accept /> look-ahead as heuristic.
1628 | // These need to be kept in sync with the attribute_preprocessor_text_*
// Attribute value, quoted variants can span multiple lines.
// Missing end quote: accept /> look-ahead as heuristic.
// These need to be kept in sync with the attribute_preprocessor_text_*
// Three alternatives: single-quoted, double-quoted, unquoted. The Go
// actions return the raw preprocessor text.
generic_att_value
  <- (space_or_newline* "'") t:attribute_preprocessor_text_single? ("'" / &('/'? '>')) {
    return t, nil
    /*
    return tu.getAttrVal(t, startOffset() + s.length, endOffset() - q.length);
    */
  }
  / (space_or_newline* '"') t:attribute_preprocessor_text_double? ('"' / &('/'? '>')) {
    return t, nil
    /*
    return tu.getAttrVal(t, startOffset() + s.length, endOffset() - q.length);
    */
  }
  / space_or_newline* t:attribute_preprocessor_text &(space_or_newline / eof / '/'? '>') {
    return t, nil
    /*
    return tu.getAttrVal(t, startOffset() + s.length, endOffset());
    */
  }
1648 |
1649 | // Attribute value, restricted to a single line.
1650 | // Missing end quote: accept |, !!, \r, and \n look-ahead as heuristic.
1651 | // These need to be kept in sync with the table_attribute_preprocessor_text_*
// Attribute value, restricted to a single line.
// Missing end quote: accept |, !!, \r, and \n look-ahead as heuristic.
// These need to be kept in sync with the table_attribute_preprocessor_text_*
// Stubbed in the Go port (returns nil for every alternative).
table_att_value
  <- (space* "'") table_attribute_preprocessor_text_single? ("'" / &("!!" / [|\r\n])) {return nil, nil
  /*
  return tu.getAttrVal(t, startOffset() + s.length, endOffset() - q.length);
  */
  }
  / (space* '"') table_attribute_preprocessor_text_double? ('"' / &("!!" / [|\r\n])) {return nil, nil
  /*
  return tu.getAttrVal(t, startOffset() + s.length, endOffset() - q.length);
  */
  }
  / space* table_attribute_preprocessor_text &(space_or_newline/ eof / "!!" / '|') {return nil, nil
  /*
  return tu.getAttrVal(t, startOffset() + s.length, endOffset());
  */
  }
1668 |
1669 | // *******************************************************
1670 | // Lists
1671 | // *******************************************************/
1672 | list_item <- dtdd / hacky_dl_uses / li
1673 |
li <- bullets:list_char+
      c2:inlineline?
      // The inline_break is to check if we've hit a template end delimiter.
      &(eolf / inline_breaks)
      {
        // Emit an <li> carrying the item's inline content (if any).
        // The bullet characters themselves are not attached here; nesting
        // depth is resolved by a later list-building pass.
        item := &html.Node{
          Type: html.ElementNode,
          Data: "li",
        }
        addChild(item, c2)
        return item, nil
        /*
        // Leave bullets as an array -- list handler expects this
        var tsr = tsrOffsets('start');
        tsr[1] += bullets.length;
        var li = new TagTk('listItem', [], { tsr: tsr });
        li.bullets = bullets;
        return [ li ].concat(c || []);
        */
      }
1694 |
1695 |
1696 | // This rule is required to support wikitext of this form
1697 | // ::{|border="1"|foo|bar|baz|}
1698 | // where the leading colons are used to indent the entire table.
1699 | // This hack was added back in 2006 in commit
1700 | // a0746946312b0f1eda30a2c793f5f7052e8e5f3a based on a patch by Carl
1701 | // Fürstenberg.
1702 | //
// hacky_dl_uses: leading-colon indentation of a whole table, e.g.
//   ::{|border="1"|foo|bar|baz|}
// (see the 2006-era commit referenced above). Stubbed in the Go port.
hacky_dl_uses <- ":"+
  (table_line (sol table_line)*)
  inlineline?
  &comment_space_eolf
  {return nil,nil
  /*
  // Leave bullets as an array -- list handler expects this
  var tsr = tsrOffsets('start');
  tsr[1] += bullets.length;
  var li = new TagTk('listItem', [], { tsr: tsr });
  li.bullets = bullets;
  return tu.flattenIfArray([li, tbl || [], line || []]);
  */
  }
1717 |
// dtdd: a definition-list line of the form ";term:definition".
// NOTE(review): the 'colon' counter predicates return false in this port,
// so the main alternative cannot match; the final alternative is the
// fall-back that cleared the colon flag in the JS original. TODO confirm.
dtdd
  <- (!(";" !list_char) list_char {return nil, nil /*return lc;*/ })*
     ";"
     & {return false, nil/*return stops.inc('colon');*/}
     inlineline?
     (":" {return nil, nil /*return endOffset(); */})
     // Fortunately dtdds cannot be nested, so we can simply set the flag
     // back to 0 to disable it.
     & {return false, nil /*stops.counters.colon = 0; return true;*/}
     inlineline?
     &eolf {return nil, nil
  /*
  // Leave bullets as an array -- list handler expects this
  // TSR: +1 for the leading ";"
  var numBullets = bullets.length + 1;
  var tsr = tsrOffsets('start');
  tsr[1] += numBullets;
  var li1 = new TagTk('listItem', [], { tsr: tsr });
  li1.bullets = bullets.slice();
  li1.bullets.push(";");
  // TSR: -1 for the intermediate ":"
  var li2 = new TagTk('listItem', [], { tsr: [cpos - 1, cpos], stx: 'row' });
  li2.bullets = bullets.slice();
  li2.bullets.push(":");

  return [ li1 ].concat(c || [], [ li2 ], d || []);
  */
  }
  // Fall-back case to clear the colon flag
  / & {return false, nil /*stops.counters.colon = 0; return false; */}
1748 |
1749 |
1750 | list_char <- [*#:;]
1751 |
1752 |
1753 |
1754 | // ****************************************************************************
1755 | // Tables
1756 | // ------
1757 | // Table rules are geared to support independent parsing of fragments in
1758 | // templates (the common table start / row / table end use case). The tokens
1759 | // produced by these fragments then match up to a table while building the
1760 | // DOM tree. For similar reasons, table rows do not emit explicit end tag
1761 | // tokens.
1762 |
1763 | // The separate table_line rule is faster than moving those rules
1764 | // directly to block_lines.
1765 |
1766 | // Notes about the full_table_in_link_caption rule
1767 | // -----------------------------------------------------
1768 | // However, for link-tables, we have introduced a stricter parse wherein
1769 | // we require table-start and table-end tags to not come from a template.
1770 | // In addition, this new rule doesn't accept fosterable-content in
1771 | // the table unlike the more lax (sol table_line)+ rule.
1772 |
1773 | // This is the best we can do at this time since we cannot distinguish
1774 | // between table rows and image options entirely in the tokenizer.
1775 |
1776 | // Consider the following examples:
1777 |
1778 | // Example 1:
1779 |
1780 | // [[Image:Foo.jpg|left|30px|Example 1
1781 | // {{This-template-returns-a-table-start-tag}}
1782 | // |foo
1783 | // {{This-template-returns-a-table-end-tag}}
1784 | // ]]
1785 |
1786 | // Example 2:
1787 |
1788 | // [[Image:Foo.jpg|left|30px|Example 1
1789 | // {{echo|a}}
1790 | // |foo
1791 | // {{echo|b}}
1792 | // ]]
1793 |
1794 | // So, we cannot know a priori (without preprocessing or fully expanding
1795 | // all templates) if "|foo" in the two examples is a table cell or an image
1796 | // option. This is a limitation of our tokenizer-based approach compared to
1797 | // the preprocessing-based approach of the PHP parser.
1798 |
1799 | // Given this limitation, we are okay forcing a full-table context in
1800 | // link captions (if necessary, we can relax the fosterable-content requirement
1801 | // but that is broken wikitext anyway, so we can force that edge-case wikitext
1802 | // to get fixed by rejecting it).
1803 | // ****************************************************************************/
1804 |
// full_table_in_link_caption: a complete table (start tag through matching
// end tag, no fostered content) inside an image/link caption; see the long
// rationale comment above. The stop-manipulating predicates are stubbed
// (return false) in this port.
full_table_in_link_caption
  <- (! inline_breaks / & "{{!}}" )
     (
       // Note that "linkdesc" is suppressed here to provide a nested parsing
       // context in which to parse the table. Otherwise, we may break on
       // on pipes in the `table_start_tag` and `table_row_tag` attributes.
       // However, as a result, this can be more permissive than the current
       // php implementation, but likelier to match the users intent.
       & {return false, nil /*stops.push('linkdesc', false); return stops.push('table', true);
       */}
       (
         table_start_tag optionalNewlines
         // Accept multiple end tags since a nested table may have been
         // opened in the table content line.
         ((sol (table_content_line / tplarg_or_template) optionalNewlines)*
          sol table_end_tag)+
       ){return nil, nil
       /*
       stops.pop('linkdesc');
       stops.pop('table');
       return tbl;
       */
       }
       / & {return false, nil/* stops.pop('linkdesc'); return stops.pop('table'); */}
     ) {return nil, nil/* return r; */}
1830 |
1831 | // This rule assumes start-of-line position!
// This rule assumes start-of-line position!
// table_line: one line of table wikitext — a table start tag, a content
// line (row/cell/caption/heading), or the table end tag. The table-stop
// predicates are stubbed (return false) in this port.
table_line
  <- (! inline_breaks / & "{{!}}" )
     (
       & {return false, nil /* return stops.push('table', true); */}
       (
         table_start_tag optionalNewlines
         / table_content_line optionalNewlines
         / table_end_tag
       ) {return nil, nil
       /*
       stops.pop('table');
       return tl;
       */
       }
       / & {return false, nil /* return stops.pop('table'); */}
     ) {return nil, nil/* return r; */}
1848 |
// table_content_line: optional leading space/comments, then one of the four
// table-interior constructs. Heading tags are tried before row/data tags.
table_content_line <- (space / comment)* (
    table_heading_tags
    / table_row_tag
    / table_data_tags
    / table_caption_tag
  )
1855 |
// table_start_tag: "{|" (optionally preceded by space/comments) plus its
// attribute list. Stubbed in the Go port; the JS reference built a 'table'
// TagTk with the parsed attributes.
table_start_tag
  <- (space / comment)* ("" {return nil, nil/* return endOffset(); */}) "{" pipe
     // ok to normalize away stray |} on rt (see T59360)
     & {return false, nil /* return stops.push('table', false); */}
     table_attributes
     ("" {return nil, nil/* stops.pop('table'); return endOffset(); */})
     {return nil, nil
  /*
  var coms = tu.popComments(ta);
  if (coms) {
    tsEndPos = coms.commentStartPos;
  }

  var da = { tsr: [startPos, tsEndPos] };
  if (p !== "|") {
    // Variation from default
    da.startTagSrc = b + p;
  }

  sc.push(new TagTk('table', ta, da));
  if (coms) {
    sc = sc.concat(coms.buf);
  }
  return sc;
  */
  }
1882 |
1883 | // FIXME: Not sure if we want to support it, but this should allow columns.
// FIXME: Not sure if we want to support it, but this should allow columns.
// table_caption_tag: "|+" plus optional row-syntax args and caption content.
table_caption_tag
  // avoid recursion via nested_block_in_table
  <- ! {return true, nil /*return stops.onStack('tableDataBlock');*/ }
     pipe "+"
     row_syntax_table_args?
     ("" {return nil, nil /*return endOffset();*/ })
     nested_block_in_table* {return nil, nil
  /*
  return tu.buildTableTokens("caption", "|+", args, [startOffset(), tagEndPos], endOffset(), c, true);
  */
  }
1895 |
// table_row_tag: "|-" (one or more dashes) plus row attributes. Stubbed in
// the Go port; the JS reference emitted an open 'tr' TagTk and relied on
// the tree builder to close the row.
table_row_tag
  <- // avoid recursion via nested_block_in_table
     ! {return true, nil /*return stops.onStack('tableDataBlock'); */}
     pipe "-"+
     & {return false, nil /* return stops.push('table', false); */}
     table_attributes
     ("" {return nil, nil/* stops.pop('table'); return endOffset(); */})
     {return nil, nil
  /*
  var coms = tu.popComments(a);
  if (coms) {
    tagEndPos = coms.commentStartPos;
  }

  var da = {
    tsr: [ startOffset(), tagEndPos ],
    startTagSrc: p + dashes,
  };

  // We rely on our tree builder to close the row as needed. This is
  // needed to support building tables from fragment templates with
  // individual cells or rows.
  var trToken = new TagTk('tr', a, da);

  var res = [ trToken ];
  if (coms) {
    res = res.concat(coms.buf);
  }
  return res;
  */
  }
1927 |
// tds: zero or more additional cells on the same line, each introduced by
// "||" (or a pipe followed by row-syntax args). Stubbed in the Go port.
tds
  <- ( ( pipe_pipe / pipe & row_syntax_table_args {return nil, nil /*return p;*/ } )
       table_data_tag {return nil, nil
  /*
  var da = tdt[0].dataAttribs;
  da.stx = "row";
  da.tsr[0] -= pp.length; // include "||"
  if (pp !== "||" || (da.startTagSrc && da.startTagSrc !== pp)) {
    // Variation from default
    da.startTagSrc = pp + (da.startTagSrc ? da.startTagSrc.substring(1) : '');
  }
  return tdt;
  */
  }
  )*
1943 |
1944 | // avoid recursion via nested_block_in_table
// avoid recursion via nested_block_in_table
// table_data_tags: a leading "|" cell (not "|+" caption or "|-" row) plus
// any further "||"-separated cells on the same line. Stubbed in the Go port.
table_data_tags
  <- ! {return true, nil/* return stops.onStack('tableDataBlock'); */}
     pipe
     ![+-] table_data_tag
     ("" {return nil, nil/* return endOffset(); */})
     tds {return nil, nil
     // JS reference: concatenated the first cell's tokens with `tds`.
  }
1953 |
// A single table cell body. The introducing "|" has already been consumed
// by the caller; `!"}"` rejects the table-end marker "|}".
table_data_tag
  <- ! "}"
  row_syntax_table_args?
  // use inline_breaks to break on tr etc
  // The JS original captured endOffset() here (tagEndPos); stubbed to nil.
  ("" {return nil, nil/* return endOffset(); */})
  nested_block_in_table*
  {return nil, nil
  /*
  return tu.buildTableTokens("td", "|", arg, [startOffset(), tagEndPos], endOffset(), td);
  */
  }
1965 |
// A table-heading line: leading "!" plus one or more headings separated by
// "!!" or "||".
// NOTE(review): both alternatives are stubbed with `&`-predicates returning
// false, so this rule currently never matches (headings disabled in this
// port); the commented JS shows the intended 'th' syntax-stop handling.
table_heading_tags
  <- "!"
  & {return false, nil /*return stops.push('th', endOffset()); */}
  table_heading_tag
  ( ("!!" / pipe_pipe) table_heading_tag {return nil, nil
  /*
  var da = tht[0].dataAttribs;
  da.stx = 'row';
  da.tsr[0] -= pp.length; // include "!!" or "||"

  if (pp !== "!!" || (da.startTagSrc && da.startTagSrc !== pp)) {
  // Variation from default
  da.startTagSrc = pp + (da.startTagSrc ? da.startTagSrc.substring(1) : '');
  }
  return tht;
  */
  }
  )* {return nil, nil
  /*
  stops.pop('th');
  th[0].dataAttribs.tsr[0]--; // include "!"
  return th.concat(ths);
  */
  }
  / & {return false, nil /*return stops.onStack('th') !== false ? stops.pop('th') : false;*/ }
1991 |
// A single table-heading cell body (after the introducing "!").
table_heading_tag
  <- row_syntax_table_args?
  // The JS original captured endOffset() here (tagEndPos); stubbed to nil.
  ("" {return nil, nil /*return endOffset();*/ })
  // NOTE(review): the predicate inside the repetition always returns false,
  // so the starred group matches zero times and no nested content is
  // consumed here; the JS version cleared the 'th' stop after a newline.
  ( & {return false, nil
  /*
  // This SyntaxStop is only true until we hit the end of the line.
  if (stops.onStack('th') !== false &&
  /\n/.test(input.substring(stops.onStack('th'), endOffset()))) {
  // There's been a newline. Remove the break and continue
  // tokenizing nested_block_in_tables.
  stops.pop('th');
  }
  return true;
  */
  } nested_block_in_table {return nil, nil/* return d; */} )* {return nil, nil
  /*
  return tu.buildTableTokens("th", "!", arg, [startOffset(), tagEndPos], endOffset(), c);
  */
  }
2011 |
// Table end marker "|}", optionally preceded by spaces/comments.
table_end_tag
  <- (space / comment)* ("" {return nil, nil/* return endOffset(); */}) pipe "}" {return nil, nil
  /*
  var tblEnd = new EndTagTk('table', [], { tsr: [startPos, endOffset()] });
  if (p !== "|") {
  // p+"" is triggering some bug in pegJS
  // I cannot even use that expression in the comment!
  tblEnd.dataAttribs.endTagSrc = p + b;
  }
  return sc.concat([tblEnd]);
  */
  }
2024 |
2025 | //
2026 | // Table parameters separated from the content by a single pipe. Does *not*
2027 | // match if followed by double pipe (row-based syntax).
2028 | //
// NOTE(review): both `&`-predicates return false, so both alternatives fail
// and this rule never matches; callers use `row_syntax_table_args?`, which
// therefore always succeeds with an empty match. The commented JS shows the
// intended 'tableCellArg' syntax-stop push/pop.
row_syntax_table_args
  <- & {return false, nil /* return stops.push('tableCellArg', true); */}
  table_attributes space* pipe !pipe {return nil, nil
  /*
  stops.pop('tableCellArg');
  return [as, s, p];
  */
  }
  / & {return false, nil /* return stops.pop('tableCellArg'); */}
2038 |
2039 |
2040 | // *****************************************************************
2041 | // Text variants and other general rules
// *****************************************************************
2043 |
2044 | // All chars that cannot start syntactic structures in the middle of a line
2045 | // XXX: ] and other end delimiters should probably only be activated inside
2046 | // structures to avoid unnecessarily leaving the text rule on plain
2047 | // content.
2048 |
2049 | // TODO: Much of this is should really be context-dependent (syntactic
2050 | // flags). The wikilink_preprocessor_text rule is an example where
2051 | // text_char is not quite right and had to be augmented. Try to minimize /
2052 | // clarify this carefully!
2053 | //
2054 |
// Any single character that cannot open syntactic structure mid-line
// (see the legend below for what each excluded character can start).
text_char <- [^'<~[{\n\r:;\]}|!=-]
2056 |
2057 | // Legend
2058 | // ' quotes (italic/bold)
2059 | // < start of xmlish_tag
2060 | // ~ signatures/dates
2061 | // [ start of links
2062 | // { start of parser functions, transclusion and template args
2063 | // \n all sort of block-level markup at start of line
2064 | // \r ditto
2065 | // A-Za-z autolinks (http(s), nttp(s), mailto, ISBN, PMID, RFC)
2066 |
2067 | // _ behavior switches (e.g., '__NOTOC__') (XXX: not URL related)
2068 | // ! and | table cell delimiters, might be better to specialize those
2069 | // = headings - also specialize those!
2070 |
2071 | // The following chars are also included for now, but only apply in some
2072 | // contexts and should probably be enabled only in those:
2073 | // : separate definition in ; term : definition
2074 | // ] end of link
2075 | // } end of parser func/transclusion/template arg
2076 | // - start of lang_variant -{ ... }-
2077 | // ; separator in lang_variant
2078 | //
2079 |
// A run of plain text with embedded autolinks, HTML entities, behavior
// switches, and the space-before-colon display hack. The first alternative
// greedily consumes characters that cannot start any of those constructs.
urltext <- ( [^-'<~[{\n/A-Za-z_|!:;\]} &=]+
  / & [/A-Za-z] al:autolink {return al, nil /*return al;*/ }
  / & "&" he:htmlentity {return he, nil /*return he;*/ }
  // Convert trailing space into a non-breaking space (&nbsp;)
  // XXX: This should be moved to a serializer
  // This is a hack to force a whitespace display before the colon
  / ' ' & ':' {return " ", nil
  /*
  var toks = Util.placeholder('\u00a0', {
  ' ',
  tsr: tsrOffsets('start'),
  isDisplayHack: true,
  }, { tsr: tsrOffsets('end'), isDisplayHack: true });
  var typeOf = toks[0].getAttribute('typeof');
  toks[0].setAttribute('typeof', 'mw:DisplaySpace ' + typeOf);
  return toks;
  */
  }
  / & ("__") bs:behavior_switch {return bs, nil /*return bs;*/ }
  // About 96% of text_char calls originate here.
  // pegjs 0.8 inlines this simple rule automatically.
  / text_char )+
2102 |
// A complete HTML entity such as "&amp;" or "&#160;".
// NOTE(review): the Go action is a stub returning nil; the JS original
// decoded the entity text.
raw_htmlentity <- ("&" [#0-9a-zA-Z]+ ";") {return nil, nil
/*
return Util.decodeEntities(m);
*/
}
2108 |
// An HTML entity wrapped for output. The Go action is a stub returning nil;
// the JS original wrapped valid entities in an mw:Entity span.
htmlentity <- raw_htmlentity {return nil, nil
/*
// if this is an invalid entity, don't tag it with 'mw:Entity'
if (cc.length > 2 /* decoded entity would be 1 or 2 UTF-16 characters * /) {
return cc;
}
return [
new TagTk('span', [new KV('typeof', 'mw:Entity')], { src: text(), srcContent: cc, tsr: tsrOffsets('start') }),
cc,
new EndTagTk('span', [], { tsr: tsrOffsets('end') }),
];
*/
}
2122 |
// One or more ASCII spaces/tabs.
spaces <- [ \t]+

// A single ASCII space or tab.
space <- [ \t]

// Zero or more spaces/tabs; never fails.
optionalSpaceToken <- space*
2128 |
2129 | // This rule corresponds to \s in the PHP preg_* functions,
2130 | // which is used frequently in the PHP parser. The inclusion of
2131 | // form feed (but not other whitespace, like vertical tab) is a quirk
2132 | // of Perl, which PHP inherited via the PCRE (Perl-Compatible Regular
2133 | // Expressions) library.
2134 | //
space_or_newline
  <- [ \t\n\r\x0c]

// This rule corresponds to \b in the PHP preg_* functions,
// after a word character. That is, it's a zero-width lookahead that
// the next character is not a word character.
//
end_of_word
  <- eof / ![A-Za-z0-9_]

// Unicode "separator, space" category. It covers the \u0020 space as well
// as \u3000 IDEOGRAPHIC SPACE (see bug 19052). In PHP this is \p{Zs}.
// Keep this up-to-date with the characters tagged ;Zs; in
// http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
unispace <- [ \u00A0\u1680\u2000-\u200A\u202F\u205F\u3000]

// Non-newline whitespace, including non-breaking spaces. Used for magic links.
// NOTE(review): the `&`-predicate on the htmlentity alternative always
// returns false, so entity-encoded nbsp is never recognized here; the
// commented JS shows the intended \u00A0 check.
space_or_nbsp
  <- space // includes \t
  / unispace
  / he:htmlentity &{ return false, nil /*return Array.isArray(he) && /^\u00A0$/.test(he[1]);*/ }
  {return he, nil /*return he;*/ }

// Used within ISBN magic links
space_or_nbsp_or_dash
  <- space_or_nbsp / "-"

// Extra newlines followed by at least another newline. Usually used to
// compress surplus newlines into a meta tag, so that they don't trigger
// paragraphs.
optionalNewlines
  <- ([\n\r\t ] &[\n\r])*
2167 |
// A run of comments and include-limit tags at start of line. The "sol_il"
// state flag is pushed while an include tag is tokenized (see include_limits)
// and popped immediately after; never fails (matches zero repetitions).
comment_or_includes <- (comment / (
  ( #{
  push(c, "sol_il", true)
  return nil
  }
  i:include_limits
  #{
  pop(c, "sol_il")
  return nil
  }
  ) {return i, nil}
  ))*
2180 |
// Start of line: a newline (possibly with comment-only empty lines), or the
// very start of the input, followed by any SOL comments/include tags.
sol <- (empty_line_with_comments / sol_prefix) comment_or_includes

sol_prefix
  <- newlineToken
  / & {
  // True only at input offset 0, i.e. nothing consumed yet.
  // NOTE(review): the JS original also honored an `options.sol !== false`
  // override, which this port does not implement — confirm that is intended.
  return c.pos.offset == 0, nil
  /*
  // Use the sol flag only at the start of the input
  // NOTE: Explicitly check for 'false' and not a falsy value
  return endOffset() === 0 && options.sol !== false;
  */
  } {return nil, nil /*return [];*/ }
2193 |
// One or more lines that contain only comments (and spaces) after a SOL.
// NOTE(review): the "" capture is a stub that returns a placeholder string
// where the JS original captured endOffset(); the final action is also
// stubbed (JS emitted an mw:EmptyLine meta token).
empty_line_with_comments
  <- sol_prefix ("" {return "empty_line_with_comments", nil /*return endOffset();*/ }) (space* comment (space / comment)* newline)+ {return nil, nil
  /*
  return [
  sp,
  new SelfclosingTagTk("meta", [new KV('typeof', 'mw:EmptyLine')], {
  tokens: tu.flattenIfArray(c),
  tsr: [p, endOffset()],
  }),
  ];
  */
  }
2206 |
// A comment or a single space/tab.
comment_space <- comment / space

// A newline token, a comment, or a single space/tab.
nl_comment_space <- newlineToken / comment_space
2210 |
2211 | //
2212 | // noinclude / includeonly / onlyinclude rules. These are normally
2213 | // handled by the xmlish_tag rule, except where generic tags are not
2214 | // allowed- for example in directives, which are allowed in various attribute
2215 | // names and -values.
2216 |
2217 | // Example test case:
2218 | // {|
2219 | // |-
2220 | // foo
2221 | //
2222 | // |Hello
2223 | // |}
2224 | //
2225 |
// <noinclude>/<includeonly>/<onlyinclude> open or close tags.
// NOTE(review): the inner `&`-predicate on the tag-name letters always
// returns false, so the `il:` group — and therefore this whole rule — never
// matches in this port; the commented JS shows the intended name check and
// content tokenization. The trailing `il != nil` guard is consequently
// never reached.
include_limits <-
  il:("<" "/"? ([oyinclude]i+ & {return false, nil
  /*
  var incl = n.toLowerCase();
  return incl === "noinclude" || incl === "onlyinclude" ||
  incl === "includeonly";
  */
  }) space_or_newline* ">" {return nil, nil
  /*
  var incl = name.toLowerCase();
  var dp = { tsr: tsrOffsets() };

  // Record variant since tag is not in normalized lower case
  if (name !== incl) {
  dp.srcTagName = name;
  }

  // End tag only
  if (c) {
  return new EndTagTk(name, [], dp);
  }

  var restOfInput = input.substring(endOffset());
  var tagContent = restOfInput.match(new RegExp("^([\\s\\S]*?)(?:\\s*" + incl + "\\s*>)", "m"));

  // Start tag only
  if (!tagContent || !tagContent[1]) {
  return new TagTk(name, [], dp);
  }

  // Get the content
  var inclContent = tagContent[1];

  // Preserve SOL where necessary (for onlyinclude and noinclude)
  // Note that this only works because we encounter <*include*> tags in
  // the toplevel content and we rely on the php preprocessor to expand
  // templates, so we shouldn't ever be tokenizing inInclude.
  // Last line should be empty (except for comments)
  if (incl !== "includeonly" && stops.onStack("sol_il")) {
  var last = lastItem(inclContent.split('\n'));
  if (!/^()*$/.test(last)) {
  return false;
  }
  }

  // Tokenize include content in a new tokenizer
  var inclContentToks = (new PegTokenizer(env)).tokenizeSync(inclContent);
  inclContentToks = Util.stripEOFTkfromTokens(inclContentToks);

  // Shift tsr
  Util.shiftTokenTSR(inclContentToks, endOffset());

  // Skip past content
  peg$currPos += inclContent.length;

  return [new TagTk(name, [], dp)].concat(inclContentToks);
  */
  }) & {return il != nil, nil /*return !!il; */ } {return il, nil /*return il; */ }
2284 |
// Start of file: zero-width predicate that succeeds only when nothing has
// been consumed yet (input offset 0).
sof <- & {
  return c.pos.offset == 0, nil
}
2289 |
// End of file: zero-width predicate that succeeds only when the current
// offset has reached the input length stashed in the tokenizer's global
// store under the key "len" by the caller.
eof <- & {
  // Renamed from `len`, which shadowed the Go builtin of the same name.
  inputLen := c.globalStore["len"].(int)
  return c.pos.offset == inputLen, nil
}
2295 |
// A line break: Unix "\n" or Windows "\r\n" (a bare "\r" is not a newline).
newline <- '\n' / "\r\n"

// A newline consumed as a token; this port normalizes it to "\n".
newlineToken <- newline {return "\n", nil/* return [new NlTk(tsrOffsets())]; */}

// End of line or end of file.
eolf <- newline / eof

// Trailing spaces and comments up to the end of line/file.
comment_space_eolf <- (space+ / comment)* eolf
2303 |
// 'Preprocessor' directive- higher-level things that can occur in otherwise
// plain-text content: comments, extension tags, templates/template args,
// language-variant markup ("-{...}-"), entities, and include-limit tags.
directive
  <- comment
  / extension_tag
  / tplarg_or_template
  / & "-{" v:lang_variant_or_tpl {return v, nil/* return v; */}
  / & "&" e:htmlentity {return e, nil/* return e; */}
  / include_limits
2313 |
// Text inside a wikilink target/label: runs of safe characters, plus
// directives and single recovered characters as long as the link is not
// being closed ("]]") and no inline break applies.
wikilink_preprocessor_text
  <- r:( [^<[{\n\r\t|!\]}{ &-]+
  // XXX gwicke: any more chars we need to allow here?
  / !inline_breaks wr:( directive / ( !"]]" ( text_char / [!<}\]\n\r-] ) ) )
  {return wr, nil/* return wr; */}
  )+ {return r, nil
  /*
  return tu.flattenStringlist(r);
  */
  }
2324 |
// Text inside an external link. The "linkdesc" state flag is pushed for the
// duration of the rule (and popped by the trailing state block) so pipes do
// not break tokenization inside a link description.
extlink_preprocessor_text
  // added special separator character class inline: separates url from
  // description / text
  <- # { push(c, "linkdesc", false); return nil
  /*
  // Prevent breaking on pipes when we're in a link description.
  // See the test, 'Images with the "|" character in the comment'.
  return stops.push('linkdesc', false);
  */
  }
  r:( [^'<~[{\n\r|!\]}\t&="' \u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000-]+
  / !inline_breaks s:( directive / no_punctuation_char / [&|{-] ) {return s, nil/* return s;
  */}
  /// urlencoded_char
  // !inline_breaks no_punctuation_char
  // Trailing punctuation is only part of the URL when not followed by
  // whitespace/end-of-line.
  / ([.:,] !(space / eolf))
  / (['] ![']) // single quotes are ok, double quotes are bad
  )+
  #{ pop(c, "linkdesc"); return nil }
  {return r, nil
  /*
  stops.pop('linkdesc');
  return tu.flattenString(r);
  */
  }
2350 |
2351 | // Attribute values with preprocessor support
2352 |
2353 | // n.b. / is a permissible char in the three rules below.
2354 | // We only break on />, enforced by the negated expression.
2355 | // Hence, it isn't included in the stop set.
2356 |
2357 | // The stop set is space_or_newline and > which matches generic_att_value.
// Unquoted attribute value text. Requires at least one character (`+`),
// unlike the quoted variants below which may be empty.
attribute_preprocessor_text
  <- r:( [^{}&<|/ \t\n\r\x0c>-]+
  / !inline_breaks
  !"/>"
  s:( directive / [{}&<|/-] ) {return s, nil /*return s; */}
  )+ {return r, nil
  /*
  return tu.flattenString(r);
  */
  }
2368 |
2369 | // The stop set is '> which matches generic_att_value.
// Single-quoted attribute value text; may be empty (`*`), stops at "'".
attribute_preprocessor_text_single
  <- r:( [^{}&<|/'>-]+
  / !inline_breaks
  !"/>"
  s:( directive / [{}&<|/-] ) {return s, nil/* return s; */}
  )* {return r, nil
  /*
  return tu.flattenString(r);
  */
  }
2380 |
2381 | // The stop set is "> which matches generic_att_value.
// Double-quoted attribute value text; may be empty (`*`), stops at '"'.
attribute_preprocessor_text_double
  <- r:( [^{}&<|/">-]+
  / !inline_breaks
  !"/>"
  s:( directive / [{}&<|/-] ) {return s, nil/* return s; */}
  )* {return r, nil
  /*
  return tu.flattenString(r);
  */
  }
2392 |
2393 | // Variants with the entire attribute on a single line
2394 |
2395 | // n.b. ! is a permissible char in the three rules below.
2396 | // We only break on !! in th, enforced by the inline break.
2397 | // Hence, it isn't included in the stop set.
2398 | // [ is also permissible but we give a chance to break
2399 | // for the [[ special case in php's doTableStuff (See T2553).
2400 |
2401 | // The stop set is space_or_newline and | which matches table_att_value.
2402 | table_attribute_preprocessor_text
2403 | <- r:( [^{}& 0 {
24 | t.Fatalf("leaking state! %#v", p.cur.state)
25 | }
26 | }
27 |
28 | func TestConvert(t *testing.T) {
29 | log.SetFlags(log.Flags() | log.Lshortfile)
30 |
31 | cases := []struct {
32 | in string
33 | want string
34 | }{
35 | {
36 | "Blah",
37 | "Blah
",
38 | },
39 | {
40 | "== Test ==",
41 | " Test
",
42 | },
43 | {
44 | "=Test=",
45 | "Test
",
46 | },
47 | {
48 | "'''Test'''",
49 | "Test",
50 | },
51 | {
52 | "* foo\n* nah\n* woof",
53 | "- foo
\n- nah
\n- woof
",
54 | },
55 | {
56 | "----",
57 | "
",
58 | },
59 | {
60 | "{{reflink}}\n\nBlah",
61 | "Blah
",
62 | },
63 | {
64 | "[[Jordanstown]]",
65 | `Jordanstown
`,
66 | },
67 | {
68 | "[[Jordanstown|Blah]]",
69 | `Blah
`,
70 | },
71 | {
72 | `{{Infobox basketball club
73 | | name = Ulster Elks
74 | | color1 = white
75 | | color2 = blue
76 | | logo =
77 | | arena = [[Ulster University]] Sports Centre
78 | }}`,
79 | "",
80 | },
81 | {
82 | `Test
`,
83 | `Test
`,
84 | },
85 | {
86 | "[Foo\n]Bar",
87 | "[Foo\n]Bar
",
88 | },
89 | {
90 | "[A]B",
91 | "[A]B
",
92 | },
93 | }
94 |
95 | debugRules(true)
96 |
97 | for _, c := range cases {
98 | c := c
99 | t.Run(c.in, func(t *testing.T) {
100 | outBytes, err := Convert([]byte(c.in), strict())
101 | if err != nil {
102 | t.Fatal(err)
103 | }
104 |
105 | out := string(outBytes)
106 | if out != c.want {
107 | t.Errorf("Covert(%q) = %q; not %q", c.in, out, c.want)
108 | }
109 | })
110 | }
111 | }
112 |
// TestSanitizationPolicy checks that the wikitext bluemonday-style policy
// sanitizes each input to the expected output, after first confirming the
// input parses as HTML (the parsed tree is logged for debugging).
//
// NOTE(review): the case literals below look extraction-garbled — the HTML
// tags the test presumably exercised have been stripped, leaving empty or
// line-broken strings that do not compile. Restore them from version control
// before relying on this test.
func TestSanitizationPolicy(t *testing.T) {
	cases := []struct {
		in string
		want string
	}{
		{
			"",
			"",
		},
		{
			"A
",
			"A
",
		},
		{
			"",
			"",
		},
	}

	p := wikitextPolicy()

	for _, c := range cases {
		c := c
		t.Run(c.in, func(t *testing.T) {
			// Sanity-check that the input is parseable HTML before sanitizing.
			doc, err := html.Parse(strings.NewReader(c.in))
			if err != nil {
				t.Fatal(err)
			}
			t.Logf("Doc = %s", spew.Sdump(doc))

			out := p.Sanitize(c.in)
			if out != c.want {
				t.Errorf("Sanitize(%q) = %q; not %q", c.in, out, c.want)
			}
		})
	}
}
150 |
--------------------------------------------------------------------------------