├── .gitignore ├── .travis.yml ├── Cargo.toml ├── LICENSE ├── README.md ├── build.rs ├── doc └── docs.yml ├── linter_notes.md └── src ├── ast.rs ├── default_transformations.rs ├── error.rs ├── grammar.rs ├── grammar.rustpeg ├── lib.rs ├── main.rs ├── tests └── mod.rs ├── transformations.rs ├── traversion.rs └── util.rs /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | target/ 3 | src/generated_tests.rs 4 | testfiles 5 | test.md 6 | Cargo.lock 7 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: rust 2 | sudo: required 3 | 4 | rust: 5 | - stable 6 | - nightly 7 | 8 | script: 9 | - cargo build --all --verbose 10 | - cargo test --all --verbose 11 | - cargo doc --all --verbose 12 | 13 | after_success: 14 | - | 15 | bash <(curl https://raw.githubusercontent.com/xd009642/tarpaulin/master/travis-install.sh) 16 | cargo tarpaulin --out Xml 17 | bash <(curl -s https://codecov.io/bash) 18 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "mediawiki_parser" 3 | version = "0.4.2" 4 | authors = ["Valentin Roland "] 5 | description = "A strict parser for MediaWiki markdown." 
6 | repository = "https://github.com/vroland/mediawiki-parser" 7 | documentation = "https://docs.rs/mediawiki_parser/" 8 | build = "build.rs" 9 | readme = "README.md" 10 | keywords = ["mediawiki", "parser", "wikipedia", "wikibooks", "markdown"] 11 | categories = ["parsing", "text-processing"] 12 | license = "MIT" 13 | edition = "2018" 14 | 15 | [lib] 16 | name = "mediawiki_parser" 17 | path = "src/lib.rs" 18 | doc = true 19 | 20 | [features] 21 | default = [] 22 | no_position = [] 23 | ptime = ["time"] 24 | 25 | [[bin]] 26 | name = "mwtoast" 27 | path = "src/main.rs" 28 | doc = true 29 | 30 | [dependencies] 31 | serde = "1.0" 32 | serde_yaml = "0.8" 33 | serde_json = "1.0" 34 | serde_derive = "1.0" 35 | structopt = "0.2" 36 | colored = "1.6" 37 | time = { version = "0.1", optional = true } 38 | 39 | [build-dependencies] 40 | serde = "1.0" 41 | serde_derive = "1.0" 42 | peg = "0.5" 43 | serde_yaml = "0.8" 44 | 45 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 vroland 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # mediawiki-parser 2 | This project aims to develop a parser for a subset of mediawiki markdown on the basis of Parsing Expression Grammars. 3 | It currently features a generated parser and test generation from a specification document. A simple binary to read from a file and write yaml to stdout is provided. 4 | 5 | ## Disclaimer 6 | 7 | The goal of mediawiki-parser is *not* full compatibility with MediaWiki and all of its quirks. It is intended to be used if rejecting exotic or malformed input is fine. 8 | The markup supported is currently largely oriented towards the needs of a specific MediaWiki Project and will likely not change drastically without external contributions. 9 | 10 | If you want to parse any MediaWiki with all its weirdness, take a look at [Parse Wiki Text](https://github.com/portstrom/parse_wiki_text) instead. 11 | 12 | ## Currently supported MediaWiki: 13 | 14 | * Text formatting: `''italic'', '''bold''', \LaTex, , ...` 15 | * Paragraphs 16 | * Heading hierarchies 17 | * Lists 18 | * Internal references (files) `[[File.ext|option|caption]]` 19 | * External references `[https://example.com/ example]` 20 | * Tables 21 | * Generic templates `{{name|anon_arg|arg=value}}` 22 | * Galleries 23 | * Generic html tags and comments `content` 24 | 25 | ## Known Limitations 26 | 27 | This project has some known limitations, which might or might not be lifted in the future. 28 | Part of this comes from treating WikiText as a context-free formal language, which is not entirely true. 
29 | 30 | * `{,},[,]` cannot be used in plain text, as they normally indicate special syntax. However, using them in math or `<nowiki>` is fine. 31 | * Indentation is currently not parsed as `pre`. 32 | * Templates are only parsed on a syntactical level; they have no effect on their content whatsoever. 33 | 34 | 35 | ## Example 36 | 37 | Parsing will result in either a syntax tree with position information (mostly omitted here for conciseness): 38 | 39 | Input: 40 | ``` markdown 41 | this is some ''formatted'' [https://example.com example] text. 42 | ``` 43 | Output (as pseudo-YAML): 44 | ``` yaml 45 | --- 46 | type: document 47 | position: ... 48 | content: 49 | - type: paragraph 50 | position: ... 51 | content: 52 | - type: text 53 | position: ... 54 | text: "this is some " 55 | - type: formatted 56 | position: ... 57 | markup: italic 58 | content: 59 | - type: text 60 | position: 61 | start: 62 | offset: 15 63 | line: 1 64 | col: 16 65 | end: 66 | offset: 24 67 | line: 1 68 | col: 25 69 | text: formatted 70 | - type: text 71 | position: ... 72 | text: " " 73 | - type: externalreference 74 | position: ... 75 | target: "https://example.com" 76 | caption: 77 | - type: text 78 | position: ... 79 | text: example 80 | - type: text 81 | position: ... 82 | text: " text." 83 | ``` 84 | 85 | Or a syntax error (here is a pretty representation): 86 | ``` 87 | ERROR in line 1 at column 57: Could not continue to parse, expected one of: ''', [, " 458 | out: 459 | type: document 460 | content: 461 | - type: paragraph 462 | content: 463 | - type: text 464 | text: "bla " 465 | - type: comment 466 | text: " comment ³½}³¹ðđ æđ||đð@³¼¶²{{}} content " 467 | 468 | # The nowiki tag should ignore everything. 
469 | - case: nowiki 470 | input: | 471 | abc {{bla}} ''' 472 | [ 473 | out: 474 | type: document 475 | content: 476 | - type: paragraph 477 | content: 478 | - type: formatted 479 | markup: nowiki 480 | content: 481 | - type: text 482 | text: "abc {{bla}} ''' \n [" 483 | 484 | # Strikethrough text. 485 | - case: strikethrough 486 | input: | 487 | strikethrough text 488 | strikethrough text 489 | out: 490 | type: document 491 | content: 492 | - type: paragraph 493 | content: 494 | - type: formatted 495 | markup: strikethrough 496 | content: 497 | - type: text 498 | text: strikethrough text 499 | - type: text 500 | text: " " 501 | - type: formatted 502 | markup: strikethrough 503 | content: 504 | - type: text 505 | text: strikethrough text 506 | 507 | # Definition markup 508 | - case: definition markup 509 | input: Riemannsumme 510 | out: 511 | type: document 512 | content: 513 | - type: paragraph 514 | content: 515 | - type: htmltag 516 | name: dfn 517 | attributes: 518 | - key: title 519 | value: Riemannsumme 520 | content: 521 | - type: text 522 | text: Riemannsumme 523 | 524 | # Underline text 525 | - case: underline markup 526 | input: Inserted 527 | out: 528 | type: document 529 | content: 530 | - type: paragraph 531 | content: 532 | - type: formatted 533 | markup: underline 534 | content: 535 | - type: text 536 | text: Inserted 537 | 538 | # Fixed-width text 539 | - case: fixed width markup 540 | input: Fixed width text 541 | out: 542 | type: document 543 | content: 544 | - type: paragraph 545 | content: 546 | - type: formatted 547 | markup: code 548 | content: 549 | - type: text 550 | text: Fixed width text 551 | 552 | # Blockquote 553 | - case: blockquote markup 554 | input:
Blockquote
555 | out: 556 | type: document 557 | content: 558 | - type: paragraph 559 | content: 560 | - type: formatted 561 | markup: blockquote 562 | content: 563 | - type: text 564 | text: Blockquote 565 | 566 | # Pre-formatted Text 567 | - case: pre formatted text 568 | input: | 569 |
Text is '''preformatted''' and 
 570 |       ''markups'' '''''cannot''''' be done
571 | out: 572 | type: document 573 | content: 574 | - type: paragraph 575 | content: 576 | - type: formatted 577 | markup: preformatted 578 | content: 579 | - type: text 580 | text: "Text is '''preformatted''' and \n''markups'' '''''cannot''''' be done" 581 | 582 | # A simple list of one item. 583 | - case: list one item 584 | input: "* item" 585 | out: 586 | type: document 587 | content: 588 | - type: list 589 | content: 590 | - type: listitem 591 | kind: unordered 592 | depth: 1 593 | content: 594 | - type: text 595 | text: item 596 | 597 | # A ordered list and a unordered list of one item 598 | - case: two simple lists 599 | input: | 600 | * item 1 601 | 602 | ## item 2 603 | out: 604 | type: document 605 | content: 606 | - type: list 607 | content: 608 | - type: listitem 609 | kind: unordered 610 | depth: 1 611 | content: 612 | - type: text 613 | text: item 1 614 | - type: list 615 | content: 616 | - type: listitem 617 | kind: ordered 618 | depth: 2 619 | content: 620 | - type: text 621 | text: item 2 622 | 623 | 624 | # Star, Fence, Semicolon should be considered as text in an inline context. 
625 | - case: list symbols inline context 626 | input: | 627 | * item 1 #;* bla 628 | abc # def * 629 | out: 630 | type: document 631 | content: 632 | - type: list 633 | content: 634 | - type: listitem 635 | kind: unordered 636 | depth: 1 637 | content: 638 | - type: text 639 | text: "item 1 #;* bla" 640 | - type: paragraph 641 | content: 642 | - type: text 643 | text: "abc # def *" 644 | 645 | # A list with multiple different item types and a paragraph 646 | - case: list diverse items 647 | input: | 648 | : item 1 649 | ; item 11 650 | *** item 2 651 | ## item 3 652 | paragraph 653 | out: 654 | type: document 655 | content: 656 | - type: list 657 | content: 658 | - type: listitem 659 | kind: definition 660 | depth: 1 661 | content: 662 | - type: text 663 | text: item 1 664 | - type: listitem 665 | kind: definitionterm 666 | depth: 1 667 | content: 668 | - type: text 669 | text: item 11 670 | - type: list 671 | content: 672 | - type: listitem 673 | depth: 2 674 | kind: unordered 675 | content: 676 | - type: list 677 | content: 678 | - type: listitem 679 | depth: 3 680 | kind: unordered 681 | content: 682 | - type: text 683 | text: item 2 684 | - type: listitem 685 | depth: 2 686 | kind: ordered 687 | content: 688 | - type: text 689 | text: item 3 690 | - type: paragraph 691 | content: 692 | - type: text 693 | text: paragraph 694 | 695 | # Even inside of templates, lists must start on a new line 696 | - case: list in template 697 | input: | 698 | {{test| 699 | * item 1 700 | * item 2}} 701 | out: 702 | type: document 703 | content: 704 | - type: template 705 | name: 706 | - type: text 707 | text: test 708 | content: 709 | - type: templateargument 710 | name: "1" 711 | value: 712 | - type: list 713 | content: 714 | - type: listitem 715 | kind: unordered 716 | depth: 1 717 | content: 718 | - type: text 719 | text: item 1 720 | - type: listitem 721 | kind: unordered 722 | depth: 1 723 | content: 724 | - type: text 725 | text: item 2 726 | 727 | # Lists cannot be 
started mid-line. 728 | - case: list mod line 729 | input: | 730 | {{test| this * is * a test}} 731 | [[this ** '' * as'' * as well]] 732 | out: 733 | type: document 734 | content: 735 | - type: template 736 | name: 737 | - type: text 738 | text: test 739 | content: 740 | - type: templateargument 741 | name: "1" 742 | value: 743 | - type: text 744 | text: "this * is " 745 | - type: htmltag 746 | name: a 747 | attributes: [] 748 | content: 749 | - type: text 750 | text: " * a " 751 | - type: text 752 | text: " test" 753 | - type: internalreference 754 | target: 755 | - type: text 756 | text: "this ** " 757 | - type: formatted 758 | markup: italic 759 | content: 760 | - type: text 761 | text: " * as" 762 | - type: formatted 763 | markup: underline 764 | content: 765 | - type: text 766 | text: " * as" 767 | - type: text 768 | text: " well" 769 | options: [] 770 | caption: [] 771 | 772 | # A very simple template 773 | - case: simple template 774 | input: "{{name}}" 775 | out: 776 | type: document 777 | content: 778 | - type: template 779 | name: 780 | - type: text 781 | text: name 782 | content: [] 783 | 784 | # A sequence of block templates 785 | - case: block template sequence 786 | input: | 787 | {{name}} 788 | 789 | {{name}} 790 | out: 791 | type: document 792 | content: 793 | - type: template 794 | name: 795 | - type: text 796 | text: name 797 | content: [] 798 | - type: template 799 | name: 800 | - type: text 801 | text: name 802 | content: [] 803 | 804 | # A sequence of inline templates 805 | - case: inline template sequence 806 | input: | 807 | bla {{name}} and a {{name}} 808 | out: 809 | type: document 810 | content: 811 | - type: paragraph 812 | content: 813 | - type: text 814 | text: "bla " 815 | - type: template 816 | name: 817 | - type: text 818 | text: name 819 | content: [] 820 | - type: text 821 | text: " and a " 822 | - type: template 823 | name: 824 | - type: text 825 | text: name 826 | content: [] 827 | 828 | 829 | # A template with list-like name 
830 | - case: template listlike name 831 | input: | 832 | {{:name 833 | 834 | }} 835 | out: 836 | type: document 837 | content: 838 | - type: template 839 | name: 840 | - type: text 841 | text: ":name" 842 | content: [] 843 | 844 | 845 | # A sequence of unnamed template arguments 846 | - case: anonymous attribute sequence 847 | input: | 848 | {{templatename 849 | |attribute2 850 | |atträöüß3 851 | |attribute4 852 | }} 853 | out: 854 | type: document 855 | content: 856 | - type: template 857 | name: 858 | - type: text 859 | text: templatename 860 | content: 861 | - type: templateargument 862 | name: "1" 863 | value: 864 | - type: paragraph 865 | content: 866 | - type: text 867 | text: attribute2 868 | - type: templateargument 869 | name: "2" 870 | value: 871 | - type: paragraph 872 | content: 873 | - type: text 874 | text: atträöüß3 875 | - type: templateargument 876 | name: "3" 877 | value: 878 | - type: paragraph 879 | content: 880 | - type: text 881 | text: attribute4 882 | 883 | # A named argument 884 | - case: named argument 885 | input: "{{name|caption=üäö test}}" 886 | out: 887 | type: document 888 | content: 889 | - type: template 890 | name: 891 | - type: text 892 | text: name 893 | content: 894 | - type: templateargument 895 | name: caption 896 | value: 897 | - type: text 898 | text: üäö test 899 | 900 | # Multiple named template arguments 901 | - case: multiple named template arguments 902 | input: "{{templatename|äöütem=2|item3=3|item4=4}}" 903 | out: 904 | type: document 905 | content: 906 | - type: template 907 | name: 908 | - type: text 909 | text: templatename 910 | content: 911 | - type: templateargument 912 | name: äöütem 913 | value: 914 | - type: text 915 | text: "2" 916 | - type: templateargument 917 | name: item3 918 | value: 919 | - type: text 920 | text: "3" 921 | - type: templateargument 922 | name: item4 923 | value: 924 | - type: text 925 | text: "4" 926 | 927 | # Mixed named and unnamed arguments 928 | - case: mixed template arguments 929 
| input: "{{template1|item2=2|item3=3|item4}}" 930 | out: 931 | type: document 932 | content: 933 | - type: template 934 | name: 935 | - type: text 936 | text: template1 937 | content: 938 | - type: templateargument 939 | name: item2 940 | value: 941 | - type: text 942 | text: "2" 943 | - type: templateargument 944 | name: item3 945 | value: 946 | - type: text 947 | text: "3" 948 | - type: templateargument 949 | name: "1" 950 | value: 951 | - type: text 952 | text: "item4" 953 | 954 | # Nested templates 955 | - case: nested templates 956 | input: "{{Thankyou in {{preferred language}}|signature=Me}}" 957 | out: 958 | type: document 959 | content: 960 | - type: template 961 | name: 962 | - type: text 963 | text: "Thankyou in " 964 | - type: template 965 | name: 966 | - type: text 967 | text: "preferred language" 968 | content: [] 969 | content: 970 | - type: templateargument 971 | name: signature 972 | value: 973 | - type: text 974 | text: Me 975 | 976 | # A simple internal reference 977 | - case: simple internal ref 978 | input: "[[File:Abc]]" 979 | out: 980 | type: document 981 | content: 982 | - type: internalreference 983 | target: 984 | - type: text 985 | text: File:Abc 986 | options: [] 987 | caption: [] 988 | 989 | # An empty internal reference 990 | - case: empty internal ref 991 | input: "[[]]" 992 | out: 993 | type: document 994 | content: 995 | - type: internalreference 996 | target: [] 997 | options: [] 998 | caption: [] 999 | 1000 | # A simple internal reference with caption 1001 | - case: internal ref with caption 1002 | input: "[[File:Abc|this is a caption]]" 1003 | out: 1004 | type: document 1005 | content: 1006 | - type: internalreference 1007 | target: 1008 | - type: text 1009 | text: File:Abc 1010 | options: [] 1011 | caption: 1012 | - type: text 1013 | text: this is a caption 1014 | 1015 | # A simple internal reference with options 1016 | - case: internal ref with options 1017 | input: "[[File:Abc|opt1=value1|opt2=123|this is a caption with 
'''bold and |special|''' markup]]" 1018 | out: 1019 | type: document 1020 | content: 1021 | - type: internalreference 1022 | target: 1023 | - type: text 1024 | text: "File:Abc" 1025 | options: 1026 | - - type: text 1027 | text: "opt1=value1" 1028 | - - type: text 1029 | text: "opt2=123" 1030 | caption: 1031 | - type: text 1032 | text: "this is a caption with " 1033 | - type: formatted 1034 | markup: bold 1035 | content: 1036 | - type: text 1037 | text: "bold and |special|" 1038 | - type: text 1039 | text: " markup" 1040 | 1041 | # Simple table with one cell 1042 | - case: single cell table 1043 | input: | 1044 | {| 1045 | | attributevalue = "test" | test 1046 | |} 1047 | out: 1048 | type: document 1049 | content: 1050 | - type: table 1051 | attributes: [] 1052 | caption_attributes: [] 1053 | caption: [] 1054 | rows: 1055 | - type: tablerow 1056 | attributes: [] 1057 | cells: 1058 | - type: tablecell 1059 | attributes: 1060 | - key: attributevalue 1061 | value: test 1062 | header: false 1063 | content: 1064 | - type: paragraph 1065 | content: 1066 | - type: text 1067 | text: test 1068 | 1069 | # Simple table with caption 1070 | - case: table caption 1071 | input: | 1072 | {| 1073 | |+ caption_attribute=value | this is a ''caption'' 1074 | | attributevalue = "test" | test 1075 | |} 1076 | out: 1077 | type: document 1078 | content: 1079 | - type: table 1080 | attributes: [] 1081 | caption_attributes: 1082 | - key: caption_attribute 1083 | value: value 1084 | caption: 1085 | - type: paragraph 1086 | content: 1087 | - type: text 1088 | text: "this is a " 1089 | - type: formatted 1090 | markup: italic 1091 | content: 1092 | - type: text 1093 | text: "caption" 1094 | rows: 1095 | - type: tablerow 1096 | attributes: [] 1097 | cells: 1098 | - type: tablecell 1099 | attributes: 1100 | - key: attributevalue 1101 | value: test 1102 | header: false 1103 | content: 1104 | - type: paragraph 1105 | content: 1106 | - type: text 1107 | text: test 1108 | 1109 | # simple heading in 
template (mediawiki can't do this) 1110 | - case: table in template 1111 | input: | 1112 | {{test|bla= 1113 | {| 1114 | |+ caption_attribute=value | this is a ''caption'' 1115 | | attributevalue = "test" | test 1116 | |} 1117 | }} 1118 | out: 1119 | type: document 1120 | content: 1121 | - type: template 1122 | name: 1123 | - type: text 1124 | text: test 1125 | content: 1126 | - type: templateargument 1127 | name: bla 1128 | value: 1129 | - type: table 1130 | attributes: [] 1131 | caption_attributes: 1132 | - key: caption_attribute 1133 | value: value 1134 | caption: 1135 | - type: paragraph 1136 | content: 1137 | - type: text 1138 | text: "this is a " 1139 | - type: formatted 1140 | markup: italic 1141 | content: 1142 | - type: text 1143 | text: "caption" 1144 | rows: 1145 | - type: tablerow 1146 | attributes: [] 1147 | cells: 1148 | - type: tablecell 1149 | attributes: 1150 | - key: attributevalue 1151 | value: test 1152 | header: false 1153 | content: 1154 | - type: paragraph 1155 | content: 1156 | - type: text 1157 | text: test 1158 | 1159 | # simple heading in template (with hack used in real mediawiki) 1160 | - case: table in template mediawiki hack 1161 | input: | 1162 | {{test|bla= 1163 | {{(!}} 1164 | {{!+}} caption_attribute=value {{!}} this is a ''caption'' 1165 | {{!}} attributevalue = "test" {{!}} test 1166 | {{!)}} 1167 | }} 1168 | out: 1169 | type: document 1170 | content: 1171 | - type: template 1172 | name: 1173 | - type: text 1174 | text: test 1175 | content: 1176 | - type: templateargument 1177 | name: bla 1178 | value: 1179 | - type: table 1180 | attributes: [] 1181 | caption_attributes: 1182 | - key: caption_attribute 1183 | value: value 1184 | caption: 1185 | - type: paragraph 1186 | content: 1187 | - type: text 1188 | text: "this is a " 1189 | - type: formatted 1190 | markup: italic 1191 | content: 1192 | - type: text 1193 | text: "caption" 1194 | rows: 1195 | - type: tablerow 1196 | attributes: [] 1197 | cells: 1198 | - type: tablecell 1199 | 
attributes: 1200 | - key: attributevalue 1201 | value: test 1202 | header: false 1203 | content: 1204 | - type: paragraph 1205 | content: 1206 | - type: text 1207 | text: test 1208 | 1209 | # Table with multiple rows 1210 | - case: multi row table 1211 | input: | 1212 | {| class="wikitable" 1213 | |+ caption 1214 | |- 1215 | | attributevalue = "test" | test 1216 | |- 1217 | | test ''2'' 1218 | |-style="font-style: italic; color: green;" 1219 | | test3 1220 | | attr4=val | test4 1221 | |} 1222 | out: 1223 | type: document 1224 | content: 1225 | - type: table 1226 | attributes: 1227 | - key: class 1228 | value: wikitable 1229 | caption_attributes: [] 1230 | caption: 1231 | - type: paragraph 1232 | content: 1233 | - type: text 1234 | text: caption 1235 | rows: 1236 | - type: tablerow 1237 | attributes: [] 1238 | cells: 1239 | - type: tablecell 1240 | attributes: 1241 | - key: attributevalue 1242 | value: test 1243 | header: false 1244 | content: 1245 | - type: paragraph 1246 | content: 1247 | - type: text 1248 | text: test 1249 | - type: tablerow 1250 | attributes: [] 1251 | cells: 1252 | - type: tablecell 1253 | attributes: [] 1254 | header: false 1255 | content: 1256 | - type: paragraph 1257 | content: 1258 | - type: text 1259 | text: "test " 1260 | - type: formatted 1261 | markup: italic 1262 | content: 1263 | - type: text 1264 | text: "2" 1265 | - type: tablerow 1266 | attributes: 1267 | - key: style 1268 | value: "font-style: italic; color: green;" 1269 | cells: 1270 | - type: tablecell 1271 | attributes: [] 1272 | header: false 1273 | content: 1274 | - type: paragraph 1275 | content: 1276 | - type: text 1277 | text: "test3" 1278 | - type: tablecell 1279 | attributes: 1280 | - key: attr4 1281 | value: val 1282 | header: false 1283 | content: 1284 | - type: paragraph 1285 | content: 1286 | - type: text 1287 | text: "test4" 1288 | 1289 | # Table with multiple cells on one line 1290 | - case: table inline cells 1291 | input: | 1292 | {| 1293 | | attributevalue = 
"test" | test || cell 2 || || attribute=3 | cell 3 1294 | |} 1295 | out: 1296 | type: document 1297 | content: 1298 | - type: table 1299 | attributes: [] 1300 | caption_attributes: [] 1301 | caption: [] 1302 | rows: 1303 | - type: tablerow 1304 | attributes: [] 1305 | cells: 1306 | - type: tablecell 1307 | attributes: 1308 | - key: attributevalue 1309 | value: test 1310 | header: false 1311 | content: 1312 | - type: text 1313 | text: "test " 1314 | - type: tablecell 1315 | attributes: [] 1316 | header: false 1317 | content: 1318 | - type: text 1319 | text: "cell 2 " 1320 | - type: tablecell 1321 | attributes: [] 1322 | header: false 1323 | content: [] 1324 | - type: tablecell 1325 | attributes: 1326 | - key: attribute 1327 | value: "3" 1328 | header: false 1329 | content: 1330 | - type: paragraph 1331 | content: 1332 | - type: text 1333 | text: cell 3 1334 | 1335 | # Table with header cells 1336 | - case: table header cells 1337 | input: | 1338 | {| 1339 | ! Orange !! attribute="test" | Apple 1340 | |- 1341 | | Bread || Pie !! ''hey!!'' 1342 | ! 
footer 1343 | |} 1344 | out: 1345 | type: document 1346 | content: 1347 | - type: table 1348 | attributes: [] 1349 | caption_attributes: [] 1350 | caption: [] 1351 | rows: 1352 | - type: tablerow 1353 | attributes: [] 1354 | cells: 1355 | - type: tablecell 1356 | header: true 1357 | attributes: [] 1358 | content: 1359 | - type: text 1360 | text: "Orange " 1361 | - type: tablecell 1362 | header: true 1363 | attributes: 1364 | - key: attribute 1365 | value: test 1366 | content: 1367 | - type: paragraph 1368 | content: 1369 | - type: text 1370 | text: Apple 1371 | - type: tablerow 1372 | attributes: [] 1373 | cells: 1374 | - type: tablecell 1375 | header: false 1376 | attributes: [] 1377 | content: 1378 | - type: text 1379 | text: "Bread " 1380 | - type: tablecell 1381 | header: false 1382 | attributes: [] 1383 | content: 1384 | - type: text 1385 | text: "Pie " 1386 | - type: tablecell 1387 | header: true 1388 | attributes: [] 1389 | content: 1390 | - type: paragraph 1391 | content: 1392 | - type: formatted 1393 | markup: italic 1394 | content: 1395 | - type: text 1396 | text: "hey!!" 
1397 | - type: tablecell 1398 | header: true 1399 | attributes: [] 1400 | content: 1401 | - type: paragraph 1402 | content: 1403 | - type: text 1404 | text: footer 1405 | 1406 | # a simple gallery tag 1407 | - case: simple gallery 1408 | input: | 1409 | 1410 | File:Abc 1411 | 1412 | out: 1413 | type: document 1414 | content: 1415 | - type: gallery 1416 | attributes: [] 1417 | content: 1418 | - type: internalreference 1419 | target: 1420 | - type: text 1421 | text: File:Abc 1422 | options: [] 1423 | caption: [] 1424 | 1425 | # an empty gallery 1426 | - case: empty gallery 1427 | input: | 1428 | 1429 | 1430 | out: 1431 | type: document 1432 | content: 1433 | - type: gallery 1434 | attributes: [] 1435 | content: [] 1436 | 1437 | # empty gallery with whitespace 1438 | - case: empty whitespace gallery 1439 | input: " \n \n\n\n \n\t\n " 1440 | out: 1441 | type: document 1442 | content: 1443 | - type: gallery 1444 | attributes: [] 1445 | content: [] 1446 | 1447 | # simple gallery with whitespace 1448 | - case: simple whitespace gallery 1449 | input: " \n \nFile:ABC \n\n \n\t\n " 1450 | out: 1451 | type: document 1452 | content: 1453 | - type: gallery 1454 | attributes: [] 1455 | content: 1456 | - type: internalreference 1457 | target: 1458 | - type: text 1459 | text: "File:ABC " 1460 | caption: [] 1461 | options: [] 1462 | 1463 | # a gallery with figure captions 1464 | - case: caption gallery 1465 | input: | 1466 | 1467 | File:Abc|this is a figure [[caption]] 1468 | File:This is a new file 1469 | 1470 | out: 1471 | type: document 1472 | content: 1473 | - type: gallery 1474 | attributes: [] 1475 | content: 1476 | - type: internalreference 1477 | target: 1478 | - type: text 1479 | text: File:Abc 1480 | options: [] 1481 | caption: 1482 | - type: text 1483 | text: "this is a figure " 1484 | - type: internalreference 1485 | target: 1486 | - type: text 1487 | text: caption 1488 | caption: [] 1489 | options: [] 1490 | - type: internalreference 1491 | target: 1492 | - type: text 
1493 | text: File:This is a new file 1494 | caption: [] 1495 | options: [] 1496 | 1497 | # template with a heading as content 1498 | - case: template with heading content 1499 | input: | 1500 | {{noprint| 1501 | == caption 1502 | content 1503 | }} 1504 | out: 1505 | type: document 1506 | content: 1507 | - type: template 1508 | name: 1509 | - type: text 1510 | text: noprint 1511 | content: 1512 | - type: templateargument 1513 | name: "1" 1514 | value: 1515 | - type: heading 1516 | depth: 2 1517 | caption: 1518 | - type: text 1519 | text: caption 1520 | content: 1521 | - type: paragraph 1522 | content: 1523 | - type: text 1524 | text: content 1525 | 1526 | -------------------------------------------------------------------------------- /linter_notes.md: -------------------------------------------------------------------------------- 1 | - non-incrementing list item depth 2 | - template content should only be content attributes 3 | -------------------------------------------------------------------------------- /src/ast.rs: -------------------------------------------------------------------------------- 1 | /// Data structures describing the parsed document. 2 | 3 | #[cfg(feature = "no_position")] 4 | use serde::{Serialize, SerializeMap, Serializer}; 5 | use serde_derive::{Deserialize, Serialize}; 6 | 7 | /** 8 | * Element types used in the abstract syntax tree (AST). 9 | * 10 | * Each element must keep track of its position in the original 11 | * input document. After parsing, the document tree can be serialized by serde. 
12 | */ 13 | #[derive(Debug, Serialize, Deserialize, PartialEq, Clone)] 14 | #[serde(tag = "type", rename_all = "lowercase", deny_unknown_fields)] 15 | pub enum Element { 16 | Document(Document), 17 | Heading(Heading), 18 | Text(Text), 19 | Formatted(Formatted), 20 | Paragraph(Paragraph), 21 | Template(Template), 22 | TemplateArgument(TemplateArgument), 23 | InternalReference(InternalReference), 24 | ExternalReference(ExternalReference), 25 | ListItem(ListItem), 26 | List(List), 27 | Table(Table), 28 | TableRow(TableRow), 29 | TableCell(TableCell), 30 | Comment(Comment), 31 | HtmlTag(HtmlTag), 32 | Gallery(Gallery), 33 | Error(Error), 34 | } 35 | 36 | /// The document root. 37 | #[derive(Debug, Serialize, Deserialize, PartialEq, Clone)] 38 | #[serde(rename_all = "lowercase", deny_unknown_fields)] 39 | pub struct Document { 40 | #[serde(default)] 41 | pub position: Span, 42 | pub content: Vec, 43 | } 44 | 45 | /// Headings make a hierarchical document structure. 46 | /// Headings of higher depths have other headings as parents. 47 | #[derive(Debug, Serialize, Deserialize, PartialEq, Clone)] 48 | #[serde(rename_all = "lowercase", deny_unknown_fields)] 49 | pub struct Heading { 50 | #[serde(default)] 51 | pub position: Span, 52 | pub depth: usize, 53 | pub caption: Vec, 54 | pub content: Vec, 55 | } 56 | 57 | /// Simple text. 58 | #[derive(Debug, Serialize, Deserialize, PartialEq, Clone)] 59 | #[serde(rename_all = "lowercase", deny_unknown_fields)] 60 | pub struct Text { 61 | #[serde(default)] 62 | pub position: Span, 63 | pub text: String, 64 | } 65 | 66 | /// A formatting wrapper, usually around text. 67 | #[derive(Debug, Serialize, Deserialize, PartialEq, Clone)] 68 | #[serde(rename_all = "lowercase", deny_unknown_fields)] 69 | pub struct Formatted { 70 | #[serde(default)] 71 | pub position: Span, 72 | pub markup: MarkupType, 73 | pub content: Vec, 74 | } 75 | 76 | /// Paragraphs are separated by newlines in the input document. 
77 | #[derive(Debug, Serialize, Deserialize, PartialEq, Clone)] 78 | #[serde(rename_all = "lowercase", deny_unknown_fields)] 79 | pub struct Paragraph { 80 | #[serde(default)] 81 | pub position: Span, 82 | pub content: Vec, 83 | } 84 | 85 | /// A mediawiki template. 86 | #[derive(Debug, Serialize, Deserialize, PartialEq, Clone)] 87 | #[serde(rename_all = "lowercase", deny_unknown_fields)] 88 | pub struct Template { 89 | #[serde(default)] 90 | pub position: Span, 91 | pub name: Vec, 92 | pub content: Vec, 93 | } 94 | 95 | /// Argument of a mediawiki template. 96 | /// An empty name indicates an anonymous argument. 97 | #[derive(Debug, Serialize, Deserialize, PartialEq, Clone)] 98 | #[serde(rename_all = "lowercase", deny_unknown_fields)] 99 | pub struct TemplateArgument { 100 | #[serde(default)] 101 | pub position: Span, 102 | pub name: String, 103 | pub value: Vec, 104 | } 105 | 106 | /// A reference to internal data, such as embedded files 107 | /// or other articles. 108 | #[derive(Debug, Serialize, Deserialize, PartialEq, Clone)] 109 | #[serde(rename_all = "lowercase", deny_unknown_fields)] 110 | pub struct InternalReference { 111 | #[serde(default)] 112 | pub position: Span, 113 | pub target: Vec, 114 | pub options: Vec>, 115 | pub caption: Vec, 116 | } 117 | 118 | /// External reference, usually a hyperlink. 119 | #[derive(Debug, Serialize, Deserialize, PartialEq, Clone)] 120 | #[serde(rename_all = "lowercase", deny_unknown_fields)] 121 | pub struct ExternalReference { 122 | #[serde(default)] 123 | pub position: Span, 124 | pub target: String, 125 | pub caption: Vec, 126 | } 127 | 128 | /// List item of a certain `ListItemKind`. 129 | #[derive(Debug, Serialize, Deserialize, PartialEq, Clone)] 130 | #[serde(rename_all = "lowercase", deny_unknown_fields)] 131 | pub struct ListItem { 132 | #[serde(default)] 133 | pub position: Span, 134 | pub depth: usize, 135 | pub kind: ListItemKind, 136 | pub content: Vec, 137 | } 138 | 139 | /// List of items. 
The `ListItemKind` of its children 140 | /// can be heterogeneous. 141 | #[derive(Debug, Serialize, Deserialize, PartialEq, Clone)] 142 | #[serde(rename_all = "lowercase", deny_unknown_fields)] 143 | pub struct List { 144 | #[serde(default)] 145 | pub position: Span, 146 | pub content: Vec, 147 | } 148 | 149 | /// A mediawiki table. `attributes` represent html 150 | /// attributes assigned to the table. 151 | #[derive(Debug, Serialize, Deserialize, PartialEq, Clone)] 152 | #[serde(rename_all = "lowercase", deny_unknown_fields)] 153 | pub struct Table { 154 | #[serde(default)] 155 | pub position: Span, 156 | pub attributes: Vec, 157 | pub caption: Vec, 158 | pub caption_attributes: Vec, 159 | pub rows: Vec, 160 | } 161 | 162 | /// A table row. `attributes` represent html 163 | /// attributes assigned to the row. 164 | #[derive(Debug, Serialize, Deserialize, PartialEq, Clone)] 165 | #[serde(rename_all = "lowercase", deny_unknown_fields)] 166 | pub struct TableRow { 167 | #[serde(default)] 168 | pub position: Span, 169 | pub attributes: Vec, 170 | pub cells: Vec, 171 | } 172 | 173 | /// A single table cell. `attributes` represent html 174 | /// attributes assigned to the cell. `header` is true 175 | /// if this cell is marked as a header cell. 176 | #[derive(Debug, Serialize, Deserialize, PartialEq, Clone)] 177 | #[serde(rename_all = "lowercase", deny_unknown_fields)] 178 | pub struct TableCell { 179 | #[serde(default)] 180 | pub position: Span, 181 | pub header: bool, 182 | pub attributes: Vec, 183 | pub content: Vec, 184 | } 185 | 186 | /// Comments in the input document. 187 | #[derive(Debug, Serialize, Deserialize, PartialEq, Clone)] 188 | #[serde(rename_all = "lowercase", deny_unknown_fields)] 189 | pub struct Comment { 190 | #[serde(default)] 191 | pub position: Span, 192 | pub text: String, 193 | } 194 | 195 | /// Html tags not encoding formatting elements. 
196 | #[derive(Debug, Serialize, Deserialize, PartialEq, Clone)] 197 | #[serde(rename_all = "lowercase", deny_unknown_fields)] 198 | pub struct HtmlTag { 199 | #[serde(default)] 200 | pub position: Span, 201 | pub name: String, 202 | pub attributes: Vec, 203 | pub content: Vec, 204 | } 205 | 206 | /// Gallery of images (or internal references in general). 207 | #[derive(Debug, Serialize, Deserialize, PartialEq, Clone)] 208 | #[serde(rename_all = "lowercase", deny_unknown_fields)] 209 | pub struct Gallery { 210 | #[serde(default)] 211 | pub position: Span, 212 | pub attributes: Vec, 213 | pub content: Vec, 214 | } 215 | 216 | /// Indicates an erroneous part of the document tree. 217 | #[derive(Debug, Serialize, Deserialize, PartialEq, Clone)] 218 | #[serde(rename_all = "lowercase", deny_unknown_fields)] 219 | pub struct Error { 220 | #[serde(default)] 221 | pub position: Span, 222 | pub message: String, 223 | } 224 | 225 | /// Types of markup a section of text may have. 226 | #[derive(Debug, Serialize, Deserialize, PartialEq, Clone, Copy)] 227 | #[serde(rename_all = "lowercase")] 228 | pub enum MarkupType { 229 | NoWiki, 230 | Bold, 231 | Italic, 232 | Math, 233 | StrikeThrough, 234 | Underline, 235 | Code, 236 | Blockquote, 237 | Preformatted, 238 | } 239 | 240 | /// Kinds of items a list may contain. 241 | #[derive(Debug, Serialize, Deserialize, PartialEq, Clone, Copy)] 242 | #[serde(rename_all = "lowercase")] 243 | pub enum ListItemKind { 244 | Unordered, 245 | Definition, 246 | DefinitionTerm, 247 | Ordered, 248 | } 249 | 250 | /** 251 | * Represents a position in the source document. 252 | * 253 | * The `PartialEq` implementation allows for a "any" position (all zero), which is 254 | * equal to any other position. This is used to reduce clutter in tests, where 255 | * a default Position ("{}") can be used where the actual representation is irrelevant. 
256 | */ 257 | #[derive(Debug, Serialize, Deserialize, Clone)] 258 | #[serde( 259 | rename_all = "lowercase", 260 | default = "Position::any_position", 261 | deny_unknown_fields 262 | )] 263 | pub struct Position { 264 | pub offset: usize, 265 | pub line: usize, 266 | pub col: usize, 267 | } 268 | 269 | /// Holds position information (start and end) for one element 270 | #[derive(Debug, Deserialize, PartialEq, Clone)] 271 | #[cfg_attr(not(feature = "no_position"), derive(Serialize))] 272 | #[serde(rename_all = "lowercase", default = "Span::any", deny_unknown_fields)] 273 | pub struct Span { 274 | pub start: Position, 275 | pub end: Position, 276 | } 277 | 278 | /// Represents a pair of html tag attribute and value. 279 | #[derive(Debug, Serialize, Deserialize, PartialEq, Clone)] 280 | #[serde(rename_all = "lowercase", deny_unknown_fields)] 281 | pub struct TagAttribute { 282 | #[serde(default)] 283 | pub position: Span, 284 | pub key: String, 285 | pub value: String, 286 | } 287 | 288 | /// Position of a source line of code. 289 | #[derive(Debug, Serialize, Deserialize, PartialEq)] 290 | pub struct SourceLine<'input> { 291 | pub start: usize, 292 | pub content: &'input str, 293 | pub end: usize, 294 | } 295 | 296 | impl<'input> SourceLine<'input> { 297 | /// checks if `pos` is at a line start 298 | pub fn starts_line(pos: usize, slocs: &[SourceLine]) -> bool { 299 | for sloc in slocs { 300 | if sloc.start == pos { 301 | return true; 302 | } 303 | } 304 | false 305 | } 306 | } 307 | 308 | impl MarkupType { 309 | /// Match an HTML tag name to it's markup type. 310 | pub fn by_tag_name(tag: &str) -> MarkupType { 311 | match &tag.to_lowercase()[..] 
{ 312 | "math" => MarkupType::Math, 313 | "del" | "s" => MarkupType::StrikeThrough, 314 | "nowiki" => MarkupType::NoWiki, 315 | "u" | "ins" => MarkupType::Underline, 316 | "code" => MarkupType::Code, 317 | "blockquote" => MarkupType::Blockquote, 318 | "pre" => MarkupType::Preformatted, 319 | _ => panic!("markup type lookup not implemented for {}!", tag), 320 | } 321 | } 322 | } 323 | 324 | impl Element { 325 | /// returns the source code position of an element. 326 | pub fn get_position(&self) -> &Span { 327 | match *self { 328 | Element::Document(ref e) => &e.position, 329 | Element::Heading(ref e) => &e.position, 330 | Element::Text(ref e) => &e.position, 331 | Element::Formatted(ref e) => &e.position, 332 | Element::Paragraph(ref e) => &e.position, 333 | Element::Template(ref e) => &e.position, 334 | Element::TemplateArgument(ref e) => &e.position, 335 | Element::InternalReference(ref e) => &e.position, 336 | Element::ExternalReference(ref e) => &e.position, 337 | Element::List(ref e) => &e.position, 338 | Element::ListItem(ref e) => &e.position, 339 | Element::Table(ref e) => &e.position, 340 | Element::TableRow(ref e) => &e.position, 341 | Element::TableCell(ref e) => &e.position, 342 | Element::Comment(ref e) => &e.position, 343 | Element::HtmlTag(ref e) => &e.position, 344 | Element::Gallery(ref e) => &e.position, 345 | Element::Error(ref e) => &e.position, 346 | } 347 | } 348 | 349 | /// returns a mutable reference the source code position of an element. 
350 | pub fn get_position_mut(&mut self) -> &mut Span { 351 | match *self { 352 | Element::Document(ref mut e) => &mut e.position, 353 | Element::Heading(ref mut e) => &mut e.position, 354 | Element::Text(ref mut e) => &mut e.position, 355 | Element::Formatted(ref mut e) => &mut e.position, 356 | Element::Paragraph(ref mut e) => &mut e.position, 357 | Element::Template(ref mut e) => &mut e.position, 358 | Element::TemplateArgument(ref mut e) => &mut e.position, 359 | Element::InternalReference(ref mut e) => &mut e.position, 360 | Element::ExternalReference(ref mut e) => &mut e.position, 361 | Element::List(ref mut e) => &mut e.position, 362 | Element::ListItem(ref mut e) => &mut e.position, 363 | Element::Table(ref mut e) => &mut e.position, 364 | Element::TableRow(ref mut e) => &mut e.position, 365 | Element::TableCell(ref mut e) => &mut e.position, 366 | Element::Comment(ref mut e) => &mut e.position, 367 | Element::HtmlTag(ref mut e) => &mut e.position, 368 | Element::Gallery(ref mut e) => &mut e.position, 369 | Element::Error(ref mut e) => &mut e.position, 370 | } 371 | } 372 | 373 | /// returns the variant name of an element. 
374 | pub fn get_variant_name(&self) -> &str { 375 | match *self { 376 | Element::Document(_) => "Document", 377 | Element::Heading(_) => "Heading", 378 | Element::Text(_) => "Text", 379 | Element::Formatted(_) => "Formatted", 380 | Element::Paragraph(_) => "Paragraph", 381 | Element::Template(_) => "Template", 382 | Element::TemplateArgument(_) => "TemplateArgument", 383 | Element::InternalReference(_) => "InternalReference", 384 | Element::ExternalReference(_) => "ExternalReference", 385 | Element::List(_) => "List", 386 | Element::ListItem(_) => "ListItem", 387 | Element::Table(_) => "Table", 388 | Element::TableRow(_) => "TableRow", 389 | Element::TableCell(_) => "TableCell", 390 | Element::Comment(_) => "Comment", 391 | Element::HtmlTag(_) => "HtmlTag", 392 | Element::Gallery(_) => "Gallery", 393 | Element::Error(_) => "Error", 394 | } 395 | } 396 | } 397 | 398 | impl Position { 399 | pub fn new(offset: usize, slocs: &[SourceLine]) -> Self { 400 | for (i, sloc) in slocs.iter().enumerate() { 401 | if offset >= sloc.start && offset < sloc.end { 402 | return Position { 403 | offset, 404 | line: i + 1, 405 | col: sloc.content[0..offset - sloc.start].chars().count() + 1, 406 | }; 407 | } 408 | } 409 | Position { 410 | offset, 411 | line: slocs.len() + 1, 412 | col: 0, 413 | } 414 | } 415 | 416 | pub fn any_position() -> Self { 417 | Position { 418 | offset: 0, 419 | line: 0, 420 | col: 0, 421 | } 422 | } 423 | } 424 | 425 | impl Span { 426 | pub fn any() -> Self { 427 | Span { 428 | start: Position::any_position(), 429 | end: Position::any_position(), 430 | } 431 | } 432 | 433 | pub fn new(posl: usize, posr: usize, source_lines: &[SourceLine]) -> Self { 434 | Span { 435 | start: Position::new(posl, source_lines), 436 | end: Position::new(posr, source_lines), 437 | } 438 | } 439 | } 440 | 441 | impl Default for Span { 442 | fn default() -> Self { 443 | Self::any() 444 | } 445 | } 446 | 447 | #[cfg(feature = "no_position")] 448 | impl Serialize for Span { 449 | fn 
serialize(&self, serializer: S) -> Result 450 | where 451 | S: Serializer, 452 | { 453 | let map = serializer.serialize_map(None)?; 454 | map.end() 455 | } 456 | } 457 | 458 | impl PartialEq for Position { 459 | fn eq(&self, other: &Position) -> bool { 460 | // comparing with "any" position is always true 461 | if (other.offset == 0 && other.line == 0 && other.col == 0) 462 | || (self.offset == 0 && self.line == 0 && self.col == 0) 463 | { 464 | return true; 465 | } 466 | 467 | self.offset == other.offset && self.line == other.line && self.col == other.col 468 | } 469 | } 470 | 471 | impl TagAttribute { 472 | pub fn new(position: Span, key: String, value: String) -> Self { 473 | TagAttribute { 474 | position, 475 | key, 476 | value, 477 | } 478 | } 479 | } 480 | -------------------------------------------------------------------------------- /src/default_transformations.rs: -------------------------------------------------------------------------------- 1 | use crate::ast::*; 2 | use crate::error::TransformationError; 3 | use crate::transformations::*; 4 | use crate::util; 5 | use std::usize; 6 | 7 | /// Settings for general transformations. 8 | pub struct GeneralSettings {} 9 | 10 | /// Moves flat headings into a hierarchical structure based on their depth. 11 | pub fn fold_headings_transformation(mut root: Element, settings: &GeneralSettings) -> TResult { 12 | // append following deeper headings than current_depth in content to the result list. 13 | fn move_deeper_headings<'a>( 14 | trans: &TFuncInplace<&'a GeneralSettings>, 15 | root_content: &mut Vec, 16 | settings: &'a GeneralSettings, 17 | ) -> TListResult { 18 | let mut result = vec![]; 19 | let mut current_heading_index = 0; 20 | 21 | // current maximum depth level, every deeper heading will be moved 22 | let mut current_depth = usize::MAX; 23 | 24 | for child in root_content.drain(..) 
{ 25 | if let Element::Heading(cur_heading) = child { 26 | if cur_heading.depth > current_depth { 27 | let last = result.get_mut(current_heading_index); 28 | if let Some(&mut Element::Heading(ref mut e)) = last { 29 | e.content.push(Element::Heading(cur_heading)); 30 | } 31 | } else { 32 | // pick a new reference heading if the new one 33 | // is equally deep or more shallow 34 | current_heading_index = result.len(); 35 | current_depth = cur_heading.depth; 36 | result.push(Element::Heading(cur_heading)); 37 | } 38 | } else { 39 | if current_depth < usize::MAX { 40 | return Err(TransformationError { 41 | cause: "a non-heading element was found after a heading. \ 42 | This should not happen." 43 | .to_string(), 44 | position: child.get_position().clone(), 45 | transformation_name: String::from("fold_headings_transformation"), 46 | tree: child.clone(), 47 | }); 48 | } 49 | result.push(child); 50 | } 51 | } 52 | 53 | // recurse transformation 54 | result = apply_func_drain(trans, &mut result, settings)?; 55 | Ok(result) 56 | }; 57 | root = recurse_inplace_template( 58 | &fold_headings_transformation, 59 | root, 60 | settings, 61 | &move_deeper_headings, 62 | )?; 63 | Ok(root) 64 | } 65 | 66 | /// Moves list items of higher depth into separate sub-lists. 67 | /// If a list is started with a deeper item than one, this transformation still applies, 68 | /// although this should later be a linter error. 69 | pub fn fold_lists_transformation(mut root: Element, settings: &GeneralSettings) -> TResult { 70 | // move list items which are deeper than the current level into new sub-lists. 71 | fn move_deeper_items<'a>( 72 | trans: &TFuncInplace<&'a GeneralSettings>, 73 | root_content: &mut Vec, 74 | settings: &'a GeneralSettings, 75 | ) -> TListResult { 76 | // the currently least deep list item, every deeper 77 | // list item will be moved to a new sublist 78 | let mut lowest_depth = usize::MAX; 79 | for child in &root_content[..] 
{ 80 | if let Element::ListItem(ref e) = *child { 81 | if e.depth < lowest_depth { 82 | lowest_depth = e.depth; 83 | } 84 | } else { 85 | return Err(TransformationError { 86 | cause: String::from("A list should not contain non-listitems."), 87 | transformation_name: String::from("fold_lists_transformation"), 88 | position: child.get_position().clone(), 89 | tree: child.clone(), 90 | }); 91 | } 92 | } 93 | 94 | let mut result = vec![]; 95 | // create a new sublist when encountering a lower item 96 | let mut create_sublist = true; 97 | 98 | for child in root_content.drain(..) { 99 | if let Element::ListItem(cur_item) = child { 100 | if cur_item.depth > lowest_depth { 101 | // this error is returned if the sublist to append to was not found 102 | let build_found_error = |origin: &ListItem| TransformationError { 103 | cause: "sublist was not instantiated properly.".into(), 104 | transformation_name: "fold_lists_transformation".into(), 105 | position: origin.position.clone(), 106 | tree: Element::ListItem(origin.clone()), 107 | }; 108 | 109 | if create_sublist { 110 | // create a new sublist 111 | create_sublist = false; 112 | 113 | if result.is_empty() { 114 | result.push(Element::ListItem(ListItem { 115 | position: cur_item.position.clone(), 116 | depth: lowest_depth, 117 | kind: cur_item.kind, 118 | content: vec![], 119 | })); 120 | } 121 | if let Some(&mut Element::ListItem(ref mut last)) = result.last_mut() { 122 | last.content.push(Element::List(List { 123 | position: cur_item.position.clone(), 124 | content: vec![], 125 | })); 126 | } else { 127 | return Err(build_found_error(&cur_item)); 128 | } 129 | } 130 | 131 | if let Some(&mut Element::ListItem(ref mut item)) = result.last_mut() { 132 | if let Some(&mut Element::List(ref mut l)) = item.content.last_mut() { 133 | l.content.push(Element::ListItem(cur_item)); 134 | } else { 135 | return Err(build_found_error(&cur_item)); 136 | } 137 | } else { 138 | return Err(build_found_error(&cur_item)); 139 | } 140 | } 
else { 141 | result.push(Element::ListItem(cur_item)); 142 | create_sublist = true; 143 | } 144 | } else { 145 | result.push(child); 146 | }; 147 | } 148 | result = apply_func_drain(trans, &mut result, settings)?; 149 | Ok(result) 150 | }; 151 | 152 | if let Element::List { .. } = root { 153 | root = recurse_inplace_template( 154 | &fold_lists_transformation, 155 | root, 156 | settings, 157 | &move_deeper_items, 158 | )?; 159 | } else { 160 | root = recurse_inplace(&fold_lists_transformation, root, settings)?; 161 | }; 162 | Ok(root) 163 | } 164 | 165 | /// Transform whitespace-only paragraphs to empty paragraphs. 166 | pub fn whitespace_paragraphs_to_empty(mut root: Element, settings: &GeneralSettings) -> TResult { 167 | if let Element::Paragraph(ref mut par) = root { 168 | let mut is_only_whitespace = true; 169 | for child in &par.content[..] { 170 | if let Element::Text(ref text) = *child { 171 | if !util::is_whitespace(&text.text) { 172 | is_only_whitespace = false; 173 | break; 174 | } 175 | } else { 176 | is_only_whitespace = false; 177 | break; 178 | } 179 | } 180 | if is_only_whitespace { 181 | par.content.drain(..); 182 | } 183 | }; 184 | root = recurse_inplace(&whitespace_paragraphs_to_empty, root, settings)?; 185 | Ok(root) 186 | } 187 | 188 | /// Reduce consecutive paragraphs and absorb trailing text into one, 189 | /// if not separated by a blank paragraph. 190 | pub fn collapse_paragraphs( 191 | mut root: Element, 192 | settings: &GeneralSettings, 193 | ) -> Result { 194 | fn squash_empty_paragraphs<'a>( 195 | trans: &TFuncInplace<&'a GeneralSettings>, 196 | root_content: &mut Vec, 197 | settings: &'a GeneralSettings, 198 | ) -> TListResult { 199 | let mut result = vec![]; 200 | let mut last_empty = false; 201 | 202 | for mut child in root_content.drain(..) 
{ 203 | if let Element::Paragraph(ref mut par) = child { 204 | if par.content.is_empty() { 205 | last_empty = true; 206 | continue; 207 | } 208 | 209 | // if the last paragraph was not empty, append to it. 210 | if !last_empty { 211 | if let Some(&mut Element::Paragraph(ref mut last)) = result.last_mut() { 212 | // Add a space on line break 213 | last.content.push(Element::Text(Text { 214 | text: " ".into(), 215 | position: last.position.clone(), 216 | })); 217 | last.content.append(&mut par.content); 218 | last.position.end = par.position.end.clone(); 219 | continue; 220 | } 221 | } 222 | }; 223 | 224 | result.push(child); 225 | last_empty = false; 226 | } 227 | result = apply_func_drain(trans, &mut result, settings)?; 228 | Ok(result) 229 | } 230 | root = recurse_inplace_template( 231 | &collapse_paragraphs, 232 | root, 233 | settings, 234 | &squash_empty_paragraphs, 235 | )?; 236 | Ok(root) 237 | } 238 | 239 | /// Collapse consecutive text tags into one, removing duplicate whitespace. 240 | pub fn collapse_consecutive_text( 241 | mut root: Element, 242 | settings: &GeneralSettings, 243 | ) -> Result { 244 | fn squash_text<'a>( 245 | trans: &TFuncInplace<&'a GeneralSettings>, 246 | root_content: &mut Vec, 247 | settings: &'a GeneralSettings, 248 | ) -> TListResult { 249 | let mut result = vec![]; 250 | 251 | for mut child in root_content.drain(..) 
{
            if let Element::Text(ref mut text) = child {
                if let Some(&mut Element::Text(ref mut last)) = result.last_mut() {
                    if util::is_whitespace(&text.text) {
                        last.text.push(' ');
                    } else {
                        last.text.push_str(&text.text);
                    }
                    last.position.end = text.position.end.clone();
                    continue;
                }
            };
            result.push(child);
        }
        result = apply_func_drain(trans, &mut result, settings)?;
        Ok(result)
    }
    root = recurse_inplace_template(&collapse_consecutive_text, root, settings, &squash_text)?;
    Ok(root)
}

/// Enumerate anonymous template arguments as "1", "2", ...
pub fn enumerate_anon_args(mut root: Element, settings: &GeneralSettings) -> TResult {
    if let Element::Template(ref mut template) = root {
        let mut counter = 1;
        for child in &mut template.content {
            if let Element::TemplateArgument(ref mut arg) = *child {
                // an argument is anonymous if its name is empty or whitespace-only
                if arg.name.trim().is_empty() {
                    arg.name.clear();
                    arg.name.push_str(&counter.to_string());
                    counter += 1;
                }
            }
        }
    };
    recurse_inplace(&enumerate_anon_args, root, settings)
}

// taken from https://github.com/portstrom/parse_wiki_text/blob/master/src/default.rs
const PROTOCOLS: [&str; 28] = [
    "//",
    "bitcoin:",
    "ftp://",
    "ftps://",
    "geo:",
    "git://",
    "gopher://",
    "http://",
    "https://",
    "irc://",
    "ircs://",
    "magnet:",
    "mailto:",
    "mms://",
    "news:",
    "nntp://",
    "redis://",
    "sftp://",
    "sip:",
    "sips:",
    "sms:",
    "ssh://",
    "svn://",
    "tel:",
    "telnet://",
    "urn:",
    "worldwind://",
    "xmpp:",
];

/// only keep external references with actual urls
///
/// An external reference whose trimmed target starts with one of `PROTOCOLS`
/// is kept, with its target replaced by the trimmed form. Any other external
/// reference is replaced in the element list by a text element `[` + target,
/// followed by the reference's caption elements, followed by a text
/// element `]`.
pub fn validate_external_refs(mut root: Element, settings: &GeneralSettings) -> TResult {
    fn validate_erefs_vec<'a>(
        trans: &TFuncInplace<&'a GeneralSettings>,
        root_content: &mut Vec<Element>,
        settings: &'a GeneralSettings,
    ) -> TListResult {
        let mut result = vec![];

        for mut child in root_content.drain(..) {
            if let Element::ExternalReference(ref mut eref) = child {
                let is_uri = {
                    // trim once instead of once per protocol
                    let target = eref.target.trim();
                    PROTOCOLS.iter().any(|p| target.starts_with(p))
                };
                if is_uri {
                    eref.target = eref.target.trim().to_string();
                    result.push(child);
                } else {
                    // the opening text spans from the start of the reference up to
                    // the first caption element (or to the reference end if there
                    // is no caption).
                    let opening_end = eref
                        .caption
                        .first()
                        .map(|c| c.get_position().start.clone())
                        .unwrap_or_else(|| eref.position.end.clone());
                    result.push(Element::Text(Text {
                        position: Span {
                            start: eref.position.start.clone(),
                            end: opening_end,
                        },
                        text: format!("[{}", eref.target),
                    }));
                    result.append(&mut eref.caption);

                    // the closing bracket starts one character before the reference end.
                    // Saturating arithmetic keeps a zeroed ("any") position from
                    // underflowing, which would panic in debug builds.
                    let mut closing_start = eref.position.end.clone();
                    closing_start.col = closing_start.col.saturating_sub(1);
                    closing_start.offset = closing_start.offset.saturating_sub(1);
                    result.push(Element::Text(Text {
                        position: Span {
                            start: closing_start,
                            end: eref.position.end.clone(),
                        },
                        text: "]".to_string(),
                    }));
                }
            } else {
                result.push(child);
            }
        }
        result = apply_func_drain(trans, &mut result, settings)?;
        Ok(result)
    }
    root = recurse_inplace_template(&validate_external_refs, root, settings, &validate_erefs_vec)?;
    Ok(root)
}

--------------------------------------------------------------------------------
/src/error.rs:
--------------------------------------------------------------------------------
//! Error structures

use crate::ast::{Element, Position, Span};
use crate::grammar;
use crate::util::{get_source_lines, is_whitespace, shorten_str};
use colored::*;
use serde_derive::{Deserialize, Serialize};
use std::error;
use std::fmt;

/// The number of lines to display as error context.
const ERROR_CONTEXT_LINES: usize = 5;

/// Generic error type for high-level errors of this library.
15 | #[derive(Debug, Serialize, Deserialize, PartialEq)] 16 | #[serde(rename_all = "lowercase", deny_unknown_fields)] 17 | pub enum MWError { 18 | ParseError(ParseError), 19 | TransformationError(TransformationError), 20 | } 21 | 22 | /// The parser error with source code context. 23 | #[derive(Debug, Serialize, Deserialize, PartialEq)] 24 | #[serde(rename_all = "lowercase", deny_unknown_fields)] 25 | pub struct ParseError { 26 | pub position: Position, 27 | pub expected: Vec, 28 | pub context: Vec, 29 | pub context_start: usize, 30 | pub context_end: usize, 31 | } 32 | 33 | /// Error structure for syntax tree transformations. 34 | #[derive(Debug, Serialize, Deserialize, PartialEq)] 35 | #[serde(rename_all = "lowercase", deny_unknown_fields)] 36 | pub struct TransformationError { 37 | pub cause: String, 38 | pub position: Span, 39 | pub transformation_name: String, 40 | pub tree: Element, 41 | } 42 | 43 | impl ParseError { 44 | pub fn from(err: &grammar::ParseError, input: &str) -> Self { 45 | let source_lines = get_source_lines(input); 46 | let line_count = source_lines.len(); 47 | 48 | let line = if err.line <= line_count { 49 | err.line 50 | } else { 51 | source_lines.len() 52 | } - 1; 53 | 54 | let start = if line < ERROR_CONTEXT_LINES { 55 | 0 56 | } else { 57 | line - ERROR_CONTEXT_LINES 58 | }; 59 | 60 | let end = if line + ERROR_CONTEXT_LINES >= line_count { 61 | line_count - 1 62 | } else { 63 | line + ERROR_CONTEXT_LINES 64 | }; 65 | 66 | let mut token_str = vec![]; 67 | for token in &err.expected { 68 | token_str.push(String::from(*token)); 69 | } 70 | 71 | let mut context = vec![]; 72 | for sloc in source_lines[start..=end].iter() { 73 | context.push(String::from(sloc.content)); 74 | } 75 | 76 | ParseError { 77 | position: Position::new(err.offset, &source_lines), 78 | context, 79 | expected: token_str, 80 | context_start: start, 81 | context_end: end, 82 | } 83 | } 84 | } 85 | 86 | impl error::Error for ParseError { 87 | fn description(&self) -> &str { 
88 | "Could not continue to parse, because no rules could be matched." 89 | } 90 | } 91 | 92 | impl fmt::Display for ParseError { 93 | fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { 94 | let error_message = format!( 95 | "ERROR in line {} at column {}: Could not continue to parse, expected one of: ", 96 | self.position.line, self.position.col 97 | ) 98 | .red() 99 | .bold(); 100 | 101 | let mut token_str = vec![]; 102 | for token in &self.expected { 103 | if is_whitespace(token) { 104 | token_str.push(format!("{:?}", token)); 105 | } else { 106 | token_str.push(token.to_string()); 107 | } 108 | } 109 | 110 | write!(f, "{}", error_message)?; 111 | writeln!(f, "{}", token_str.join(", ").blue().bold())?; 112 | 113 | for (i, content) in self.context.iter().enumerate() { 114 | let lineno = format!("{} |", self.context_start + i + 1); 115 | let lineno_col; 116 | 117 | let formatted_content; 118 | // the erroneous line 119 | if self.context_start + i + 1 == self.position.line { 120 | formatted_content = content.red(); 121 | lineno_col = lineno.red().bold(); 122 | } else { 123 | formatted_content = shorten_str(content).normal(); 124 | lineno_col = lineno.blue().bold() 125 | } 126 | 127 | writeln!(f, "{} {}", lineno_col, formatted_content)?; 128 | } 129 | 130 | Ok(()) 131 | } 132 | } 133 | 134 | impl error::Error for TransformationError { 135 | fn description(&self) -> &str { 136 | &self.cause 137 | } 138 | } 139 | 140 | impl fmt::Display for TransformationError { 141 | fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { 142 | let message = format!( 143 | "ERROR applying transformation \"{}\" to Elemtn at {}:{} to {}:{}: {}", 144 | self.transformation_name, 145 | self.position.start.line, 146 | self.position.start.col, 147 | self.position.end.line, 148 | self.position.end.col, 149 | self.cause 150 | ); 151 | writeln!(f, "{}", message.red().bold()) 152 | } 153 | } 154 | 155 | impl error::Error for MWError { 156 | fn description(&self) -> &str 
{ 157 | match *self { 158 | MWError::ParseError(ref e) => e.description(), 159 | MWError::TransformationError(ref e) => e.description(), 160 | } 161 | } 162 | } 163 | 164 | impl fmt::Display for MWError { 165 | fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { 166 | match *self { 167 | MWError::ParseError(ref e) => write!(f, "{}", e), 168 | MWError::TransformationError(ref e) => write!(f, "{}", e), 169 | } 170 | } 171 | } 172 | -------------------------------------------------------------------------------- /src/grammar.rs: -------------------------------------------------------------------------------- 1 | include!(concat!(env!("OUT_DIR"), "/grammar.rs")); 2 | -------------------------------------------------------------------------------- /src/grammar.rustpeg: -------------------------------------------------------------------------------- 1 | use crate::ast::*; 2 | use crate::util::combine; 3 | 4 | #![arguments(source_lines: &[SourceLine])] 5 | 6 | // the main document entry point. 7 | pub document -> Element 8 | = posl:#position top:paragraph* content:heading* EOF posr:#position 9 | { 10 | let mut res = top; 11 | let mut content = content; 12 | res.append(&mut content); 13 | 14 | Element::Document(Document { 15 | position: Span::new(posl, posr, source_lines), 16 | content: res, 17 | }) 18 | } 19 | 20 | head_fmt -> Element 21 | = FormattedTextTemplate> 22 | 23 | // A heading is a caption paragraph followed by content paragraphs. 24 | heading -> Element 25 | = posl:#position d:$('='+) _ cap:head_fmt* _ '='* _ (nl / EOF) pars:paragraph* posr:#position 26 | { 27 | Element::Heading(Heading { 28 | position: Span::new(posl, posr, source_lines), 29 | depth: d.len(), 30 | caption: cap, 31 | content: pars, 32 | }) 33 | } 34 | 35 | // a paragraph is a block element, some or no text followed by a newline. 36 | // The fmt_rule parameter is only applied to plain top-level text. All nested formatting 37 | // uses the standard formatted rule. 
This keeps formatted text or html tags from beeing 38 | // ripped apart. 39 | ParagraphTemplate 40 | = list 41 | / table 42 | / gallery 43 | / (t:template _ (nl / EOF) {t}) 44 | / (i:internal_ref _ (nl / EOF) {i}) 45 | / (c:html_comment _ (nl / EOF) {c}) 46 | / posl:#position PAR_START_GUARD text:fmt_rule* _ (nl / EOF) posr:#position 47 | { 48 | Element::Paragraph(Paragraph { 49 | position: Span::new(posl, posr, source_lines), 50 | content: text, 51 | }) 52 | } 53 | 54 | // the standard paragraph 55 | paragraph -> Element 56 | = ParagraphTemplate 57 | 58 | 59 | // === Template parsing === 60 | template_fmt -> Element 61 | = FormattedTextTemplate> 62 | template_par -> Element 63 | = ParagraphTemplate 64 | 65 | // mediawiki templates have a name followed by a sequence of arguments. 66 | template -> Element 67 | = posl:#position !(MAGIC_WORDS) "{{" ws n:(template_fmt)* ws 68 | attrs:('|' t:template_arg {t})* "}}" posr:#position 69 | { 70 | Element::Template(Template { 71 | position: Span::new(posl, posr, source_lines), 72 | name: n, 73 | content: attrs 74 | }) 75 | } 76 | 77 | template_arg -> Element 78 | = posl:#position ws name:(n:template_arg_name ws '=' {n})? ws 79 | value:(h:heading* p:template_par* f:template_fmt* {(h, (p, f))}) posr:#position 80 | { 81 | Element::TemplateArgument(TemplateArgument { 82 | position: Span::new(posl, posr, source_lines), 83 | name: name.unwrap_or_default(), 84 | value: combine((value.0, combine(value.1))) 85 | }) 86 | } 87 | 88 | 89 | // === mediawiki lists === 90 | list -> Element 91 | = posl:#position items:(li:list_item ++ (nl / EOF) {li}) nl? 
posr:#position 92 | { 93 | Element::List(List { 94 | position: Span::new(posl, posr, source_lines), 95 | content: items, 96 | }) 97 | } 98 | 99 | list_item -> Element 100 | = posl:#position s:$([*#:;]+) _ content:formatted* _ posr:#position 101 | { 102 | let kind = match s.chars().last() { 103 | Some('*') => ListItemKind::Unordered, 104 | Some('#') => ListItemKind::Ordered, 105 | Some(':') => ListItemKind::Definition, 106 | Some(';') => ListItemKind::DefinitionTerm, 107 | _ => panic!("undefined list start: {:?} \ 108 | this is an implementation error!", s.chars().last()) 109 | }; 110 | Element::ListItem(ListItem { 111 | position: Span::new(posl, posr, source_lines), 112 | depth: s.len(), 113 | kind, 114 | content, 115 | }) 116 | } 117 | 118 | 119 | // === mediawiki tables === 120 | 121 | table_start = "{|" / "{{(!}}" 122 | table_end = "|}" / "{{!)}}" 123 | table_caption_sep = "|+" / "{{!+}}" 124 | table_row_sep = "|-" / "{{!-}}" 125 | table_pipe = '|' / "{{!}}" 126 | cell_sep -> &'input str 127 | = $("||") / $("!!") / $('|') / $('!') / $("{{!}}") / $("{{!!}}") 128 | 129 | table -> Element 130 | = posl:#position table_start attr:table_attrs? ws caption:table_caption? 
131 | first_cells:table_cell* rows:table_row* table_end posr:#position 132 | { 133 | let (cap_attrs, cap_pars) = caption.unwrap_or_default(); 134 | let mut rows = rows; 135 | if first_cells.len() > 0 { 136 | rows.insert(0, Element::TableRow(TableRow { 137 | position: Span::new(0, 0, source_lines), 138 | cells: first_cells, 139 | attributes: vec![], 140 | })); 141 | } 142 | 143 | Element::Table(Table { 144 | position: Span::new(posl, posr, source_lines), 145 | rows, 146 | attributes: attr.unwrap_or_default(), 147 | caption: cap_pars, 148 | caption_attributes: cap_attrs, 149 | }) 150 | } 151 | 152 | table_attrs -> Vec 153 | = _ attr:(html_attr ** (whitespace+)) _ {attr} 154 | table_fmt -> Element 155 | = !(cell_sep) FormattedTextTemplate> 156 | table_par -> Element 157 | = ParagraphTemplate 158 | 159 | table_caption -> (Vec, Vec) 160 | = table_caption_sep _ attr:(t:table_attrs table_pipe {t})? _ 161 | pars:(p:table_par* f:table_fmt* {combine((p, f))}) 162 | { 163 | (attr.unwrap_or_default(), pars) 164 | } 165 | 166 | 167 | row_sep -> Vec 168 | = table_row_sep attr:table_attrs nl {attr} 169 | 170 | table_row -> Element 171 | = posl:#position !(table_end) sep:row_sep c:table_cell* posr:#position 172 | { 173 | Element::TableRow(TableRow { 174 | position: Span::new(posl, posr, source_lines), 175 | cells: c, 176 | attributes: sep, 177 | }) 178 | } 179 | 180 | table_cell -> Element 181 | = posl:#position !(table_end / row_sep) sep:cell_sep 182 | attr:(a:table_attrs table_pipe !(table_pipe) {a})? 
183 | _ content:(p:table_par* f:table_fmt* {combine((p, f))}) posr:#position 184 | { 185 | Element::TableCell(TableCell { 186 | position: Span::new(posl, posr, source_lines), 187 | content, 188 | attributes: attr.unwrap_or_default(), 189 | header: sep.starts_with('!'), 190 | }) 191 | } 192 | 193 | // === References === 194 | 195 | // internal references, may have pipe-separated options 196 | iref_fmt -> Element 197 | = FormattedTextTemplate> 198 | iref_par -> Element 199 | = ParagraphTemplate 200 | 201 | internal_ref -> Element 202 | = posl:#position "[[" _ tar:iref_fmt* _ "|"? _ t:(pars:iref_par* _ fmts:iref_fmt* {(pars, fmts)}) ++ (_ '|' _) "]]" posr:#position 203 | { 204 | let mut t = t; 205 | let mut t: Vec> = t.drain(..).map(combine).collect(); 206 | Element::InternalReference(InternalReference { 207 | position: Span::new(posl, posr, source_lines), 208 | target: tar, 209 | caption: t.pop().unwrap_or_default(), 210 | options: t, 211 | }) 212 | } 213 | 214 | // external references (hyperlink) with only url and optional caption 215 | external_ref -> Element 216 | = posl:#position '[' u:url ws:_ cap:formatted* ']' posr:#position 217 | { 218 | Element::ExternalReference(ExternalReference { 219 | position: Span::new(posl, posr, source_lines), 220 | target: format!("{}{}", u, ws), 221 | caption: cap 222 | }) 223 | } 224 | 225 | // === Galleries === 226 | 227 | gallery_sep = (_ nl _)+ 228 | 229 | gallery_file -> Element 230 | = flp:#position content:(f:iref_fmt+ {f}) ++ '|' frp:#position 231 | { 232 | let mut content = content; 233 | Element::InternalReference(InternalReference { 234 | position: Span::new(flp, frp, source_lines), 235 | target: content.remove(0), 236 | caption: content.pop().unwrap_or_default(), 237 | options: content, 238 | }) 239 | } 240 | 241 | gallery -> Element 242 | = posl:#position attr:TagOpen<"gallery"i> 243 | ws files:(gallery_file ** gallery_sep) ws 244 | TagClose<"gallery"i> posr:#position 245 | { 246 | Element::Gallery(Gallery { 247 | 
position: Span::new(posl, posr, source_lines), 248 | attributes: attr.1, 249 | content: files, 250 | }) 251 | } 252 | 253 | // === Inline markup === 254 | 255 | // quoted formatted text cannot start with a single quote, except they are "Included" 256 | QuoteFormattedTemplate = text:((!('\'') t:formatted {t}) / included) {text} 257 | 258 | // quote formatting cannot be nested into it self 259 | strong_formatted -> Element 260 | = QuoteFormattedTemplate 261 | emph_formatted -> Element 262 | = QuoteFormattedTemplate 263 | strong_par -> Element 264 | = !(list / table / gallery) e:ParagraphTemplate {e} 265 | emph_par -> Element 266 | = !(list / table / gallery) e:ParagraphTemplate {e} 267 | 268 | strong -> Element 269 | = posl:#position strong_lit 270 | inner:(strong_par / strong_formatted)+ 271 | strong_lit posr:#position 272 | { 273 | Element::Formatted(Formatted { 274 | position: Span::new(posl, posr, source_lines), 275 | content: inner, 276 | markup: MarkupType::Bold 277 | }) 278 | } 279 | 280 | emph -> Element 281 | = posl:#position emph_lit 282 | inner:(emph_par / emph_formatted)+ 283 | emph_lit posr:#position 284 | { 285 | Element::Formatted(Formatted { 286 | position: Span::new(posl, posr, source_lines), 287 | content: inner, 288 | markup: MarkupType::Italic 289 | }) 290 | } 291 | 292 | // html markup 293 | math -> Element 294 | = inner:MarkupTag<"math"i, math_text*> {inner} 295 | strike_through -> Element 296 | = inner:MarkupTag<"del"i, p:paragraph* f:formatted* {combine((p, f))}> {inner} 297 | / inner:MarkupTag<"s"i, p:paragraph* f:formatted* {combine((p, f))}> {inner} 298 | underline -> Element 299 | = inner:MarkupTag<"ins"i, p:paragraph* f:formatted* {combine((p, f))}> {inner} 300 | / inner:MarkupTag<"u"i, p:paragraph* f:formatted* {combine((p, f))}> {inner} 301 | nowiki -> Element 302 | = inner:MarkupTag<"nowiki"i, nowiki_text*> {inner} 303 | code -> Element 304 | = inner:MarkupTag<"code"i, code_text*> {inner} 305 | blockquote -> Element 306 | = 
inner:MarkupTag<"blockquote"i, p:paragraph* f:formatted* {combine((p, f))}> {inner} 307 | pre_formatted -> Element 308 | = inner:MarkupTag<"pre"i, preformatted_text*> {inner} 309 | 310 | 311 | // Template for formatted text with a specific rule for plain text. 312 | FormattedTextTemplate 313 | = fmt:( 314 | text_rule 315 | / strong 316 | / emph 317 | / template 318 | / internal_ref 319 | / external_ref 320 | 321 | / html_comment 322 | / math 323 | / nowiki 324 | / strike_through 325 | / underline 326 | / code 327 | / blockquote 328 | / pre_formatted 329 | 330 | / any_tag 331 | / whitespace_elem 332 | ) {fmt} 333 | 334 | // Standard text element for most contexts 335 | formatted -> Element 336 | = f:FormattedTextTemplate {f} 337 | 338 | 339 | // === embedded html === 340 | 341 | html_attr -> TagAttribute 342 | = posl:#position key:tag_name _ '=' _ value:(quoted_text / tag_safe_literal) posr:#position 343 | { 344 | TagAttribute::new(Span::new(posl, posr, source_lines), key, value) 345 | } 346 | 347 | TagInner 348 | = n:name _ attrs:(a:html_attr _ {a})* {(n, attrs)} 349 | TagOpen 350 | = #quiet<'<' _ inner:TagInner _ '>' {inner}> / #expected("opening html tag") 351 | TagClose 352 | = #quiet<('<' _ '/' _ TagInner _ '>') / '<' _ '/' _ '>'> / #expected("closing html tag") 353 | 354 | // a generic html tag (self-closing or with inner elements) 355 | HtmlTag 356 | = (tag:TagOpen i:inner TagClose {(tag.0, tag.1, i)}) 357 | / ("<" _ tag:TagInner _ "/" _ ">" {(tag.0, tag.1, vec![])}) 358 | 359 | any_open 360 | = TagOpen {()} 361 | any_close 362 | = TagClose {()} 363 | 364 | // matches any valid html tag (except builtins like "gallery") 365 | // with inner Text / Paragraph / Heading, creating a HtmlTag Element. 
366 | any_tag -> Element 367 | = posl:#position 368 | t:HtmlTag<(!HTML_BLOCK_ELEMENTS n:tag_name {n}), p:paragraph* f:formatted* h:heading* {combine((p, combine((f, h))))}> 369 | posr:#position 370 | { 371 | Element::HtmlTag(HtmlTag { 372 | position: Span::new(posl, posr, source_lines), 373 | name: t.0, 374 | attributes: t.1, 375 | content: t.2 376 | }) 377 | } 378 | 379 | // macro for simple formatting markup tags. Matches markup type by tag name (see ast.rs) 380 | MarkupTag 381 | = posl:#position tag_info:HtmlTag<$(name), inner> posr:#position 382 | { 383 | Element::Formatted(Formatted { 384 | position: Span::new(posl, posr, source_lines), 385 | content: tag_info.2, 386 | markup: MarkupType::by_tag_name(tag_info.0), 387 | }) 388 | } 389 | 390 | 391 | // html comments may contain any text. 392 | html_comment_start = "" 394 | 395 | html_comment -> Element 396 | = posl:#position html_comment_start 397 | s:CharString<(!(html_comment_end) c:$. {c})>? 398 | html_comment_end posr:#position 399 | { 400 | Element::Comment(Comment { 401 | position: Span::new(posl, posr, source_lines), 402 | text: s.unwrap_or_default(), 403 | }) 404 | } 405 | 406 | // === primitive terminals === 407 | 408 | emph_lit = "''" 409 | strong_lit = "'''" 410 | nl = '\n' 411 | EOF = #quiet / #expected("EOF") 412 | 413 | 414 | // === text primitives === 415 | 416 | Text 417 | = posl:#position s:CharString posr:#position 418 | { 419 | Element::Text(Text { 420 | position: Span::new(posl, posr, source_lines), 421 | text: s 422 | }) 423 | } 424 | 425 | CharString 426 | = chars:C+ { chars.iter().map(|s| s.to_owned()).collect() } 427 | 428 | EnclosedLiteral 429 | = ClosingChar text:CharString ClosingChar { text } 430 | 431 | 432 | // === various text types === 433 | 434 | normal_text -> Element 435 | = #quiet> / #expected("normal text") 436 | math_text -> Element 437 | = #quiet> / #expected("LaTeX source code") 438 | template_arg_name -> String 439 | = #quiet> / #expected("template attribute name") 440 | 
nowiki_text -> Element 441 | = #quiet $.>> / #expected("any text") 442 | code_text -> Element 443 | = #quiet $. >> / #expected("any text") 444 | preformatted_text -> Element 445 | = #quiet $. >> / #expected ("any text") 446 | url -> String 447 | = #quiet> / #expected("a word of text (e.g. url)") 448 | tag_safe_literal -> String 449 | = #quiet> / #expected("tag attribute value") 450 | quoted_text -> String 451 | = #quiet / EnclosedLiteral<'\''>> / #expected("quoted text") 452 | tag_name -> String 453 | = #quiet> / #expected("tag / attribute name") 454 | 455 | _ -> &'input str = #quiet / #expected("whitespace") 456 | ws -> &'input str = #quiet / #expected("whitespace (including newlines)") 457 | whitespace_elem -> Element 458 | = Text 459 | 460 | // === character classes === 461 | // These characters are allowed within certain contexts, 462 | // excluded characters have special meaning and break texts 463 | 464 | math_char -> &'input str = !TagClose<"math"i> $. 465 | normal_char -> &'input str 466 | = !([\n\r \t{}\[\]] / emph_lit / 467 | any_open / any_close / any_tag / html_comment_start) $. 468 | 469 | heading_char -> &'input str 470 | = !('='+ _ (nl / EOF)) c:normal_char {c} 471 | table_char -> &'input str 472 | = !(cell_sep) c:normal_char {c} 473 | template_char -> &'input str 474 | = !'|' c:normal_char {c} 475 | template_arg_char -> &'input str 476 | = ![|<>=!*#:;/] c:normal_char {c} 477 | whitespace -> &'input str 478 | = $(' ') / $('\t') 479 | tag_char -> &'input str 480 | = $([^<>/ =]) 481 | url_char -> &'input str 482 | = $([^ \]]) 483 | 484 | // a paragraph may not start with these symbols as they indicate other elements 485 | PAR_START_GUARD = !([=!|;#:*] / EOF) 486 | 487 | // tags which should be parsed as block elements, rather than html tags. 
488 | HTML_BLOCK_ELEMENTS = ("gallery"i) 489 | 490 | // magic words which cannot be interpreted as templates 491 | MAGIC_WORDS = table_start / table_end / table_caption_sep / 492 | table_row_sep / table_pipe / cell_sep 493 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | mod ast; 2 | mod error; 3 | #[allow( 4 | clippy::unused_unit, 5 | clippy::unit_arg, 6 | clippy::cyclomatic_complexity, 7 | clippy::len_zero, 8 | clippy::single_match, 9 | clippy::naive_bytecount, 10 | clippy::suspicious_else_formatting 11 | )] 12 | mod grammar; 13 | #[cfg(test)] 14 | mod tests; 15 | mod traversion; 16 | mod util; 17 | 18 | // public exports 19 | pub use self::ast::*; 20 | pub use self::error::*; 21 | pub use self::traversion::Traversion; 22 | 23 | pub mod transformations; 24 | 25 | mod default_transformations; 26 | use self::default_transformations::*; 27 | 28 | /// Parse the input document to generate a document tree. 29 | /// After parsing, some transformations are applied to the result. 
30 | pub fn parse(input: &str) -> Result { 31 | let source_lines = util::get_source_lines(input); 32 | 33 | #[cfg(feature = "ptime")] 34 | let starttime = time::precise_time_ns(); 35 | 36 | let result = match grammar::document(input, &source_lines) { 37 | Err(e) => Err(error::MWError::ParseError(error::ParseError::from( 38 | &e, input, 39 | ))), 40 | Ok(r) => Ok(r), 41 | }?; 42 | 43 | #[cfg(feature = "ptime")] 44 | let parsedtime = time::precise_time_ns(); 45 | 46 | let settings = GeneralSettings {}; 47 | let trans_result = apply_transformations(result, &settings); 48 | 49 | #[cfg(feature = "ptime")] 50 | { 51 | eprintln!( 52 | "Parse Timer: Parsing took {} ms.", 53 | ((parsedtime - starttime) as f64) / 1.0e6 54 | ); 55 | eprintln!( 56 | "Parse Timer: Transformation took {} ms.", 57 | ((time::precise_time_ns() - parsedtime) as f64) / 1.0e6 58 | ); 59 | } 60 | 61 | trans_result.map_err(error::MWError::TransformationError) 62 | } 63 | 64 | fn apply_transformations( 65 | mut root: Element, 66 | settings: &GeneralSettings, 67 | ) -> transformations::TResult { 68 | root = validate_external_refs(root, settings)?; 69 | root = fold_headings_transformation(root, settings)?; 70 | root = fold_lists_transformation(root, settings)?; 71 | root = whitespace_paragraphs_to_empty(root, settings)?; 72 | root = collapse_paragraphs(root, settings)?; 73 | root = collapse_consecutive_text(root, settings)?; 74 | root = enumerate_anon_args(root, settings)?; 75 | Ok(root) 76 | } 77 | -------------------------------------------------------------------------------- /src/main.rs: -------------------------------------------------------------------------------- 1 | //! This program takes Media Wiki source code and produces a yaml syntax tree. 2 | //! 3 | //! It aims to provide fast offline processing with debug information 4 | //! (element position) included. The resulting tree represents the input 5 | //! document on a syntactic level. Please refer to the `mediawiki_parser` 6 | //! 
documentation for a description of possible elements of the abstract 7 | //! syntax tree. 8 | 9 | use mediawiki_parser; 10 | use serde_json; 11 | use serde_yaml; 12 | use std::fs; 13 | use std::io; 14 | use std::io::prelude::*; 15 | use std::io::BufReader; 16 | use std::path::PathBuf; 17 | use std::process; 18 | use structopt::StructOpt; 19 | 20 | #[derive(Debug, StructOpt)] 21 | /// This program takes MediaWiki source code and produces 22 | /// a yaml syntax tree on stdout. 23 | struct Args { 24 | /// Path to the input file. 25 | /// If none is provided, stdin is used. 26 | #[structopt(short = "i", long = "input", parse(from_os_str))] 27 | pub input_file: Option, 28 | 29 | /// Ouput the result as JSON 30 | #[structopt(short = "j", long = "json")] 31 | pub use_json: bool, 32 | } 33 | 34 | /// read contents of a `io::Reader` into a string 35 | fn read_from_reader(reader: &mut io::Read) -> String { 36 | let mut buffer = io::BufReader::new(reader); 37 | let mut content = String::new(); 38 | buffer 39 | .read_to_string(&mut content) 40 | .expect("Could not read fron file!"); 41 | content 42 | } 43 | 44 | /// Read a file from disk and store to string. 45 | fn read_file(filename: &PathBuf) -> String { 46 | let file = fs::File::open(filename).expect("Could not open file!"); 47 | let mut reader = BufReader::new(file); 48 | read_from_reader(&mut reader) 49 | } 50 | 51 | /// Read a file from stdin from to string. 
52 | fn read_stdin() -> String { 53 | read_from_reader(&mut io::stdin()) 54 | } 55 | 56 | fn main() { 57 | let args = Args::from_args(); 58 | let input = if let Some(path) = args.input_file { 59 | read_file(&path) 60 | } else { 61 | read_stdin() 62 | }; 63 | 64 | let result = mediawiki_parser::parse(&input); 65 | match result { 66 | Ok(r) => { 67 | if args.use_json { 68 | serde_json::to_writer(io::stdout(), &r).expect("could not serialize json!"); 69 | } else { 70 | serde_yaml::to_writer(io::stdout(), &r).expect("could not serialize yaml!"); 71 | }; 72 | println!(); 73 | } 74 | Err(e) => { 75 | eprintln!("{}", e); 76 | if args.use_json { 77 | serde_json::to_writer(io::stdout(), &e).expect("could not serialize json!"); 78 | } else { 79 | serde_yaml::to_writer(io::stdout(), &e).expect("could not serialize yaml!"); 80 | }; 81 | println!(); 82 | process::exit(1); 83 | } 84 | }; 85 | } 86 | -------------------------------------------------------------------------------- /src/tests/mod.rs: -------------------------------------------------------------------------------- 1 | #[cfg(test)] 2 | pub mod generated { 3 | include!(concat!(env!("OUT_DIR"), "/tests_generated.rs")); 4 | } 5 | -------------------------------------------------------------------------------- /src/transformations.rs: -------------------------------------------------------------------------------- 1 | //! Functions and types for source tree transformations. 2 | 3 | use crate::ast::*; 4 | use crate::error::TransformationError; 5 | 6 | /// Transformation result type 7 | pub type TResult = Result; 8 | 9 | /// Result type for a list of transformed elements. 
10 | pub type TListResult = Result, TransformationError>; 11 | 12 | /// Signature of an in-place transformation function 13 | pub type TFuncInplace = Fn(Element, S) -> TResult; 14 | 15 | /// Signature of a cloning transformation function 16 | pub type TFunc = Fn(&Element, &[&Element], S) -> TResult; 17 | 18 | /// Apply a given transformation function to a list of elements, without mutating the original. 19 | pub fn apply_func_clone( 20 | func: &TFunc, 21 | content: &[Element], 22 | path: &[&Element], 23 | settings: S, 24 | ) -> TListResult { 25 | let mut result = vec![]; 26 | for child in content { 27 | result.push(func(child, path, settings)?); 28 | } 29 | Ok(result) 30 | } 31 | 32 | /// Apply a given transformation to every item in a list, consuming this list. 33 | pub fn apply_func_drain( 34 | func: &TFuncInplace, 35 | content: &mut Vec, 36 | settings: S, 37 | ) -> TListResult { 38 | let mut result = vec![]; 39 | for child in content.drain(..) { 40 | result.push(func(child, settings)?); 41 | } 42 | Ok(result) 43 | } 44 | 45 | /// Recursively apply a transformation function `func` to all children of element `root`. 46 | pub fn recurse_inplace(func: &TFuncInplace, root: Element, settings: S) -> TResult { 47 | recurse_inplace_template(func, root, settings, &apply_func_drain) 48 | } 49 | 50 | /// Recursively apply a function `content_func` to the children list of a node. 
51 | pub fn recurse_inplace_template( 52 | func: &TFuncInplace, 53 | mut root: Element, 54 | settings: S, 55 | content_func: &Fn(&TFuncInplace, &mut Vec, S) -> TListResult, 56 | ) -> TResult { 57 | match root { 58 | Element::Document(ref mut e) => { 59 | let mut temp = content_func(func, &mut e.content, settings)?; 60 | e.content.append(&mut temp); 61 | } 62 | Element::Formatted(ref mut e) => { 63 | let mut temp = content_func(func, &mut e.content, settings)?; 64 | e.content.append(&mut temp); 65 | } 66 | Element::Paragraph(ref mut e) => { 67 | let mut temp = content_func(func, &mut e.content, settings)?; 68 | e.content.append(&mut temp); 69 | } 70 | Element::ListItem(ref mut e) => { 71 | let mut temp = content_func(func, &mut e.content, settings)?; 72 | e.content.append(&mut temp); 73 | } 74 | Element::List(ref mut e) => { 75 | let mut temp = content_func(func, &mut e.content, settings)?; 76 | e.content.append(&mut temp); 77 | } 78 | Element::TableCell(ref mut e) => { 79 | let mut temp = content_func(func, &mut e.content, settings)?; 80 | e.content.append(&mut temp); 81 | } 82 | Element::HtmlTag(ref mut e) => { 83 | let mut temp = content_func(func, &mut e.content, settings)?; 84 | e.content.append(&mut temp); 85 | } 86 | Element::Gallery(ref mut e) => { 87 | let mut temp = content_func(func, &mut e.content, settings)?; 88 | e.content.append(&mut temp); 89 | } 90 | Element::Heading(ref mut e) => { 91 | let mut content = content_func(func, &mut e.content, settings)?; 92 | let mut caption = content_func(func, &mut e.caption, settings)?; 93 | e.caption.append(&mut caption); 94 | e.content.append(&mut content); 95 | } 96 | Element::Template(ref mut e) => { 97 | let mut name = content_func(func, &mut e.name, settings)?; 98 | let mut content = content_func(func, &mut e.content, settings)?; 99 | e.name.append(&mut name); 100 | e.content.append(&mut content); 101 | } 102 | Element::TemplateArgument(ref mut e) => { 103 | let mut value = content_func(func, &mut e.value, 
settings)?; 104 | e.value.append(&mut value); 105 | } 106 | Element::InternalReference(ref mut e) => { 107 | let mut target = content_func(func, &mut e.target, settings)?; 108 | let mut caption = content_func(func, &mut e.caption, settings)?; 109 | 110 | let mut new_options = vec![]; 111 | for mut option in e.options.drain(..) { 112 | new_options.push(content_func(func, &mut option, settings)?); 113 | } 114 | 115 | e.target.append(&mut target); 116 | e.options.append(&mut new_options); 117 | e.caption.append(&mut caption); 118 | } 119 | Element::ExternalReference(ref mut e) => { 120 | let mut caption = content_func(func, &mut e.caption, settings)?; 121 | e.caption.append(&mut caption); 122 | } 123 | Element::Table(ref mut e) => { 124 | let mut caption = content_func(func, &mut e.caption, settings)?; 125 | let mut rows = content_func(func, &mut e.rows, settings)?; 126 | e.caption.append(&mut caption); 127 | e.rows.append(&mut rows); 128 | } 129 | Element::TableRow(ref mut e) => { 130 | let mut cells = content_func(func, &mut e.cells, settings)?; 131 | e.cells.append(&mut cells); 132 | } 133 | Element::Text(_) | Element::Comment(_) | Element::Error(_) => (), 134 | }; 135 | Ok(root) 136 | } 137 | 138 | /// Recursively apply a transformation function `func` to all children of element `root`, cloning the input. 139 | pub fn recurse_clone( 140 | func: &TFunc, 141 | root: &Element, 142 | path: &[&Element], 143 | settings: S, 144 | ) -> TResult { 145 | recurse_clone_template(func, root, path, settings, &apply_func_clone) 146 | } 147 | 148 | /// Recursively apply a function `content_func` to the children list of a node, cloning the input. 
149 | pub fn recurse_clone_template( 150 | func: &TFunc, 151 | root: &Element, 152 | path: &[&Element], 153 | settings: S, 154 | content_func: &Fn(&TFunc, &[Element], &[&Element], S) -> TListResult, 155 | ) -> TResult { 156 | let mut path = path.to_owned(); 157 | path.push(root); 158 | let new = match *root { 159 | Element::Document(ref e) => Element::Document(Document { 160 | position: e.position.clone(), 161 | content: content_func(func, &e.content, &path, settings)?, 162 | }), 163 | Element::Heading(ref e) => Element::Heading(Heading { 164 | position: e.position.clone(), 165 | depth: e.depth, 166 | caption: content_func(func, &e.caption, &path, settings)?, 167 | content: content_func(func, &e.content, &path, settings)?, 168 | }), 169 | Element::Formatted(ref e) => Element::Formatted(Formatted { 170 | position: e.position.clone(), 171 | markup: e.markup, 172 | content: content_func(func, &e.content, &path, settings)?, 173 | }), 174 | Element::Paragraph(ref e) => Element::Paragraph(Paragraph { 175 | position: e.position.clone(), 176 | content: content_func(func, &e.content, &path, settings)?, 177 | }), 178 | Element::Template(ref e) => Element::Template(Template { 179 | position: e.position.clone(), 180 | name: content_func(func, &e.name, &path, settings)?, 181 | content: content_func(func, &e.content, &path, settings)?, 182 | }), 183 | Element::TemplateArgument(ref e) => Element::TemplateArgument(TemplateArgument { 184 | position: e.position.clone(), 185 | name: e.name.clone(), 186 | value: content_func(func, &e.value, &path, settings)?, 187 | }), 188 | Element::InternalReference(ref e) => { 189 | let mut new_options = vec![]; 190 | for option in &e.options { 191 | new_options.push(content_func(func, &option, &path, settings)?); 192 | } 193 | 194 | Element::InternalReference(InternalReference { 195 | position: e.position.clone(), 196 | target: content_func(func, &e.target, &path, settings)?, 197 | options: new_options, 198 | caption: content_func(func, 
&e.caption, &path, settings)?, 199 | }) 200 | } 201 | Element::ExternalReference(ref e) => Element::ExternalReference(ExternalReference { 202 | position: e.position.clone(), 203 | target: e.target.clone(), 204 | caption: content_func(func, &e.caption, &path, settings)?, 205 | }), 206 | Element::ListItem(ref e) => Element::ListItem(ListItem { 207 | position: e.position.clone(), 208 | depth: e.depth, 209 | kind: e.kind, 210 | content: content_func(func, &e.content, &path, settings)?, 211 | }), 212 | Element::List(ref e) => Element::List(List { 213 | position: e.position.clone(), 214 | content: content_func(func, &e.content, &path, settings)?, 215 | }), 216 | Element::Table(ref e) => Element::Table(Table { 217 | position: e.position.clone(), 218 | attributes: e.attributes.clone(), 219 | caption: content_func(func, &e.caption, &path, settings)?, 220 | caption_attributes: e.caption_attributes.clone(), 221 | rows: content_func(func, &e.rows, &path, settings)?, 222 | }), 223 | Element::TableRow(ref e) => Element::TableRow(TableRow { 224 | position: e.position.clone(), 225 | attributes: e.attributes.clone(), 226 | cells: content_func(func, &e.cells, &path, settings)?, 227 | }), 228 | Element::TableCell(ref e) => Element::TableCell(TableCell { 229 | position: e.position.clone(), 230 | header: e.header, 231 | attributes: e.attributes.clone(), 232 | content: content_func(func, &e.content, &path, settings)?, 233 | }), 234 | Element::Comment(ref e) => Element::Comment(e.clone()), 235 | Element::Text(ref e) => Element::Text(e.clone()), 236 | Element::Error(ref e) => Element::Error(e.clone()), 237 | Element::HtmlTag(ref e) => Element::HtmlTag(HtmlTag { 238 | position: e.position.clone(), 239 | name: e.name.clone(), 240 | attributes: e.attributes.clone(), 241 | content: content_func(func, &e.content, &path, settings)?, 242 | }), 243 | Element::Gallery(ref e) => Element::Gallery(Gallery { 244 | position: e.position.clone(), 245 | attributes: e.attributes.clone(), 246 | content: 
content_func(func, &e.content, &path, settings)?, 247 | }), 248 | }; 249 | path.pop(); 250 | Ok(new) 251 | } 252 | -------------------------------------------------------------------------------- /src/traversion.rs: -------------------------------------------------------------------------------- 1 | //! Helper trait for operations reading from the document tree. 2 | 3 | use super::ast::Element; 4 | use std::io; 5 | 6 | /// Implements a traversion over a tree of `Element`. 7 | /// 8 | /// All fields of the traversion struct can be mutated, 9 | /// external settings cannot. 10 | pub trait Traversion<'a, S: Copy + ?Sized> { 11 | /// push to the traversion path. 12 | fn path_push(&mut self, elem: &'a Element); 13 | /// pop from the traversion path. 14 | fn path_pop(&mut self) -> Option<&'a Element>; 15 | /// get the traversion path. 16 | fn get_path(&self) -> &Vec<&'a Element>; 17 | /// template method for handling single nodes. 18 | /// if the result is `false`, handling is complete and 19 | /// children of this node are not considered, 20 | /// otherwise `work()` is recursively called for all children. 21 | fn work(&mut self, _root: &'a Element, _settings: S, _out: &mut io::Write) -> io::Result { 22 | Ok(true) 23 | } 24 | 25 | /// template method for handling a vector of nodes. 26 | /// if the result is `false`, handling is complete and 27 | /// children of the vector's elements are not considered, 28 | /// otherwise `work()` is recursively called for all children. 29 | fn work_vec( 30 | &mut self, 31 | _root: &'a [Element], 32 | _settings: S, 33 | _out: &mut io::Write, 34 | ) -> io::Result { 35 | Ok(true) 36 | } 37 | 38 | /// run this traversion for a vector of elements. 39 | fn run_vec( 40 | &mut self, 41 | content: &'a [Element], 42 | settings: S, 43 | out: &mut io::Write, 44 | ) -> io::Result<()> { 45 | if !self.work_vec(content, settings, out)? { 46 | return Ok(()); 47 | } 48 | for elem in &content[..] 
{ 49 | self.run(elem, settings, out)?; 50 | } 51 | Ok(()) 52 | } 53 | /// run this traversion for an element. 54 | fn run(&mut self, root: &'a Element, settings: S, out: &mut io::Write) -> io::Result<()> { 55 | self.path_push(root); 56 | 57 | // break if work function breaks recursion. 58 | if !self.work(root, settings, out)? { 59 | return Ok(()); 60 | } 61 | match *root { 62 | Element::Document(ref e) => self.run_vec(&e.content, settings, out)?, 63 | Element::Formatted(ref e) => self.run_vec(&e.content, settings, out)?, 64 | Element::Paragraph(ref e) => self.run_vec(&e.content, settings, out)?, 65 | Element::ListItem(ref e) => self.run_vec(&e.content, settings, out)?, 66 | Element::List(ref e) => self.run_vec(&e.content, settings, out)?, 67 | Element::TableCell(ref e) => self.run_vec(&e.content, settings, out)?, 68 | Element::HtmlTag(ref e) => self.run_vec(&e.content, settings, out)?, 69 | Element::Gallery(ref e) => self.run_vec(&e.content, settings, out)?, 70 | Element::Heading(ref e) => { 71 | self.run_vec(&e.caption, settings, out)?; 72 | self.run_vec(&e.content, settings, out)?; 73 | } 74 | Element::Template(ref e) => { 75 | self.run_vec(&e.name, settings, out)?; 76 | self.run_vec(&e.content, settings, out)?; 77 | } 78 | Element::TemplateArgument(ref e) => self.run_vec(&e.value, settings, out)?, 79 | Element::InternalReference(ref e) => { 80 | self.run_vec(&e.target, settings, out)?; 81 | for option in &e.options { 82 | self.run_vec(option, settings, out)?; 83 | } 84 | self.run_vec(&e.caption, settings, out)?; 85 | } 86 | Element::ExternalReference(ref e) => self.run_vec(&e.caption, settings, out)?, 87 | Element::Table(ref e) => { 88 | self.run_vec(&e.caption, settings, out)?; 89 | self.run_vec(&e.rows, settings, out)?; 90 | } 91 | Element::TableRow(ref e) => self.run_vec(&e.cells, settings, out)?, 92 | Element::Text(_) | Element::Comment(_) | Element::Error(_) => (), 93 | } 94 | self.path_pop(); 95 | Ok(()) 96 | } 97 | } 98 | 
-------------------------------------------------------------------------------- /src/util.rs: -------------------------------------------------------------------------------- 1 | //! Utility functions and types 2 | 3 | use crate::ast; 4 | 5 | /// The terminal width. 6 | const TERMINAL_WIDTH: usize = 80; 7 | 8 | pub fn combine(t: (Vec, Vec)) -> Vec { 9 | let (mut t1, mut t2) = t; 10 | t1.append(&mut t2); 11 | t1 12 | } 13 | 14 | /// Compiles a list of start and end positions of the input source lines. 15 | /// 16 | /// This representation is used to calculate line and column position from the input offset. 17 | pub fn get_source_lines(source: &str) -> Vec { 18 | let mut pos = 0; 19 | let mut result = Vec::new(); 20 | 21 | for line in source.split('\n') { 22 | result.push(ast::SourceLine { 23 | start: pos, 24 | content: line, 25 | end: pos + line.len() + 1, 26 | }); 27 | pos += line.len() + 1; 28 | } 29 | result 30 | } 31 | 32 | /// Tests if a string is entirely whitespace 33 | pub fn is_whitespace(input: &str) -> bool { 34 | input.chars().all(|c| c.is_whitespace()) 35 | } 36 | 37 | /// Shorten a string to fit into `TERMINAL_WIDTH`. 38 | pub fn shorten_str(input: &str) -> String { 39 | let input_len = input.chars().count(); 40 | 41 | if input.len() < TERMINAL_WIDTH { 42 | return String::from(input); 43 | } 44 | 45 | let filler = " .. 
"; 46 | let mut result = String::new(); 47 | let half_text_size = (TERMINAL_WIDTH - filler.chars().count()) / 2; 48 | 49 | for (char_count, c) in input.chars().enumerate() { 50 | if char_count < half_text_size { 51 | result.push(c); 52 | } 53 | if char_count == half_text_size { 54 | result.push_str(filler); 55 | } 56 | if char_count >= input_len - half_text_size { 57 | result.push(c); 58 | } 59 | } 60 | result 61 | } 62 | 63 | #[cfg(test)] 64 | mod tests { 65 | use super::*; 66 | 67 | #[test] 68 | fn test_is_whitespace() { 69 | for arg in &["", " ", "\t", "\n", "\t\t\t", "\n\t "] { 70 | assert!(is_whitespace(arg), "is_whitespace({:?})", arg); 71 | } 72 | 73 | for arg in &["a", " a", "\t\\", " \nä\t\t\t "] { 74 | assert!(!is_whitespace(arg), "!is_whitespace({:?})", arg); 75 | } 76 | } 77 | } 78 | --------------------------------------------------------------------------------