├── .gitignore ├── .travis.yml ├── LICENSE ├── Makefile ├── README.MD ├── benchmark ├── benchmark.erl └── benchmark_stream.erl ├── c_src ├── .gitattributes ├── .gitignore ├── Makefile ├── allocators.cc ├── allocators.h ├── bytebuffer.cc ├── bytebuffer.h ├── element_encoder.cc ├── element_encoder.h ├── erlxml.cc ├── erlxml.h ├── erlxml_nif.cc ├── erlxml_nif.h ├── macros.h ├── nif.mk ├── nif_utils.cc ├── nif_utils.h ├── pugixml │ ├── .gitignore │ ├── pugiconfig.hpp │ ├── pugixml.cpp │ └── pugixml.hpp ├── utf8_cleanup.cc ├── utf8_cleanup.h ├── xmlstreamparser.cc └── xmlstreamparser.h ├── include └── erlxml.hrl ├── rebar.config ├── rebar.lock ├── src ├── erlxml.app.src ├── erlxml.erl ├── erlxml_nif.erl └── erlxml_utils.erl └── test ├── .gitignore ├── data ├── invalid_token_EF_B7_90.txt ├── invalid_token_EF_B7_9F.txt ├── invalid_token_EF_B7_A4.txt ├── invalid_token_EF_B7_AF.txt ├── invalid_token_EF_BF_BE.txt ├── invalid_token_EF_BF_BF.txt ├── stream.txt ├── succeeded_C3_AF__C2_BF__C2_B0.txt ├── succeeded_C6_87.txt ├── succeeded_EF_B7_89.txt ├── succeeded_EF_B7_B0.txt ├── succeeded_EF_B8_80.txt ├── succeeded_EF_BF_AE.txt └── succeeded_F0_90_8C_88.txt └── integrity_test.erl /.gitignore: -------------------------------------------------------------------------------- 1 | /.idea/ 2 | /.rebar/ 3 | /deps/ 4 | /ebin/ 5 | /erlxml_nif.xcodeproj/ 6 | /priv/ 7 | *.iml 8 | *.DS_Store 9 | /log/ 10 | /DerivedData/ 11 | /_build/ 12 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | 2 | language: erlang 3 | 4 | matrix: 5 | 6 | include: 7 | - os: linux 8 | dist: bionic 9 | otp_release: 25.3.2.6 10 | 11 | - os: linux 12 | dist: focal 13 | otp_release: 27.0 14 | 15 | - os: linux 16 | dist: jammy 17 | otp_release: 26.1.1 18 | 19 | - os: osx 20 | osx_image: xcode13.4 21 | language: generic 22 | env: 23 | - HOMEBREW_NO_INSTALLED_DEPENDENTS_CHECK=true 24 | - 
HOMEBREW_NO_INSTALL_UPGRADE=true 25 | - HOMEBREW_NO_INSTALL_CLEANUP=true 26 | - HOMEBREW_NO_AUTO_UPDATE=true 27 | cache: 28 | directories: 29 | - $HOME/Library/Caches/Homebrew 30 | - /usr/local/Homebrew 31 | 32 | before_script: 33 | 34 | - if [[ $TRAVIS_OS_NAME == osx ]]; then brew install --force-bottle erlang || true; fi 35 | 36 | - curl https://s3.amazonaws.com/rebar3/rebar3 --output rebar3 && chmod +x rebar3 37 | 38 | script: 39 | - ./rebar3 compile 40 | 41 | after_success: 42 | - ./rebar3 eunit 43 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Silviu Caragea 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | REBAR ?= rebar3 2 | 3 | compile: 4 | @make V=0 -C c_src -j 8 5 | 6 | clean: 7 | @make -C c_src clean 8 | 9 | bench_encoding: 10 | $(REBAR) as bench compile 11 | erl -pa _build/bench/lib/*/ebin -noshell \ 12 | -eval "lists:foreach( \ 13 | fun(Module) -> \ 14 | lists:foreach(fun(N) -> benchmark:bench_encoding(Module, 600000, N) end, [1, 5, 10]) \ 15 | end, [erlxml, exml, fast_xml])" \ 16 | -eval "init:stop()." 17 | 18 | bench_parsing: 19 | $(REBAR) as bench compile 20 | erl -pa _build/bench/lib/*/ebin -noshell \ 21 | -eval "lists:foreach( \ 22 | fun(Module) -> \ 23 | lists:foreach(fun(N) -> benchmark:bench_parsing(Module, 600000, N) end, [1, 5, 10]) \ 24 | end, [erlxml, exml, fast_xml])" \ 25 | -eval "init:stop()." 26 | 27 | bench_streaming: 28 | $(REBAR) as bench compile 29 | erl -pa _build/bench/lib/*/ebin -noshell \ 30 | -eval "lists:foreach( \ 31 | fun(Module) -> \ 32 | lists:foreach(fun(N) -> benchmark_stream:bench(Module, \"test/data/stream.txt\", 60000, N) end, [1, 5, 10]) \ 33 | end, [erlxml, exml, fast_xml])" \ 34 | -eval "init:stop()." 35 | -------------------------------------------------------------------------------- /README.MD: -------------------------------------------------------------------------------- 1 | # erlxml 2 | 3 | *erlxml - Erlang XML parsing library based on pugixml* 4 | 5 | [![Build Status](https://app.travis-ci.com/silviucpp/erlxml.svg?branch=master)](https://travis-ci.com/github/silviucpp/erlxml) 6 | [![GitHub](https://img.shields.io/github/license/silviucpp/erlxml)](https://github.com/silviucpp/erlxml/blob/master/LICENSE) 7 | [![Hex.pm](https://img.shields.io/hexpm/v/erlxml2)](https://hex.pm/packages/erlxml2) 8 | 9 | # Implementation notes 10 | 11 | [pugixml][1] is the fastest dom parser available in c++ based on the benchmarks available [here][2]. 
The streaming parser works by dividing the 12 | stream into independent stanzas, which are then processed using pugixml. While the splitting algorithm is quite fast, it is designed for simplicity, 13 | which currently imposes some limitations on the streaming mode: 14 | 15 | - Does not support `CDATA` 16 | - Does not support comments containing special XML characters 17 | - Does not support `DOCTYPE` declarations 18 | 19 | All of the above limitations apply only to streaming mode and not to DOM parsing mode. 20 | 21 | ### Getting started: 22 | 23 | ##### DOM parsing 24 | 25 | ```erlang 26 | erlxml:parse(<<"<foo attr1='bar'>Some Value</foo>">>). 27 | ``` 28 | 29 | Which results in 30 | 31 | ```erlang 32 | {ok,{xmlel,<<"foo">>, 33 | [{<<"attr1">>,<<"bar">>}], 34 | [{xmlcdata,<<"Some Value">>}]}} 35 | ``` 36 | 37 | ##### Generate an XML document from Erlang terms 38 | 39 | ```erlang 40 | Xml = {xmlel,<<"foo">>, 41 | [{<<"attr1">>,<<"bar">>}], % Attributes 42 | [{xmlcdata,<<"Some Value">>}] % Elements 43 | }, 44 | erlxml:to_binary(Xml). 45 | ``` 46 | 47 | Which results in 48 | 49 | ```erlang 50 | <<"<foo attr1=\"bar\">Some Value</foo>">> 51 | ``` 52 | 53 | ##### Streaming parsing 54 | 55 | ```erlang 56 | Chunk1 = <<"<stream><foo attr1=\"bar">>, 57 | Chunk2 = <<"\">Some Value</foo></stream>">>, 58 | {ok, Parser} = erlxml:new_stream(), 59 | {ok,[{xmlstreamstart,<<"stream">>,[]}]} = erlxml:parse_stream(Parser, Chunk1), 60 | Rs = erlxml:parse_stream(Parser, Chunk2), 61 | {ok,[{xmlel,<<"foo">>, 62 | [{<<"attr1">>,<<"bar">>}], 63 | [{xmlcdata,<<"Some Value">>}]}, 64 | {xmlstreamend,<<"stream">>}]} = Rs. 65 | ``` 66 | 67 | ### Options 68 | 69 | When you create a stream using `new_stream/1` you can specify the following options: 70 | 71 | - `stanza_limit` - Specifies the maximum size a stanza can have. If the library parses more than this number of bytes 72 | without finding a complete stanza, it returns the error `{error, {max_stanza_limit_hit, binary()}}`. Example: `{stanza_limit, 65000}`. The default is 0, which means unlimited. 
73 | 74 | - `strip_non_utf8` - Strips all invalid UTF-8 characters from attribute values and node values. These values are considered 75 | user input and might contain malformed characters. Default is `false`. 76 | 77 | ### Benchmarks 78 | 79 | The benchmark code is inside the `benchmark` folder. Performance is compared against: 80 | 81 | - [exml][3] version used: 3.4.1 82 | - [fast_xml][4] version used: 1.1.55 83 | 84 | All tests run at three different concurrency levels (the number of Erlang processes spawned): 85 | 86 | - C1 (concurrency level 1) 87 | - C5 (concurrency level 5) 88 | - C10 (concurrency level 10) 89 | 90 | ##### DOM parsing 91 | 92 | Parse the same stanza defined in `benchmark/benchmark.erl` 600000 times: 93 | 94 | ```sh 95 | make bench_parsing 96 | ``` 97 | 98 | | Library | C1 (ms) | C5 (ms) | C10 (ms) | 99 | |:----------:|:------------:|:---------:|:---------:| 100 | | erlxml | 1875.128 | 417.368 | 315.65 | 101 | | exml | 2417.334 | 578.226 | 407.516 | 102 | | fast_xml | 24159.517 | 5854.817 | 4007.837 | 103 | 104 | Note: 105 | 106 | - Starting with version 3.0.0, [exml][3] saw significant improvements by replacing Expat with RapidXML. 107 | - `erlxml` delivers the best performance, followed by `exml`, while `fast_xml` performs the worst (by a large margin). 108 | 109 | ##### Generate an XML document from Erlang terms 110 | 111 | Encode the same Erlang term defined in `benchmark/benchmark.erl` 600000 times: 112 | 113 | ```sh 114 | make bench_encoding 115 | ``` 116 | 117 | | Library | C1 (ms) | C5 (ms) | C10 (ms) | 118 | |:-----------:|:--------:|:-------:|:--------:| 119 | | `erlxml` | 1381.338 | 322.851 | 251.936 | 120 | | `exml` | 1333.54 | 301.625 | 234.295 | 121 | | `fast_xml` | 1019.238 | 238.676 | 198.69 | 122 | 123 | Note: 124 | 125 | - `fast_xml` delivers the best performance, followed by `exml` and `erlxml`, which perform almost the same. 
126 | - `erlxml` improved encoding performance in version `2.1.0` by removing unnecessary memory copies and string length computations. 127 | 128 | ##### Streaming parsing 129 | 130 | The test is located in `benchmark/benchmark_stream.erl`. It loads all stanzas from `test/data/stream.txt` and runs the streaming parser over them 60000 times: 131 | 132 | ```sh 133 | make bench_streaming 134 | ``` 135 | 136 | ```sh 137 | ### engine: erlxml concurrency: 1 -> 2337.112 ms 193.81 MB/sec total bytes processed: 452.96 MB 138 | ### engine: erlxml concurrency: 5 -> 598.737 ms 756.52 MB/sec total bytes processed: 452.96 MB 139 | ### engine: erlxml concurrency: 10 -> 407.379 ms 1.09 GB/sec total bytes processed: 452.96 MB 140 | ### engine: exml concurrency: 1 -> 11790.975 ms 38.42 MB/sec total bytes processed: 452.96 MB 141 | ### engine: exml concurrency: 5 -> 2552.339 ms 177.47 MB/sec total bytes processed: 452.96 MB 142 | ### engine: exml concurrency: 10 -> 1840.267 ms 246.14 MB/sec total bytes processed: 452.96 MB 143 | ### engine: fast_xml concurrency: 1 -> 22677.758 ms 19.97 MB/sec total bytes processed: 452.96 MB 144 | ### engine: fast_xml concurrency: 5 -> 5184.096 ms 87.37 MB/sec total bytes processed: 452.96 MB 145 | ### engine: fast_xml concurrency: 10 -> 3854.402 ms 117.52 MB/sec total bytes processed: 452.96 MB 146 | ``` 147 | 148 | | Library | C1 (MB/s) | C5 (MB/s) | C10 (MB/s) | 149 | |:-----------:|:--------------:|:---------:|:----------:| 150 | | erlxml | 193.81 | 756.52 | 1090 | 151 | | exml | 38.42 | 177.47 | 246 | 152 | | fast_xml | 19.97 | 87.37 | 117 | 153 | 154 | Notes: 155 | 156 | - `erlxml` is the clear winner. 
157 | 158 | [1]:http://pugixml.org 159 | [2]:http://pugixml.org/benchmark.html 160 | [3]:https://github.com/esl/exml 161 | [4]:https://github.com/processone/fast_xml 162 |
-------------------------------------------------------------------------------- /benchmark/benchmark.erl: --------------------------------------------------------------------------------
-module(benchmark).

-export([
    bench_encoding/3,
    bench_parsing/3
]).

%% Term handed to each engine's encoder by bench_encoding/3.
-define(ELEMENT,
    {xmlel, <<"iq">>, [{<<"type">>, <<"get">>}], [
        {xmlel, <<"query">>, [{<<"xmlns">>, <<"jabber:iq:bulk">>}], [
            {xmlel, <<"r">>, [{<<"ver">>, <<"1489702723756">>}, {<<"client">>, <<"0.41.16">>}], []},
            {xmlel, <<"b">>, [{<<"ver">>, <<"1470998323471">>}], []},
            {xmlel, <<"pf">>, [{<<"type">>, <<"roster">>}, {<<"ver">>, <<"1473925451360">>}], []},
            {xmlel, <<"pf">>, [{<<"type">>, <<"addressbook">>}, {<<"ver">>, <<"1410177959174">>}], []}
        ]},
        {xmlel, <<"message">>,
            [
                {<<"from">>, <<"user@wdomain/resource-TMnXEhgkGN">>},
                {<<"to">>, <<"user2@domain/mac70c36269">>},
                {<<"ts">>, <<"1490255206161">>},
                {<<"id">>, <<"YgP1z-18834681">>},
                {<<"type">>, <<"headline">>}
            ],
            [
                {xmlel, <<"backendmessage">>,
                    [{<<"xmlns">>, <<"notification">>}, {<<"push">>, <<"0">>}], [
                        {xmlel, <<"resreceived">>, [], [
                            {xmlel, <<"accountId">>, [], [{xmlcdata, <<"23423534534">>}]},
                            {xmlel, <<"amount">>, [], [{xmlcdata, <<"0.200000000000000">>}]},
                            {xmlel, <<"type">>, [], [{xmlcdata, <<"AEW-12">>}]},
                            {xmlel, <<"description">>, [], [{xmlcdata, <<"884340">>}]}
                        ]}
                    ]}
            ]}
    ]}
).

%% XML document handed to each engine's parser by bench_parsing/3.
%% NOTE(review): the markup of this literal was stripped in the copy under
%% review (only the four text values survived). It is reconstructed here as
%% the serialisation of ?ELEMENT, one tag per line -- confirm against the
%% upstream file before publishing new benchmark numbers.
-define(STANZA, <<"<iq type='get'>
    <query xmlns='jabber:iq:bulk'>
        <r ver='1489702723756' client='0.41.16'/>
        <b ver='1470998323471'/>
        <pf type='roster' ver='1473925451360'/>
        <pf type='addressbook' ver='1410177959174'/>
    </query>
    <message from='user@wdomain/resource-TMnXEhgkGN' to='user2@domain/mac70c36269' ts='1490255206161' id='YgP1z-18834681' type='headline'>
        <backendmessage xmlns='notification' push='0'>
            <resreceived>
                <accountId>
                    23423534534
                </accountId>
                <amount>
                    0.200000000000000
                </amount>
                <type>
                    AEW-12
                </type>
                <description>
                    884340
                </description>
            </resreceived>
        </backendmessage>
    </message>
</iq>">>).

%% Encodes ?ELEMENT with Engine (erlxml | exml | fast_xml) roughly Number
%% times, split over Concurrency linked processes, and prints the timing.
%% One encode is performed and checked up front so that a misconfigured
%% engine crashes immediately instead of producing meaningless numbers.
bench_encoding(Engine, Number, Concurrency) ->
    ok = init(Engine),
    ok = to_binary_ok(Engine, to_binary(Engine)),
    bench(Engine, fun() -> to_binary(Engine) end, Number, Concurrency).

%% Parses ?STANZA with Engine roughly Number times, split over Concurrency
%% linked processes, and prints the timing. One parse is performed and
%% checked up front (see bench_encoding/3).
bench_parsing(Engine, Number, Concurrency) ->
    ok = init(Engine),
    ok = parse_ok(Engine, parse(Engine, ?STANZA)),
    bench(Engine, fun() -> parse(Engine, ?STANZA) end, Number, Concurrency).

% internals

%% Runs Fun `Number div Concurrency' times in each of Concurrency processes,
%% waits for all of them and reports the elapsed time.
bench(Engine, Fun, Number, Concurrency) ->
    Self = self(),
    LoopNumbers = Number div Concurrency,
    Worker = fun() ->
        loop(LoopNumbers, Fun),
        Self ! {self(), done}
    end,

    %% Monotonic time is immune to wall-clock adjustments during the run.
    Start = erlang:monotonic_time(microsecond),
    Pids = [spawn_link(Worker) || _ <- lists:seq(1, Concurrency)],
    [receive {Pid, done} -> ok end || Pid <- Pids],
    Elapsed = erlang:monotonic_time(microsecond) - Start,

    %% Report the iterations that actually ran: the integer division above
    %% drops the remainder when Number is not a multiple of Concurrency.
    print(Engine, Concurrency, LoopNumbers * Concurrency, Elapsed).

%% Prints elapsed milliseconds and throughput for one benchmark run.
print(Engine, Concurrency, Num, ElapsedMicros) ->
    %% Clamp to 1 microsecond so an (unrealistically) instant run cannot
    %% divide by zero.
    Milliseconds = max(ElapsedMicros, 1) / 1000,
    Secs = Milliseconds / 1000,
    StanzaPerSec = Num / Secs,
    io:format("### engine: ~p concurrency ~p -> ~p ms ~.2f stanza/sec ~n", [Engine, Concurrency, Milliseconds, StanzaPerSec]).

%% Calls Fun Nr times, discarding its results.
loop(0, _Fun) ->
    ok;
loop(Nr, Fun) ->
    Fun(),
    loop(Nr - 1, Fun).

%% Engine-specific start-up; only fast_xml needs its application running.
init(fast_xml) ->
    {ok, _Started} = application:ensure_all_started(fast_xml),
    ok;
init(_) ->
    ok.

to_binary(erlxml) ->
    erlxml:to_binary(?ELEMENT);
to_binary(exml) ->
    exml:to_binary(?ELEMENT);
to_binary(fast_xml) ->
    fxml:element_to_binary(?ELEMENT).

parse(erlxml, Data) ->
    erlxml:parse(Data);
parse(exml, Data) ->
    exml:parse(Data);
parse(fast_xml, Data) ->
    fxml_stream:parse_element(Data).

%% Crashes unless the encoder produced a binary.
to_binary_ok(_Engine, Value) when is_binary(Value) ->
    ok.

%% Crashes unless the parser reported success. fxml_stream:parse_element/1
%% returns the bare element (an `xmlel' tuple) on success, while the other
%% engines wrap their result in `{ok, _}'.
parse_ok(fast_xml, {xmlel, _Name, _Attrs, _Children}) ->
    ok;
parse_ok(Engine, {ok, _}) when Engine =/= fast_xml ->
    ok.

-------------------------------------------------------------------------------- /benchmark/benchmark_stream.erl: --------------------------------------------------------------------------------
-module(benchmark_stream).

-define(CHUNK_SIZE, 1024).
-export([
    bench/4
]).

%% Streams the contents of File through Module's streaming parser
%% (erlxml | exml | fast_xml | dummy). Each of the Concurrency linked
%% processes opens one stream, feeds the file `Number div Concurrency'
%% times in ?CHUNK_SIZE-byte chunks, closes the stream and reports back.
%% Prints elapsed time and throughput.
bench(Module, File, Number, Concurrency) ->
    {Chunks, BinarySize} = readlines(File, ?CHUNK_SIZE),

    Self = self(),
    LoopNumbers = Number div Concurrency,

    ProcFun = fun() ->
        {ok, Parser0} = new_parser(Module),
        %% NOTE(review): the stream open/close literals were stripped to
        %% <<"">> in the copy under review; "stream" is the root element
        %% name used in the README example -- confirm against upstream.
        Parser1 = run_parser([<<"<stream>">>], Module, Parser0),
        Parser2 = loop(LoopNumbers, Chunks, Module, Parser1),
        Parser3 = run_parser([<<"</stream>">>], Module, Parser2),
        close(Module, Parser3),
        Self ! {self(), done}
    end,

    %% Monotonic time is immune to wall-clock adjustments during the run.
    Start = erlang:monotonic_time(microsecond),
    Pids = [spawn_link(ProcFun) || _ <- lists:seq(1, Concurrency)],
    [receive {Pid, done} -> ok end || Pid <- Pids],
    Elapsed = erlang:monotonic_time(microsecond) - Start,

    %% Count the bytes that were really parsed: the integer division above
    %% drops the remainder when Number is not a multiple of Concurrency.
    print(Module, Concurrency, BinarySize * LoopNumbers * Concurrency, Elapsed).

%% Feeds the whole chunk list through the parser Nr times.
loop(0, _Chunks, _Module, Parser) ->
    Parser;
loop(Nr, Chunks, Module, Parser) ->
    NewParser = run_parser(Chunks, Module, Parser),
    loop(Nr - 1, Chunks, Module, NewParser).

%% Feeds every chunk to the parser, threading through the new parser state
%% for engines that return one (`{ok, NewParser, _}').
run_parser([H | T], Module, Parser) ->
    case stream_parse(Module, Parser, H) of
        {ok, _} ->
            run_parser(T, Module, Parser);
        {ok, NewParser, _} ->
            run_parser(T, Module, NewParser)
    end;
run_parser([], _Module, Parser) ->
    Parser.

new_parser(erlxml) ->
    erlxml:new_stream([{stanza_limit, 65000}]);
new_parser(exml) ->
    exml_stream:new_parser();
new_parser(fast_xml) ->
    %% fxml_stream delivers parsed elements as messages, so a consumer
    %% process drains them and signals when the stream end arrives.
    Parent = self(),
    ConsumerPid = spawn_link(fun() -> ok = fxml_receive_till_end(), Parent ! {fxml_completed, self()} end),
    {ok, fxml_stream:new(ConsumerPid)};
new_parser(dummy) ->
    {ok, null}.

stream_parse(erlxml, Parser, Data) ->
    erlxml:parse_stream(Parser, Data);
stream_parse(exml, Parser, Data) ->
    exml_stream:parse(Parser, Data);
stream_parse(fast_xml, Parser, Data) ->
    {ok, fxml_stream:parse(Parser, Data)};
stream_parse(dummy, _Parser, _Data) ->
    {ok, []}.

close(fast_xml, Parser) ->
    fxml_stream:close(Parser),
    %% Wait until the consumer has seen the stream end.
    receive
        {fxml_completed, _ConsumerPid} ->
            ok
    end;
close(exml, Parser) ->
    exml_stream:free_parser(Parser);
close(_Module, _Parser) ->
    ok.

%% Returns `{Chunks, Size}': the file contents split into binaries of at
%% most LengthChunks bytes, and the total size in bytes. The file is read
%% in one call, so an empty file yields `{[<<>>], 0}' and no file handle is
%% left open if reading fails.
readlines(FileName, LengthChunks) ->
    {ok, Binary} = file:read_file(FileName),
    {build_chunks(Binary, LengthChunks, []), byte_size(Binary)}.

%% Splits Binary into Length-byte pieces; the last piece holds the
%% remainder (between 1 and Length bytes, or empty for empty input).
build_chunks(Binary, Length, Acc) when byte_size(Binary) > Length ->
    <<Chunk:Length/binary, Rest/binary>> = Binary,
    build_chunks(Rest, Length, [Chunk | Acc]);
build_chunks(Binary, _Length, Acc) ->
    lists:reverse([Binary | Acc]).

%% Prints elapsed milliseconds, throughput and total volume for one run.
print(Module, Concurrency, Bytes, ElapsedMicros) ->
    %% Clamp to 1 microsecond so an (unrealistically) instant run cannot
    %% divide by zero.
    Milliseconds = max(ElapsedMicros, 1) / 1000,
    Secs = Milliseconds / 1000,
    BytesPerSec = Bytes / Secs,
    io:format("### engine: ~p concurrency: ~p -> ~p ms ~s/sec total bytes processed: ~s ~n", [Module, Concurrency, Milliseconds, format_size(BytesPerSec), format_size(Bytes)]).

%% Renders a byte count with the largest unit that keeps the value >= 1,
%% stepping by 1024 and capping at PB.
format_size(Size) ->
    format_size(Size, ["B", "KB", "MB", "GB", "TB", "PB"]).

format_size(S, [_ | [_ | _] = L]) when S >= 1024 ->
    format_size(S / 1024, L);
format_size(S, [M | _]) ->
    io_lib:format("~.2f ~s", [float(S), M]).

%% Drains `$gen_event' messages until the stream end event is received.
fxml_receive_till_end() ->
    receive
        {'$gen_event', {xmlstreamend, _}} ->
            ok;
        {'$gen_event', _Other} ->
            fxml_receive_till_end()
    end.
135 | -------------------------------------------------------------------------------- /c_src/.gitattributes: -------------------------------------------------------------------------------- 1 | *.cpp linguist-language=Erlang 2 | *.hpp linguist-language=Erlang 3 | -------------------------------------------------------------------------------- /c_src/.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | /env.mk 3 | -------------------------------------------------------------------------------- /c_src/Makefile: -------------------------------------------------------------------------------- 1 | 2 | PROJECT_NIF_NAME=erlxml_nif 3 | 4 | include nif.mk 5 | 6 | ifeq ($(UNAME_SYS), darwin) 7 | LDFLAGS += -Wl,-U,_enif_alloc \ 8 | -Wl,-U,_enif_free \ 9 | -Wl,-U,_enif_alloc_resource \ 10 | -Wl,-U,_enif_open_resource_type \ 11 | -Wl,-U,_enif_release_resource \ 12 | -Wl,-U,_enif_priv_data \ 13 | -Wl,-U,_enif_self \ 14 | -Wl,-U,_enif_consume_timeslice \ 15 | -Wl,-U,_enif_inspect_binary \ 16 | -Wl,-U,_enif_inspect_iolist_as_binary \ 17 | -Wl,-U,_enif_is_binary \ 18 | -Wl,-U,_enif_is_identical \ 19 | -Wl,-U,_enif_is_list \ 20 | -Wl,-U,_enif_get_int \ 21 | -Wl,-U,_enif_get_list_cell \ 22 | -Wl,-U,_enif_get_resource \ 23 | -Wl,-U,_enif_get_tuple \ 24 | -Wl,-U,_enif_make_atom \ 25 | -Wl,-U,_enif_make_badarg \ 26 | -Wl,-U,_enif_make_existing_atom \ 27 | -Wl,-U,_enif_make_double \ 28 | -Wl,-U,_enif_make_new_binary \ 29 | -Wl,-U,_enif_make_resource \ 30 | -Wl,-U,_enif_make_string_len \ 31 | -Wl,-U,_enif_make_tuple \ 32 | -Wl,-U,_enif_make_list \ 33 | -Wl,-U,_enif_make_ulong \ 34 | -Wl,-U,_enif_get_ulong \ 35 | -Wl,-U,_enif_make_list_cell \ 36 | -Wl,-U,_enif_make_reverse_list 37 | endif 38 | 39 | CXXFLAGS += -DNDEBUG -I pugixml \ 40 | -g -Wextra -Werror -Wno-ignored-qualifiers -Wno-unused-const-variable -Wno-missing-field-initializers -fno-exceptions -fno-rtti -std=c++17 41 | 42 | LDFLAGS += -lstdc++ 43 | 
-------------------------------------------------------------------------------- /c_src/allocators.cc: -------------------------------------------------------------------------------- 1 | #include "allocators.h" 2 | #include "erl_nif.h" 3 | 4 | void* erlxml_allocate(size_t size) 5 | { 6 | return enif_alloc(size); 7 | } 8 | 9 | void erlxml_deallocate(void* ptr) 10 | { 11 | enif_free(ptr); 12 | } 13 | 14 | -------------------------------------------------------------------------------- /c_src/allocators.h: -------------------------------------------------------------------------------- 1 | #ifndef ERLXML_C_SRC_ALLOCATORS_H_ 2 | #define ERLXML_C_SRC_ALLOCATORS_H_ 3 | 4 | #include 5 | 6 | void* erlxml_allocate(size_t size); 7 | void erlxml_deallocate(void* ptr); 8 | 9 | #endif 10 | -------------------------------------------------------------------------------- /c_src/bytebuffer.cc: -------------------------------------------------------------------------------- 1 | #include "bytebuffer.h" 2 | #include "allocators.h" 3 | 4 | #include 5 | #include 6 | 7 | ByteBuffer::ByteBuffer() 8 | { 9 | Construct(NULL, 1024); 10 | } 11 | 12 | ByteBuffer::ByteBuffer(size_t size) 13 | { 14 | Construct(NULL, size); 15 | } 16 | 17 | ByteBuffer::ByteBuffer(const uint8_t* bytes, size_t len) 18 | { 19 | Construct(bytes, len); 20 | } 21 | 22 | void ByteBuffer::Construct(const uint8_t* bytes, size_t len) 23 | { 24 | start_ = 0; 25 | size_ = len; 26 | bytes_ = static_cast(erlxml_allocate(size_)); 27 | 28 | if (bytes) 29 | { 30 | end_ = len; 31 | memcpy(bytes_, bytes, end_); 32 | } 33 | else 34 | { 35 | end_ = 0; 36 | } 37 | } 38 | 39 | ByteBuffer::~ByteBuffer() 40 | { 41 | erlxml_deallocate(bytes_); 42 | } 43 | 44 | bool ByteBuffer::ReadBytes(uint8_t* val, size_t len) 45 | { 46 | if (len > Length()) 47 | return false; 48 | 49 | memcpy(val, bytes_ + start_, len); 50 | start_ += len; 51 | return true; 52 | } 53 | 54 | void ByteBuffer::WriteBytes(const uint8_t* val, size_t len) 55 | { 56 | 
memcpy(ReserveWriteBuffer(len), val, len); 57 | } 58 | 59 | uint8_t* ByteBuffer::ReserveWriteBuffer(size_t len) 60 | { 61 | if (Length() + len > Capacity()) 62 | { 63 | if(!LeftShift()) 64 | Resize(Length() + len); 65 | else if (Length() + len > Capacity()) 66 | Resize(Length() + len); 67 | } 68 | 69 | uint8_t* start = bytes_ + end_; 70 | end_ += len; 71 | return start; 72 | } 73 | 74 | void ByteBuffer::Resize(size_t size) 75 | { 76 | if(size == size_) 77 | return; 78 | 79 | if (size > size_) 80 | size = std::max(size, 3 * size_ / 2); 81 | 82 | size_t len = std::min(end_ - start_, size); 83 | uint8_t* new_bytes = static_cast(erlxml_allocate(size)); 84 | memcpy(new_bytes, bytes_ + start_, len); 85 | erlxml_deallocate(bytes_); 86 | start_ = 0; 87 | end_ = len; 88 | size_ = size; 89 | bytes_ = new_bytes; 90 | } 91 | 92 | bool ByteBuffer::Consume(size_t size) 93 | { 94 | if (size > Length()) 95 | return false; 96 | 97 | start_ += size; 98 | return true; 99 | } 100 | 101 | bool ByteBuffer::LeftShift() 102 | { 103 | if(start_ == 0) 104 | return false; 105 | 106 | size_t length = end_ - start_; 107 | 108 | memmove(bytes_, bytes_ + start_, length); 109 | start_ = 0; 110 | end_ = length; 111 | return true; 112 | } 113 | 114 | void ByteBuffer::Clear() 115 | { 116 | start_ = end_ = 0; 117 | } 118 | -------------------------------------------------------------------------------- /c_src/bytebuffer.h: -------------------------------------------------------------------------------- 1 | #ifndef ERLTLS_C_SRC_BYTEBUFFER_H_ 2 | #define ERLTLS_C_SRC_BYTEBUFFER_H_ 3 | 4 | #include "macros.h" 5 | 6 | #include 7 | #include 8 | 9 | class ByteBuffer 10 | { 11 | 12 | public: 13 | 14 | ByteBuffer(); 15 | ByteBuffer(size_t size); 16 | ByteBuffer(const uint8_t* bytes, size_t len); 17 | 18 | ~ByteBuffer(); 19 | 20 | const uint8_t* Data() const { return bytes_ + start_; } 21 | size_t Length() const { return end_ - start_; } 22 | size_t Capacity() const { return size_ - start_; } 23 | 24 | bool 
ReadBytes(uint8_t* val, size_t len); 25 | void WriteBytes(const uint8_t* val, size_t len); 26 | 27 | uint8_t* ReserveWriteBuffer(size_t len); 28 | void Resize(size_t size); 29 | bool Consume(size_t size); 30 | void Clear(); 31 | 32 | private: 33 | 34 | void Construct(const uint8_t* bytes, size_t size); 35 | bool LeftShift(); 36 | 37 | uint8_t* bytes_; 38 | size_t size_; 39 | size_t start_; 40 | size_t end_; 41 | 42 | DISALLOW_COPY_AND_ASSIGN(ByteBuffer); 43 | }; 44 | 45 | #endif 46 | -------------------------------------------------------------------------------- /c_src/element_encoder.cc: -------------------------------------------------------------------------------- 1 | 2 | #include "element_encoder.h" 3 | 4 | #include "nif_utils.h" 5 | #include "erlxml_nif.h" 6 | #include "utf8_cleanup.h" 7 | #include "macros.h" 8 | 9 | #include 10 | #include 11 | 12 | static const int kXmlelArity = 4; 13 | static const int kXmlcdataArity = 2; 14 | 15 | ERL_NIF_TERM from_binary(ErlNifEnv* env, const char* data, size_t length, bool strip_non_utf8) 16 | { 17 | if(strip_non_utf8) 18 | { 19 | size_t new_size = utf8_cleanup(const_cast(data), length); 20 | return make_binary(env, data, new_size); 21 | } 22 | 23 | return make_binary(env, data, length); 24 | } 25 | 26 | // all the time we iterate over attributes and childrens in reverse order 27 | // to make sure we don't have to do lists:reverse in erlang 28 | 29 | bool pugi2stream_start(ErlNifEnv*env, const pugi::xml_node& node, bool strip_non_utf8, ERL_NIF_TERM* list) 30 | { 31 | if(node.type() != pugi::node_element) 32 | return false; 33 | 34 | ERL_NIF_TERM name = make_binary(env, node.name(), strlen(node.name())); 35 | ERL_NIF_TERM attrs = enif_make_list(env, 0); 36 | 37 | for (pugi::xml_attribute_iterator ait = node.attributes_end(); ait != node.attributes_begin();) 38 | { 39 | --ait; 40 | ERL_NIF_TERM key = make_binary(env, ait->name(), strlen(ait->name())); 41 | ERL_NIF_TERM value = from_binary(env, ait->value(), 
strlen(ait->value()), strip_non_utf8); 42 | attrs = enif_make_list_cell(env, enif_make_tuple2(env, key, value), attrs); 43 | } 44 | 45 | ERL_NIF_TERM xmlstreamstart = enif_make_tuple3(env, ATOMS.atomXmlStreamStart, name, attrs); 46 | *list = enif_make_list_cell(env, xmlstreamstart, *list); 47 | 48 | return true; 49 | } 50 | 51 | void pugi2term(ErlNifEnv*env, const pugi::xml_node& node, bool strip_non_utf8, ERL_NIF_TERM* list) 52 | { 53 | switch(node.type()) 54 | { 55 | case pugi::node_element: 56 | { 57 | ERL_NIF_TERM name = make_binary(env, node.name(), strlen(node.name())); 58 | ERL_NIF_TERM attrs = enif_make_list(env, 0); 59 | ERL_NIF_TERM childrens = enif_make_list(env, 0); 60 | 61 | for (pugi::xml_attribute_iterator ait = node.attributes_end(); ait != node.attributes_begin();) 62 | { 63 | --ait; 64 | ERL_NIF_TERM key = make_binary(env, ait->name(), strlen(ait->name())); 65 | ERL_NIF_TERM value = from_binary(env, ait->value(), strlen(ait->value()), strip_non_utf8); 66 | attrs = enif_make_list_cell(env, enif_make_tuple2(env, key, value), attrs); 67 | } 68 | 69 | for (pugi::xml_node_iterator nit = node.end(); nit != node.begin();) 70 | { 71 | --nit; 72 | pugi2term(env, *nit, strip_non_utf8, &childrens); 73 | } 74 | 75 | ERL_NIF_TERM xmlel = enif_make_tuple4(env, ATOMS.atomXmlel, name, attrs, childrens); 76 | *list = enif_make_list_cell(env, xmlel, *list); 77 | break; 78 | } 79 | 80 | case pugi::node_pcdata: 81 | { 82 | ERL_NIF_TERM value = from_binary(env, node.value(), strlen(node.value()), strip_non_utf8); 83 | *list = enif_make_list_cell(env, enif_make_tuple2(env, ATOMS.atomXmlcdata, value), *list); 84 | break; 85 | } 86 | 87 | default:; 88 | } 89 | } 90 | 91 | bool parse_attributes(ErlNifEnv* env, ERL_NIF_TERM list, pugi::xml_node& node) 92 | { 93 | ERL_NIF_TERM head; 94 | const ERL_NIF_TERM *items; 95 | int arity; 96 | 97 | while(enif_get_list_cell(env, list, &head, &list)) 98 | { 99 | if(!enif_get_tuple(env, head, &arity, &items) || arity != 2) 100 | return 
false; 101 | 102 | ErlNifBinary key; 103 | ErlNifBinary value; 104 | 105 | if(!get_binary(env, items[0], &key) || !get_binary(env, items[1], &value)) 106 | return false; 107 | 108 | node.append_attribute(STRING_VIEW(key)).set_value(STRING_VIEW(value)); 109 | } 110 | 111 | return true; 112 | } 113 | 114 | bool parse_childrens(ErlNifEnv* env, ERL_NIF_TERM list, pugi::xml_node& node) 115 | { 116 | ERL_NIF_TERM head; 117 | 118 | while(enif_get_list_cell(env, list, &head, &list)) 119 | { 120 | if(!term2pugi(env, head, node)) 121 | return false; 122 | } 123 | 124 | return true; 125 | } 126 | 127 | bool term2pugi(ErlNifEnv* env, ERL_NIF_TERM element, pugi::xml_node& node) 128 | { 129 | const ERL_NIF_TERM *items; 130 | int arity; 131 | 132 | if(!enif_get_tuple(env, element, &arity, &items)) 133 | return false; 134 | 135 | if(arity == kXmlelArity && enif_is_identical(ATOMS.atomXmlel, items[0])) 136 | { 137 | //parse xmlel 138 | ErlNifBinary name; 139 | 140 | if(!get_binary(env, items[1], &name)) 141 | return false; 142 | 143 | pugi::xml_node element = node.append_child(STRING_VIEW(name)); 144 | 145 | if(!parse_attributes(env, items[2], element)) 146 | return false; 147 | 148 | if(!parse_childrens(env, items[3], element)) 149 | return false; 150 | } 151 | else if(arity == kXmlcdataArity && enif_is_identical(ATOMS.atomXmlcdata, items[0])) 152 | { 153 | ErlNifBinary value; 154 | 155 | if(!get_binary(env, items[1], &value)) 156 | return false; 157 | 158 | node.append_child(pugi::node_pcdata).set_value(STRING_VIEW(value)); 159 | } 160 | else 161 | { 162 | return false; 163 | } 164 | 165 | return true; 166 | } 167 | -------------------------------------------------------------------------------- /c_src/element_encoder.h: -------------------------------------------------------------------------------- 1 | #ifndef ERLXML_C_SRC_ELEMENTS_ENCODER_H_ 2 | #define ERLXML_C_SRC_ELEMENTS_ENCODER_H_ 3 | 4 | #include "pugixml.hpp" 5 | #include "erl_nif.h" 6 | 7 | bool 
pugi2stream_start(ErlNifEnv*env, const pugi::xml_node& node, bool strip_non_utf8, ERL_NIF_TERM* list); 8 | void pugi2term(ErlNifEnv*env, const pugi::xml_node& node, bool strip_non_utf8, ERL_NIF_TERM* list); 9 | bool term2pugi(ErlNifEnv* env, ERL_NIF_TERM element, pugi::xml_node& node); 10 | 11 | #endif 12 | 13 | -------------------------------------------------------------------------------- /c_src/erlxml.cc: -------------------------------------------------------------------------------- 1 | #include "erlxml.h" 2 | #include "erlxml_nif.h" 3 | #include "nif_utils.h" 4 | #include "xmlstreamparser.h" 5 | #include "element_encoder.h" 6 | 7 | const char kErrorFailedToAllocXmlStream[] = "failed to alloc stream object"; 8 | const char kErrorBadOwner[] = "erlxml session was created on a different process"; 9 | 10 | struct enif_erlxml_stream 11 | { 12 | XmlStreamParser* parser; 13 | ERL_NIF_TERM owner_pid; 14 | }; 15 | 16 | struct stream_options 17 | { 18 | stream_options() : stanza_limit(0), strip_invalid_utf8(false) {} 19 | 20 | size_t stanza_limit; 21 | bool strip_invalid_utf8; 22 | }; 23 | 24 | struct parser_data 25 | { 26 | parser_data(ErlNifEnv* e, ERL_NIF_TERM t) : env(e), term(t) {} 27 | 28 | ErlNifEnv* env; 29 | ERL_NIF_TERM term; 30 | }; 31 | 32 | struct xml_string_writer: pugi::xml_writer 33 | { 34 | ByteBuffer buffer; 35 | 36 | void write(const void* data, size_t size) 37 | { 38 | buffer.WriteBytes(reinterpret_cast(data), size); 39 | } 40 | }; 41 | 42 | void enif_stream_parser_free(ErlNifEnv* env, void* obj) 43 | { 44 | UNUSED(env); 45 | 46 | enif_erlxml_stream* stream = static_cast(obj); 47 | 48 | if(stream->parser != NULL) 49 | delete stream->parser; 50 | } 51 | 52 | bool handle_start_stream(void* user_data, pugi::xml_document& doc, bool strip_non_utf8) 53 | { 54 | parser_data* wp = reinterpret_cast(user_data); 55 | return pugi2stream_start(wp->env, doc.first_child(), strip_non_utf8, &wp->term); 56 | } 57 | 58 | void handle_stanza(void* user_data, 
pugi::xml_document& doc, bool strip_non_utf8) 59 | { 60 | parser_data* wp = reinterpret_cast(user_data); 61 | pugi2term(wp->env, doc.first_child(), strip_non_utf8, &wp->term); 62 | } 63 | 64 | void handle_end_stream(void* user_data, const std::string& rootname) 65 | { 66 | parser_data* wp = reinterpret_cast(user_data); 67 | ERL_NIF_TERM name = make_binary(wp->env, rootname.c_str(), rootname.length()); 68 | ERL_NIF_TERM xmlstreamstart = enif_make_tuple2(wp->env, ATOMS.atomXmlStreamEnd, name); 69 | wp->term = enif_make_list_cell(wp->env, xmlstreamstart, wp->term); 70 | } 71 | 72 | ERL_NIF_TERM parse_stream_options(ErlNifEnv* env, ERL_NIF_TERM list, stream_options* opts) 73 | { 74 | if(!enif_is_list(env, list)) 75 | return make_bad_options(env, list); 76 | 77 | ERL_NIF_TERM head; 78 | const ERL_NIF_TERM *items; 79 | int arity; 80 | 81 | while(enif_get_list_cell(env, list, &head, &list)) 82 | { 83 | if(!enif_get_tuple(env, head, &arity, &items) || arity != 2) 84 | return make_bad_options(env, head); 85 | 86 | ERL_NIF_TERM key = items[0]; 87 | ERL_NIF_TERM value = items[1]; 88 | 89 | if(enif_is_identical(key, ATOMS.atomStanzaLimit)) 90 | { 91 | if(!enif_get_uint64(env, value, &opts->stanza_limit)) 92 | return make_bad_options(env, head); 93 | } 94 | else if(enif_is_identical(key, ATOMS.atomStripInvalidUtf8)) 95 | { 96 | if(!get_boolean(value, &opts->strip_invalid_utf8)) 97 | return make_bad_options(env, head); 98 | } 99 | else 100 | { 101 | return make_bad_options(env, head); 102 | } 103 | } 104 | 105 | return ATOMS.atomOk; 106 | } 107 | 108 | ERL_NIF_TERM enif_stream_parser_new(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]) 109 | { 110 | UNUSED(argc); 111 | 112 | erlxml_data* data = static_cast(enif_priv_data(env)); 113 | 114 | stream_options opts; 115 | 116 | ERL_NIF_TERM parse_result = parse_stream_options(env, argv[0], &opts); 117 | 118 | if(!enif_is_identical(parse_result, ATOMS.atomOk)) 119 | return parse_result; 120 | 121 | enif_erlxml_stream* nif_stream = 
static_cast<enif_erlxml_stream*>(enif_alloc_resource(data->res_xml_stream_parser, sizeof(enif_erlxml_stream))); 122 | 123 | if(nif_stream == NULL) 124 | return make_error(env, kErrorFailedToAllocXmlStream); 125 | 126 | ErlNifPid current_pid; 127 | enif_self(env, &current_pid); 128 | 129 | nif_stream->parser = new XmlStreamParser(opts.stanza_limit, opts.strip_invalid_utf8, handle_start_stream, handle_end_stream, handle_stanza); 130 | nif_stream->owner_pid = enif_make_pid(env, &current_pid); 131 | 132 | ERL_NIF_TERM term = enif_make_resource(env, nif_stream); 133 | enif_release_resource(nif_stream); 134 | return enif_make_tuple2(env, ATOMS.atomOk, term); 135 | } 136 | 137 | ERL_NIF_TERM enif_stream_parser_feed(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]) 138 | { 139 | UNUSED(argc); 140 | 141 | erlxml_data* data = static_cast<erlxml_data*>(enif_priv_data(env)); 142 | 143 | enif_erlxml_stream* stream = NULL; 144 | ErlNifBinary bin; 145 | 146 | if(!enif_get_resource(env, argv[0], data->res_xml_stream_parser, (void**) &stream)) 147 | return make_badarg(env); 148 | 149 | if(!get_binary(env, argv[1], &bin)) 150 | return make_badarg(env); 151 | 152 | ErlNifPid current_pid; 153 | 154 | if(enif_self(env, &current_pid) && !enif_is_identical(stream->owner_pid, enif_make_pid(env, &current_pid))) 155 | return make_error(env, kErrorBadOwner); 156 | 157 | parser_data parser_data(env, enif_make_list(env, 0)); 158 | XmlStreamParser::parse_result result = stream->parser->FeedData(bin.data, bin.size, &parser_data); 159 | 160 | consume_timeslice(env, bin.size); 161 | 162 | switch (result) 163 | { 164 | case XmlStreamParser::kParseOk: 165 | if(!enif_make_reverse_list(env, parser_data.term, &parser_data.term)) 166 | return make_error(env, "failed to reverse the element list"); 167 | 168 | return make_ok_result(env, parser_data.term); 169 | 170 | case XmlStreamParser::kParseInvalidXml: 171 | case XmlStreamParser::kParseStanzaLimitHit: 172 | { 173 | ERL_NIF_TERM error_tag = (result == XmlStreamParser::kParseInvalidXml ?
ATOMS.atomErrorInvalidStanza : ATOMS.atomErrorMaxStanzaLimitHit); 174 | const char* data = reinterpret_cast<const char*>(stream->parser->GetBufferedData()->Data()); 175 | ERL_NIF_TERM binary = make_binary(env, data, stream->parser->GetBufferedData()->Length()); 176 | stream->parser->Reset(true); 177 | return make_error(env, enif_make_tuple2(env, error_tag, binary)); 178 | } 179 | 180 | default: 181 | return make_error(env, "unknown error"); 182 | } 183 | } 184 | 185 | ERL_NIF_TERM enif_stream_parser_reset(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]) 186 | { 187 | UNUSED(argc); 188 | 189 | erlxml_data* data = static_cast<erlxml_data*>(enif_priv_data(env)); 190 | 191 | enif_erlxml_stream* stream = NULL; 192 | ErlNifPid current_pid; 193 | 194 | if(!enif_get_resource(env, argv[0], data->res_xml_stream_parser, (void**) &stream)) 195 | return make_badarg(env); 196 | 197 | if(enif_self(env, &current_pid) && !enif_is_identical(stream->owner_pid, enif_make_pid(env, &current_pid))) 198 | return make_error(env, kErrorBadOwner); 199 | 200 | stream->parser->Reset(true); 201 | return ATOMS.atomOk; 202 | } 203 | 204 | ERL_NIF_TERM enif_dom_parse(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]) 205 | { 206 | UNUSED(argc); 207 | 208 | ErlNifBinary bin; 209 | 210 | if(!get_binary(env, argv[0], &bin)) 211 | return make_badarg(env); 212 | 213 | pugi::xml_document pugi_doc; 214 | 215 | if(pugi_doc.load_buffer(bin.data, bin.size, pugi::parse_default).status != pugi::status_ok) 216 | return make_error(env, ATOMS.atomErrorInvalidStanza); 217 | 218 | ERL_NIF_TERM list = enif_make_list(env, 0); 219 | pugi2term(env, pugi_doc.first_child(), false, &list); 220 | 221 | ERL_NIF_TERM head; 222 | ERL_NIF_TERM tail; 223 | 224 | if(!enif_get_list_cell(env, list, &head, &tail)) 225 | return make_error(env, ATOMS.atomErrorInvalidStanza); 226 | 227 | return make_ok_result(env, head); 228 | } 229 | 230 | ERL_NIF_TERM enif_dom_to_binary(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]) 231 | { 232 | UNUSED(argc); 233 | 234 |
pugi::xml_document doc; 235 | 236 | if(!term2pugi(env, argv[0], doc)) 237 | return make_badarg(env); 238 | 239 | xml_string_writer w; 240 | doc.document_element().print(w, "\t", pugi::format_raw); 241 | return make_binary(env, reinterpret_cast(w.buffer.Data()), w.buffer.Length()); 242 | } 243 | -------------------------------------------------------------------------------- /c_src/erlxml.h: -------------------------------------------------------------------------------- 1 | #ifndef ERLXML_C_SRC_ERLXML_H_ 2 | #define ERLXML_C_SRC_ERLXML_H_ 3 | 4 | #include "erl_nif.h" 5 | 6 | ERL_NIF_TERM enif_stream_parser_new(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]); 7 | void enif_stream_parser_free(ErlNifEnv* env, void* obj); 8 | 9 | ERL_NIF_TERM enif_stream_parser_feed(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]); 10 | ERL_NIF_TERM enif_stream_parser_reset(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]); 11 | ERL_NIF_TERM enif_dom_parse(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]); 12 | ERL_NIF_TERM enif_dom_to_binary(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]); 13 | 14 | #endif 15 | -------------------------------------------------------------------------------- /c_src/erlxml_nif.cc: -------------------------------------------------------------------------------- 1 | #include "erlxml_nif.h" 2 | #include "nif_utils.h" 3 | #include "macros.h" 4 | #include "pugixml.hpp" 5 | #include "allocators.h" 6 | #include "erlxml.h" 7 | 8 | const char kAtomOk[] = "ok"; 9 | const char kAtomError[] = "error"; 10 | const char kAtomTrue[] = "true"; 11 | const char kAtomFalse[] = "false"; 12 | const char kAtomBadArg[] = "badarg"; 13 | const char kAtomOptions[] = "options"; 14 | 15 | const char kAtomStanzaLimit[] = "stanza_limit"; 16 | const char kAtomStripInvalidUtf8[] = "strip_non_utf8"; 17 | 18 | const char kAtomErrorInvalidStanza[] = "invalid_stanza"; 19 | const char kAtomErrorMaxStanzaLimitHit[] = "max_stanza_limit_hit"; 20 | 21 | const char 
kAtomXmlel[] = "xmlel"; 22 | const char kAtomXmlcdata[] = "xmlcdata"; 23 | const char kAtomXmlStreamStart[] = "xmlstreamstart"; 24 | const char kAtomXmlStreamEnd[] = "xmlstreamend"; 25 | 26 | atoms ATOMS; 27 | 28 | void open_resources(ErlNifEnv* env, erlxml_data* data) 29 | { 30 | ErlNifResourceFlags flags = static_cast(ERL_NIF_RT_CREATE | ERL_NIF_RT_TAKEOVER); 31 | data->res_xml_stream_parser = enif_open_resource_type(env, NULL, "res_xml_stream_parser", enif_stream_parser_free, flags, NULL); 32 | } 33 | 34 | int on_nif_load(ErlNifEnv* env, void** priv_data, ERL_NIF_TERM load_info) 35 | { 36 | UNUSED(load_info); 37 | 38 | pugi::set_memory_management_functions(erlxml_allocate, erlxml_deallocate); 39 | 40 | ATOMS.atomOk = make_atom(env, kAtomOk); 41 | ATOMS.atomError = make_atom(env, kAtomError); 42 | ATOMS.atomTrue = make_atom(env, kAtomTrue); 43 | ATOMS.atomFalse = make_atom(env, kAtomFalse); 44 | ATOMS.atomOptions = make_atom(env, kAtomOptions); 45 | ATOMS.atomBadArg = make_atom(env, kAtomBadArg); 46 | 47 | ATOMS.atomErrorInvalidStanza = make_atom(env, kAtomErrorInvalidStanza); 48 | ATOMS.atomErrorMaxStanzaLimitHit = make_atom(env, kAtomErrorMaxStanzaLimitHit); 49 | 50 | ATOMS.atomStanzaLimit = make_atom(env, kAtomStanzaLimit); 51 | ATOMS.atomStripInvalidUtf8 = make_atom(env, kAtomStripInvalidUtf8); 52 | 53 | ATOMS.atomXmlel = make_atom(env, kAtomXmlel); 54 | ATOMS.atomXmlcdata = make_atom(env, kAtomXmlcdata); 55 | ATOMS.atomXmlStreamStart = make_atom(env, kAtomXmlStreamStart); 56 | ATOMS.atomXmlStreamEnd = make_atom(env, kAtomXmlStreamEnd); 57 | 58 | erlxml_data* data = static_cast(enif_alloc(sizeof(erlxml_data))); 59 | open_resources(env, data); 60 | 61 | *priv_data = data; 62 | return 0; 63 | } 64 | 65 | void on_nif_unload(ErlNifEnv* env, void* priv_data) 66 | { 67 | UNUSED(env); 68 | 69 | erlxml_data* data = static_cast(priv_data); 70 | enif_free(data); 71 | } 72 | 73 | int on_nif_upgrade(ErlNifEnv* env, void** priv, void** old_priv, ERL_NIF_TERM info) 74 | { 
75 | UNUSED(old_priv); 76 | UNUSED(info); 77 | 78 | erlxml_data* data = static_cast(enif_alloc(sizeof(erlxml_data))); 79 | open_resources(env, data); 80 | 81 | *priv = data; 82 | return 0; 83 | } 84 | 85 | static ErlNifFunc nif_funcs[] = 86 | { 87 | {"new_stream", 1, enif_stream_parser_new}, 88 | {"feed_stream", 2, enif_stream_parser_feed}, 89 | {"reset_stream", 1, enif_stream_parser_reset}, 90 | {"dom_parse", 1, enif_dom_parse}, 91 | {"to_binary", 1, enif_dom_to_binary} 92 | }; 93 | 94 | ERL_NIF_INIT(erlxml_nif, nif_funcs, on_nif_load, NULL, on_nif_upgrade, on_nif_unload) 95 | -------------------------------------------------------------------------------- /c_src/erlxml_nif.h: -------------------------------------------------------------------------------- 1 | #ifndef ERLXML_C_SRC_ERLXML_NIF_H_ 2 | #define ERLXML_C_SRC_ERLXML_NIF_H_ 3 | 4 | #include "erl_nif.h" 5 | 6 | struct atoms 7 | { 8 | ERL_NIF_TERM atomOk; 9 | ERL_NIF_TERM atomError; 10 | ERL_NIF_TERM atomTrue; 11 | ERL_NIF_TERM atomFalse; 12 | ERL_NIF_TERM atomBadArg; 13 | ERL_NIF_TERM atomOptions; 14 | 15 | //errors 16 | ERL_NIF_TERM atomErrorInvalidStanza; 17 | ERL_NIF_TERM atomErrorMaxStanzaLimitHit; 18 | 19 | //options 20 | ERL_NIF_TERM atomStanzaLimit; 21 | ERL_NIF_TERM atomStripInvalidUtf8; 22 | 23 | //elements 24 | 25 | ERL_NIF_TERM atomXmlel; 26 | ERL_NIF_TERM atomXmlcdata; 27 | ERL_NIF_TERM atomXmlStreamStart; 28 | ERL_NIF_TERM atomXmlStreamEnd; 29 | 30 | }; 31 | 32 | struct erlxml_data 33 | { 34 | ErlNifResourceType* res_xml_stream_parser; 35 | }; 36 | 37 | extern atoms ATOMS; 38 | 39 | #endif 40 | -------------------------------------------------------------------------------- /c_src/macros.h: -------------------------------------------------------------------------------- 1 | #ifndef ERLXML_C_SRC_MACROS_H_ 2 | #define ERLXML_C_SRC_MACROS_H_ 3 | 4 | #define UNUSED(expr) do { (void)(expr); } while (0) 5 | #define STRING_VIEW(bin) std::string_view(reinterpret_cast(bin.data), bin.size) 6 | 7 | 
#define DISALLOW_ASSIGN(TypeName) void operator=(const TypeName&) 8 | #define DISALLOW_COPY_AND_ASSIGN(TypeName) TypeName(const TypeName&); DISALLOW_ASSIGN(TypeName) 9 | #define DISALLOW_IMPLICIT_CONSTRUCTORS(TypeName) TypeName(); DISALLOW_COPY_AND_ASSIGN(TypeName) 10 | 11 | #ifdef NDEBUG 12 | #define ASSERT(x) UNUSED(x) 13 | #else 14 | #include 15 | #define ASSERT(x) assert(x) 16 | #endif 17 | 18 | #endif 19 | -------------------------------------------------------------------------------- /c_src/nif.mk: -------------------------------------------------------------------------------- 1 | # Based on c_src.mk from erlang.mk by Loic Hoguin 2 | # https://github.com/ninenines/erlang.mk/blob/master/plugins/c_src.mk 3 | 4 | CURDIR := $(shell pwd) 5 | BASEDIR := $(abspath $(CURDIR)/..) 6 | 7 | ifndef REBAR_BARE_COMPILER_OUTPUT_DIR 8 | PRIV_DIR ?= $(BASEDIR)/priv 9 | else 10 | PRIV_DIR ?= $(REBAR_BARE_COMPILER_OUTPUT_DIR)/priv 11 | endif 12 | 13 | C_SRC_DIR = $(CURDIR) 14 | C_SRC_ENV ?= $(C_SRC_DIR)/env.mk 15 | C_SRC_OUTPUT ?= $(PRIV_DIR)/$(PROJECT_NIF_NAME).so 16 | 17 | #regenerate all the time the env.mk 18 | ifneq ($(wildcard $(C_SRC_DIR)),) 19 | GEN_ENV ?= $(shell erl -noshell -s init stop -eval "file:write_file(\"$(C_SRC_ENV)\", \ 20 | io_lib:format( \ 21 | \"ERTS_INCLUDE_DIR ?= ~s/erts-~s/include/~n\" \ 22 | \"ERL_INTERFACE_INCLUDE_DIR ?= ~s~n\" \ 23 | \"ERL_INTERFACE_LIB_DIR ?= ~s~n\", \ 24 | [code:root_dir(), erlang:system_info(version), \ 25 | code:lib_dir(erl_interface, include), \ 26 | code:lib_dir(erl_interface, lib)])), \ 27 | halt().") 28 | $(GEN_ENV) 29 | endif 30 | 31 | include $(C_SRC_ENV) 32 | 33 | # System type and C compiler/flags. 
34 | 35 | UNAME_SYS_ORG := $(shell uname -s) 36 | UNAME_SYS = $(shell echo $(UNAME_SYS_ORG) | tr A-Z a-z) 37 | 38 | ifeq ($(UNAME_SYS), darwin) 39 | CC ?= cc 40 | CFLAGS ?= -O3 -std=c99 -finline-functions -Wall 41 | CXXFLAGS ?= -O3 -Wall 42 | LDFLAGS ?= 43 | else ifeq ($(UNAME_SYS), freebsd) 44 | CC ?= cc 45 | CFLAGS ?= -O3 -std=c99 -finline-functions -Wall 46 | CXXFLAGS ?= -O3 -finline-functions -Wall 47 | LDFLAGS ?= -Wl,--exclude-libs=ALL 48 | else ifeq ($(UNAME_SYS), linux) 49 | CC ?= gcc 50 | CFLAGS ?= -O3 -std=c99 -finline-functions -Wall 51 | CXXFLAGS ?= -O3 -finline-functions -Wall 52 | LDFLAGS ?= -Wl,--exclude-libs=ALL 53 | endif 54 | 55 | CFLAGS += -fPIC -I $(ERTS_INCLUDE_DIR) -I $(ERL_INTERFACE_INCLUDE_DIR) 56 | CXXFLAGS += -fPIC -I $(ERTS_INCLUDE_DIR) -I $(ERL_INTERFACE_INCLUDE_DIR) 57 | LDFLAGS += -L $(ERL_INTERFACE_LIB_DIR) -shared -lei 58 | 59 | # Verbosity. 60 | 61 | c_verbose_0 = @echo " C " $(?F); 62 | c_verbose = $(c_verbose_$(V)) 63 | 64 | cpp_verbose_0 = @echo " CPP " $(?F); 65 | cpp_verbose = $(cpp_verbose_$(V)) 66 | 67 | link_verbose_0 = @echo " LD " $(@F); 68 | link_verbose = $(link_verbose_$(V)) 69 | 70 | SOURCES := $(shell find $(C_SRC_DIR) -type f \( -name "*.c" -o -name "*.C" -o -name "*.cc" -o -name "*.cpp" \)) 71 | OBJECTS = $(addsuffix .o, $(basename $(SOURCES))) 72 | 73 | COMPILE_C = $(c_verbose) $(CC) $(CFLAGS) $(CPPFLAGS) -c 74 | COMPILE_CPP = $(cpp_verbose) $(CXX) $(CXXFLAGS) $(CPPFLAGS) -c 75 | 76 | $(C_SRC_OUTPUT): $(OBJECTS) 77 | @mkdir -p $(PRIV_DIR)/ 78 | $(link_verbose) $(CC) $(OBJECTS) $(LDFLAGS) -o $(C_SRC_OUTPUT) 79 | 80 | %.o: %.c 81 | $(COMPILE_C) $(OUTPUT_OPTION) $< 82 | 83 | %.o: %.cc 84 | $(COMPILE_CPP) $(OUTPUT_OPTION) $< 85 | 86 | %.o: %.C 87 | $(COMPILE_CPP) $(OUTPUT_OPTION) $< 88 | 89 | %.o: %.cpp 90 | $(COMPILE_CPP) $(OUTPUT_OPTION) $< 91 | 92 | clean: 93 | @rm -f $(C_SRC_OUTPUT) $(OBJECTS); rm -f $(C_SRC_ENV) 94 | -------------------------------------------------------------------------------- 
/c_src/nif_utils.cc: -------------------------------------------------------------------------------- 1 | #include "nif_utils.h" 2 | #include "erlxml_nif.h" 3 | #include "macros.h" 4 | 5 | #include 6 | 7 | // This should correspond to the similar define in erlxml_nif.erl 8 | #define MAX_BYTES_TO_NIF 20000 9 | 10 | ERL_NIF_TERM make_atom(ErlNifEnv* env, const char* name) 11 | { 12 | ERL_NIF_TERM ret; 13 | 14 | if(enif_make_existing_atom(env, name, &ret, ERL_NIF_LATIN1)) 15 | return ret; 16 | 17 | return enif_make_atom(env, name); 18 | } 19 | 20 | ERL_NIF_TERM make_binary(ErlNifEnv* env, const char* buff, size_t length) 21 | { 22 | ERL_NIF_TERM term; 23 | uint8_t *destination_buffer = enif_make_new_binary(env, length, &term); 24 | memcpy(destination_buffer, buff, length); 25 | return term; 26 | } 27 | 28 | ERL_NIF_TERM make_error(ErlNifEnv* env, const char* error) 29 | { 30 | return make_error(env, make_binary(env, error, strlen(error))); 31 | } 32 | 33 | ERL_NIF_TERM make_error(ErlNifEnv* env, ERL_NIF_TERM term) 34 | { 35 | return enif_make_tuple2(env, ATOMS.atomError, term); 36 | } 37 | 38 | ERL_NIF_TERM make_bad_options(ErlNifEnv* env, ERL_NIF_TERM term) 39 | { 40 | return make_error(env, enif_make_tuple(env, 2, ATOMS.atomOptions, term)); 41 | } 42 | 43 | ERL_NIF_TERM make_badarg(ErlNifEnv* env) 44 | { 45 | return enif_make_tuple2(env, ATOMS.atomError, ATOMS.atomBadArg); 46 | } 47 | 48 | ERL_NIF_TERM make_ok_result(ErlNifEnv* env, ERL_NIF_TERM term) 49 | { 50 | return enif_make_tuple(env, 2, ATOMS.atomOk, term); 51 | } 52 | 53 | void consume_timeslice(ErlNifEnv *env, size_t bytes) 54 | { 55 | int cost = static_cast((bytes * 100) / MAX_BYTES_TO_NIF); 56 | 57 | if(cost) 58 | enif_consume_timeslice(env, cost > 100 ? 
100 : cost); 59 | } 60 | 61 | bool get_binary(ErlNifEnv* env, ERL_NIF_TERM term, ErlNifBinary* bin) 62 | { 63 | if(enif_is_binary(env, term)) 64 | return enif_inspect_binary(env, term, bin); 65 | 66 | return enif_inspect_iolist_as_binary(env, term, bin); 67 | } 68 | 69 | bool get_boolean(ERL_NIF_TERM term, bool* val) 70 | { 71 | if(enif_is_identical(term, ATOMS.atomTrue)) 72 | { 73 | *val = true; 74 | return true; 75 | } 76 | 77 | if(enif_is_identical(term, ATOMS.atomFalse)) 78 | { 79 | *val = false; 80 | return true; 81 | } 82 | 83 | return false; 84 | } 85 | -------------------------------------------------------------------------------- /c_src/nif_utils.h: -------------------------------------------------------------------------------- 1 | #ifndef ERLTLS_C_SRC_NIF_UTILS_H_ 2 | #define ERLTLS_C_SRC_NIF_UTILS_H_ 3 | 4 | #include 5 | 6 | #include "erl_nif.h" 7 | 8 | ERL_NIF_TERM make_atom(ErlNifEnv* env, const char* name); 9 | ERL_NIF_TERM make_error(ErlNifEnv* env, const char* error); 10 | ERL_NIF_TERM make_error(ErlNifEnv* env, ERL_NIF_TERM term); 11 | ERL_NIF_TERM make_bad_options(ErlNifEnv* env, ERL_NIF_TERM term); 12 | ERL_NIF_TERM make_badarg(ErlNifEnv* env); 13 | ERL_NIF_TERM make_binary(ErlNifEnv* env, const char* buff, size_t length); 14 | ERL_NIF_TERM make_ok_result(ErlNifEnv* env, ERL_NIF_TERM term); 15 | 16 | void consume_timeslice(ErlNifEnv *env, size_t bytes); 17 | 18 | bool get_binary(ErlNifEnv* env, ERL_NIF_TERM term, ErlNifBinary* bin); 19 | bool get_boolean(ERL_NIF_TERM term, bool* val); 20 | 21 | #endif 22 | -------------------------------------------------------------------------------- /c_src/pugixml/.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | -------------------------------------------------------------------------------- /c_src/pugixml/pugiconfig.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * pugixml parser - version 1.15 3 | * 
-------------------------------------------------------- 4 | * Copyright (C) 2006-2025, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com) 5 | * Report bugs and download new versions at https://pugixml.org/ 6 | * 7 | * This library is distributed under the MIT License. See notice at the end 8 | * of this file. 9 | * 10 | * This work is based on the pugxml parser, which is: 11 | * Copyright (C) 2003, by Kristen Wegner (kristen@tima.net) 12 | */ 13 | 14 | #ifndef HEADER_PUGICONFIG_HPP 15 | #define HEADER_PUGICONFIG_HPP 16 | 17 | // Uncomment this to enable wchar_t mode 18 | // #define PUGIXML_WCHAR_MODE 19 | 20 | // Uncomment this to enable compact mode 21 | // #define PUGIXML_COMPACT 22 | 23 | // Uncomment this to disable XPath 24 | #define PUGIXML_NO_XPATH 25 | 26 | // Uncomment this to disable STL 27 | // #define PUGIXML_NO_STL 28 | 29 | // Uncomment this to disable exceptions 30 | #define PUGIXML_NO_EXCEPTIONS 31 | 32 | // Set this to control attributes for public classes/functions, i.e.: 33 | // #define PUGIXML_API __declspec(dllexport) // to export all public symbols from DLL 34 | // #define PUGIXML_CLASS __declspec(dllimport) // to import all classes from DLL 35 | // #define PUGIXML_FUNCTION __fastcall // to set calling conventions to all public functions to fastcall 36 | // In absence of PUGIXML_CLASS/PUGIXML_FUNCTION definitions PUGIXML_API is used instead 37 | 38 | // Tune these constants to adjust memory-related behavior 39 | // #define PUGIXML_MEMORY_PAGE_SIZE 32768 40 | // #define PUGIXML_MEMORY_OUTPUT_STACK 10240 41 | // #define PUGIXML_MEMORY_XPATH_PAGE_SIZE 4096 42 | 43 | // Tune this constant to adjust max nesting for XPath queries 44 | // #define PUGIXML_XPATH_DEPTH_LIMIT 1024 45 | 46 | // Uncomment this to switch to header-only version 47 | // #define PUGIXML_HEADER_ONLY 48 | 49 | // Uncomment this to enable long long support (usually enabled automatically) 50 | #define PUGIXML_HAS_LONG_LONG 51 | 52 | // Uncomment this to enable support for 
std::string_view (usually enabled automatically) 53 | #define PUGIXML_HAS_STRING_VIEW 54 | 55 | #endif 56 | 57 | /** 58 | * Copyright (c) 2006-2025 Arseny Kapoulkine 59 | * 60 | * Permission is hereby granted, free of charge, to any person 61 | * obtaining a copy of this software and associated documentation 62 | * files (the "Software"), to deal in the Software without 63 | * restriction, including without limitation the rights to use, 64 | * copy, modify, merge, publish, distribute, sublicense, and/or sell 65 | * copies of the Software, and to permit persons to whom the 66 | * Software is furnished to do so, subject to the following 67 | * conditions: 68 | * 69 | * The above copyright notice and this permission notice shall be 70 | * included in all copies or substantial portions of the Software. 71 | * 72 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 73 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 74 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 75 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 76 | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 77 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 78 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 79 | * OTHER DEALINGS IN THE SOFTWARE. 80 | */ 81 | -------------------------------------------------------------------------------- /c_src/pugixml/pugixml.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * pugixml parser - version 1.15 3 | * -------------------------------------------------------- 4 | * Copyright (C) 2006-2025, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com) 5 | * Report bugs and download new versions at https://pugixml.org/ 6 | * 7 | * This library is distributed under the MIT License. See notice at the end 8 | * of this file. 
9 | * 10 | * This work is based on the pugxml parser, which is: 11 | * Copyright (C) 2003, by Kristen Wegner (kristen@tima.net) 12 | */ 13 | 14 | // Define version macro; evaluates to major * 1000 + minor * 10 + patch so that it's safe to use in less-than comparisons 15 | // Note: pugixml used major * 100 + minor * 10 + patch format up until 1.9 (which had version identifier 190); starting from pugixml 1.10, the minor version number is two digits 16 | #ifndef PUGIXML_VERSION 17 | # define PUGIXML_VERSION 1150 // 1.15 18 | #endif 19 | 20 | // Include user configuration file (this can define various configuration macros) 21 | #include "pugiconfig.hpp" 22 | 23 | #ifndef HEADER_PUGIXML_HPP 24 | #define HEADER_PUGIXML_HPP 25 | 26 | // Include stddef.h for size_t and ptrdiff_t 27 | #include 28 | 29 | // Include exception header for XPath 30 | #if !defined(PUGIXML_NO_XPATH) && !defined(PUGIXML_NO_EXCEPTIONS) 31 | # include 32 | #endif 33 | 34 | // Include STL headers 35 | #ifndef PUGIXML_NO_STL 36 | # include 37 | # include 38 | # include 39 | #endif 40 | 41 | // Check if std::string_view is available 42 | #if !defined(PUGIXML_HAS_STRING_VIEW) && !defined(PUGIXML_NO_STL) 43 | # if __cplusplus >= 201703L 44 | # define PUGIXML_HAS_STRING_VIEW 45 | # elif defined(_MSVC_LANG) && _MSVC_LANG >= 201703L 46 | # define PUGIXML_HAS_STRING_VIEW 47 | # endif 48 | #endif 49 | 50 | // Include string_view if appropriate 51 | #ifdef PUGIXML_HAS_STRING_VIEW 52 | # include 53 | #endif 54 | 55 | // Macro for deprecated features 56 | #ifndef PUGIXML_DEPRECATED 57 | # if defined(__GNUC__) 58 | # define PUGIXML_DEPRECATED __attribute__((deprecated)) 59 | # elif defined(_MSC_VER) && _MSC_VER >= 1300 60 | # define PUGIXML_DEPRECATED __declspec(deprecated) 61 | # else 62 | # define PUGIXML_DEPRECATED 63 | # endif 64 | #endif 65 | 66 | // If no API is defined, assume default 67 | #ifndef PUGIXML_API 68 | # define PUGIXML_API 69 | #endif 70 | 71 | // If no API for classes is defined, assume default 
72 | #ifndef PUGIXML_CLASS 73 | # define PUGIXML_CLASS PUGIXML_API 74 | #endif 75 | 76 | // If no API for functions is defined, assume default 77 | #ifndef PUGIXML_FUNCTION 78 | # define PUGIXML_FUNCTION PUGIXML_API 79 | #endif 80 | 81 | // If the platform is known to have long long support, enable long long functions 82 | #ifndef PUGIXML_HAS_LONG_LONG 83 | # if __cplusplus >= 201103 84 | # define PUGIXML_HAS_LONG_LONG 85 | # elif defined(_MSC_VER) && _MSC_VER >= 1400 86 | # define PUGIXML_HAS_LONG_LONG 87 | # endif 88 | #endif 89 | 90 | // If the platform is known to have move semantics support, compile move ctor/operator implementation 91 | #ifndef PUGIXML_HAS_MOVE 92 | # if __cplusplus >= 201103 93 | # define PUGIXML_HAS_MOVE 94 | # elif defined(_MSC_VER) && _MSC_VER >= 1600 95 | # define PUGIXML_HAS_MOVE 96 | # endif 97 | #endif 98 | 99 | // If C++ is 2011 or higher, use 'noexcept' specifiers 100 | #ifndef PUGIXML_NOEXCEPT 101 | # if __cplusplus >= 201103 102 | # define PUGIXML_NOEXCEPT noexcept 103 | # elif defined(_MSC_VER) && _MSC_VER >= 1900 104 | # define PUGIXML_NOEXCEPT noexcept 105 | # else 106 | # define PUGIXML_NOEXCEPT throw() 107 | # endif 108 | #endif 109 | 110 | // Some functions can not be noexcept in compact mode 111 | #ifdef PUGIXML_COMPACT 112 | # define PUGIXML_NOEXCEPT_IF_NOT_COMPACT 113 | #else 114 | # define PUGIXML_NOEXCEPT_IF_NOT_COMPACT PUGIXML_NOEXCEPT 115 | #endif 116 | 117 | // If C++ is 2011 or higher, add 'override' qualifiers 118 | #ifndef PUGIXML_OVERRIDE 119 | # if __cplusplus >= 201103 120 | # define PUGIXML_OVERRIDE override 121 | # elif defined(_MSC_VER) && _MSC_VER >= 1700 122 | # define PUGIXML_OVERRIDE override 123 | # else 124 | # define PUGIXML_OVERRIDE 125 | # endif 126 | #endif 127 | 128 | // If C++ is 2011 or higher, use 'nullptr' 129 | #ifndef PUGIXML_NULL 130 | # if __cplusplus >= 201103 131 | # define PUGIXML_NULL nullptr 132 | # elif defined(_MSC_VER) && _MSC_VER >= 1600 133 | # define PUGIXML_NULL nullptr 134 | # 
else 135 | # define PUGIXML_NULL 0 136 | # endif 137 | #endif 138 | 139 | // Character interface macros 140 | #ifdef PUGIXML_WCHAR_MODE 141 | # define PUGIXML_TEXT(t) L ## t 142 | # define PUGIXML_CHAR wchar_t 143 | #else 144 | # define PUGIXML_TEXT(t) t 145 | # define PUGIXML_CHAR char 146 | #endif 147 | 148 | namespace pugi 149 | { 150 | // Character type used for all internal storage and operations; depends on PUGIXML_WCHAR_MODE 151 | typedef PUGIXML_CHAR char_t; 152 | 153 | #ifndef PUGIXML_NO_STL 154 | // String type used for operations that work with STL string; depends on PUGIXML_WCHAR_MODE 155 | typedef std::basic_string<PUGIXML_CHAR> string_t; 156 | #endif 157 | 158 | #ifdef PUGIXML_HAS_STRING_VIEW 159 | // String view type used for operations that can work with a length delimited string; depends on PUGIXML_WCHAR_MODE 160 | typedef std::basic_string_view<PUGIXML_CHAR> string_view_t; 161 | #endif 162 | } 163 | 164 | // The PugiXML namespace 165 | namespace pugi 166 | { 167 | // Tree node types 168 | enum xml_node_type 169 | { 170 | node_null, // Empty (null) node handle 171 | node_document, // A document tree's absolute root 172 | node_element, // Element tag, i.e. '<node/>' 173 | node_pcdata, // Plain character data, i.e. 'text' 174 | node_cdata, // Character data, i.e. '<![CDATA[text]]>' 175 | node_comment, // Comment tag, i.e. '<!-- text -->' 176 | node_pi, // Processing instruction, i.e. '<?name?>' 177 | node_declaration, // Document declaration, i.e. '<?xml version="1.0"?>' 178 | node_doctype // Document type declaration, i.e. '<!DOCTYPE doc>' 179 | }; 180 | 181 | // Parsing options 182 | 183 | // Minimal parsing mode (equivalent to turning all other flags off). 184 | // Only elements and PCDATA sections are added to the DOM tree, no text conversions are performed. 185 | const unsigned int parse_minimal = 0x0000; 186 | 187 | // This flag determines if processing instructions (node_pi) are added to the DOM tree. This flag is off by default.
188 | const unsigned int parse_pi = 0x0001; 189 | 190 | // This flag determines if comments (node_comment) are added to the DOM tree. This flag is off by default. 191 | const unsigned int parse_comments = 0x0002; 192 | 193 | // This flag determines if CDATA sections (node_cdata) are added to the DOM tree. This flag is on by default. 194 | const unsigned int parse_cdata = 0x0004; 195 | 196 | // This flag determines if plain character data (node_pcdata) that consist only of whitespace are added to the DOM tree. 197 | // This flag is off by default; turning it on usually results in slower parsing and more memory consumption. 198 | const unsigned int parse_ws_pcdata = 0x0008; 199 | 200 | // This flag determines if character and entity references are expanded during parsing. This flag is on by default. 201 | const unsigned int parse_escapes = 0x0010; 202 | 203 | // This flag determines if EOL characters are normalized (converted to #xA) during parsing. This flag is on by default. 204 | const unsigned int parse_eol = 0x0020; 205 | 206 | // This flag determines if attribute values are normalized using CDATA normalization rules during parsing. This flag is on by default. 207 | const unsigned int parse_wconv_attribute = 0x0040; 208 | 209 | // This flag determines if attribute values are normalized using NMTOKENS normalization rules during parsing. This flag is off by default. 210 | const unsigned int parse_wnorm_attribute = 0x0080; 211 | 212 | // This flag determines if document declaration (node_declaration) is added to the DOM tree. This flag is off by default. 213 | const unsigned int parse_declaration = 0x0100; 214 | 215 | // This flag determines if document type declaration (node_doctype) is added to the DOM tree. This flag is off by default. 
216 | const unsigned int parse_doctype = 0x0200; 217 | 218 | // This flag determines if plain character data (node_pcdata) that is the only child of the parent node and that consists only 219 | // of whitespace is added to the DOM tree. 220 | // This flag is off by default; turning it on may result in slower parsing and more memory consumption. 221 | const unsigned int parse_ws_pcdata_single = 0x0400; 222 | 223 | // This flag determines if leading and trailing whitespace is to be removed from plain character data. This flag is off by default. 224 | const unsigned int parse_trim_pcdata = 0x0800; 225 | 226 | // This flag determines if plain character data that does not have a parent node is added to the DOM tree, and if an empty document 227 | // is a valid document. This flag is off by default. 228 | const unsigned int parse_fragment = 0x1000; 229 | 230 | // This flag determines if plain character data is stored in the parent element's value. This significantly changes the structure of 231 | // the document; this flag is only recommended for parsing documents with many PCDATA nodes in memory-constrained environments. 232 | // This flag is off by default. 233 | const unsigned int parse_embed_pcdata = 0x2000; 234 | 235 | // This flag determines whether two pcdata nodes should be merged or not, if no intermediate data are parsed between them in the document. 236 | // This flag is off by default. 237 | const unsigned int parse_merge_pcdata = 0x4000; 238 | 239 | // The default parsing mode. 240 | // Elements, PCDATA and CDATA sections are added to the DOM tree, character/reference entities are expanded, 241 | // End-of-Line characters are normalized, attribute values are normalized using CDATA normalization rules. 242 | const unsigned int parse_default = parse_cdata | parse_escapes | parse_wconv_attribute | parse_eol; 243 | 244 | // The full parsing mode.
245 | // Nodes of all types are added to the DOM tree, character/reference entities are expanded, 246 | // End-of-Line characters are normalized, attribute values are normalized using CDATA normalization rules. 247 | const unsigned int parse_full = parse_default | parse_pi | parse_comments | parse_declaration | parse_doctype; 248 | 249 | // These flags determine the encoding of input data for XML document 250 | enum xml_encoding 251 | { 252 | encoding_auto, // Auto-detect input encoding using BOM or < / [NOTE(review): the text between here and gutter line 328 is missing from this copy - restore the rest of enum xml_encoding and the declarations that followed it from the upstream header] template <typename It> class xml_object_range 328 | { 329 | public: 330 | typedef It const_iterator; 331 | typedef It iterator; 332 | 333 | xml_object_range(It b, It e): _begin(b), _end(e) 334 | { 335 | } 336 | 337 | It begin() const { return _begin; } 338 | It end() const { return _end; } 339 | 340 | bool empty() const { return _begin == _end; } 341 | 342 | private: 343 | It _begin, _end; 344 | }; 345 | 346 | // Writer interface for node printing (see xml_node::print) 347 | class PUGIXML_CLASS xml_writer 348 | { 349 | public: 350 | virtual ~xml_writer(); 351 | 352 | // Write memory chunk into stream/file/whatever 353 | virtual void write(const void* data, size_t size) = 0; 354 | }; 355 | 356 | // xml_writer implementation for FILE* 357 | class PUGIXML_CLASS xml_writer_file: public xml_writer 358 | { 359 | public: 360 | // Construct writer from a FILE* object; void* is used to avoid header dependencies on stdio 361 | xml_writer_file(void* file); 362 | 363 | virtual void write(const void* data, size_t size) PUGIXML_OVERRIDE; 364 | 365 | private: 366 | void* file; 367 | }; 368 | 369 | #ifndef PUGIXML_NO_STL 370 | // xml_writer implementation for streams 371 | class PUGIXML_CLASS xml_writer_stream: public xml_writer 372 | { 373 | public: 374 | // Construct writer from an output stream object 375 | xml_writer_stream(std::basic_ostream<char>& stream); 376 | xml_writer_stream(std::basic_ostream<wchar_t>& stream); 377 | 378 | virtual void write(const void* data, size_t size) PUGIXML_OVERRIDE; 379 | 380 |
private: 381 | std::basic_ostream<char>* narrow_stream; 382 | std::basic_ostream<wchar_t>* wide_stream; 383 | }; 384 | #endif 385 | 386 | // A light-weight handle for manipulating attributes in DOM tree 387 | class PUGIXML_CLASS xml_attribute 388 | { 389 | friend class xml_attribute_iterator; 390 | friend class xml_node; 391 | 392 | private: 393 | xml_attribute_struct* _attr; 394 | 395 | typedef void (*unspecified_bool_type)(xml_attribute***); 396 | 397 | public: 398 | // Default constructor. Constructs an empty attribute. 399 | xml_attribute(); 400 | 401 | // Constructs attribute from internal pointer 402 | explicit xml_attribute(xml_attribute_struct* attr); 403 | 404 | // Safe bool conversion operator 405 | operator unspecified_bool_type() const; 406 | 407 | // Borland C++ workaround 408 | bool operator!() const; 409 | 410 | // Comparison operators (compares wrapped attribute pointers) 411 | bool operator==(const xml_attribute& r) const; 412 | bool operator!=(const xml_attribute& r) const; 413 | bool operator<(const xml_attribute& r) const; 414 | bool operator>(const xml_attribute& r) const; 415 | bool operator<=(const xml_attribute& r) const; 416 | bool operator>=(const xml_attribute& r) const; 417 | 418 | // Check if attribute is empty (null) 419 | bool empty() const; 420 | 421 | // Get attribute name/value, or "" if attribute is empty 422 | const char_t* name() const; 423 | const char_t* value() const; 424 | 425 | // Get attribute value, or the default value if attribute is empty 426 | const char_t* as_string(const char_t* def = PUGIXML_TEXT("")) const; 427 | 428 | // Get attribute value as a number, or the default value if conversion did not succeed or attribute is empty 429 | int as_int(int def = 0) const; 430 | unsigned int as_uint(unsigned int def = 0) const; 431 | double as_double(double def = 0) const; 432 | float as_float(float def = 0) const; 433 | 434 | #ifdef PUGIXML_HAS_LONG_LONG 435 | long long as_llong(long long def = 0) const; 436 | unsigned long long
as_ullong(unsigned long long def = 0) const; 437 | #endif 438 | 439 | // Get attribute value as bool (returns true if first character is in '1tTyY' set), or the default value if attribute is empty 440 | bool as_bool(bool def = false) const; 441 | 442 | // Set attribute name/value (returns false if attribute is empty or there is not enough memory) 443 | bool set_name(const char_t* rhs); 444 | bool set_name(const char_t* rhs, size_t size); 445 | #ifdef PUGIXML_HAS_STRING_VIEW 446 | bool set_name(string_view_t rhs); 447 | #endif 448 | bool set_value(const char_t* rhs); 449 | bool set_value(const char_t* rhs, size_t size); 450 | #ifdef PUGIXML_HAS_STRING_VIEW 451 | bool set_value(string_view_t rhs); 452 | #endif 453 | 454 | // Set attribute value with type conversion (numbers are converted to strings, boolean is converted to "true"/"false") 455 | bool set_value(int rhs); 456 | bool set_value(unsigned int rhs); 457 | bool set_value(long rhs); 458 | bool set_value(unsigned long rhs); 459 | bool set_value(double rhs); 460 | bool set_value(double rhs, int precision); 461 | bool set_value(float rhs); 462 | bool set_value(float rhs, int precision); 463 | bool set_value(bool rhs); 464 | 465 | #ifdef PUGIXML_HAS_LONG_LONG 466 | bool set_value(long long rhs); 467 | bool set_value(unsigned long long rhs); 468 | #endif 469 | 470 | // Set attribute value (equivalent to set_value without error checking) 471 | xml_attribute& operator=(const char_t* rhs); 472 | xml_attribute& operator=(int rhs); 473 | xml_attribute& operator=(unsigned int rhs); 474 | xml_attribute& operator=(long rhs); 475 | xml_attribute& operator=(unsigned long rhs); 476 | xml_attribute& operator=(double rhs); 477 | xml_attribute& operator=(float rhs); 478 | xml_attribute& operator=(bool rhs); 479 | 480 | #ifdef PUGIXML_HAS_STRING_VIEW 481 | xml_attribute& operator=(string_view_t rhs); 482 | #endif 483 | 484 | #ifdef PUGIXML_HAS_LONG_LONG 485 | xml_attribute& operator=(long long rhs); 486 | xml_attribute& 
operator=(unsigned long long rhs); 487 | #endif 488 | 489 | // Get next/previous attribute in the attribute list of the parent node 490 | xml_attribute next_attribute() const; 491 | xml_attribute previous_attribute() const; 492 | 493 | // Get hash value (unique for handles to the same object) 494 | size_t hash_value() const; 495 | 496 | // Get internal pointer 497 | xml_attribute_struct* internal_object() const; 498 | }; 499 | 500 | #ifdef __BORLANDC__ 501 | // Borland C++ workaround 502 | bool PUGIXML_FUNCTION operator&&(const xml_attribute& lhs, bool rhs); 503 | bool PUGIXML_FUNCTION operator||(const xml_attribute& lhs, bool rhs); 504 | #endif 505 | 506 | // A light-weight handle for manipulating nodes in DOM tree 507 | class PUGIXML_CLASS xml_node 508 | { 509 | friend class xml_attribute_iterator; 510 | friend class xml_node_iterator; 511 | friend class xml_named_node_iterator; 512 | 513 | protected: 514 | xml_node_struct* _root; 515 | 516 | typedef void (*unspecified_bool_type)(xml_node***); 517 | 518 | public: 519 | // Default constructor. Constructs an empty node. 
520 | xml_node(); 521 | 522 | // Constructs node from internal pointer 523 | explicit xml_node(xml_node_struct* p); 524 | 525 | // Safe bool conversion operator 526 | operator unspecified_bool_type() const; 527 | 528 | // Borland C++ workaround 529 | bool operator!() const; 530 | 531 | // Comparison operators (compares wrapped node pointers) 532 | bool operator==(const xml_node& r) const; 533 | bool operator!=(const xml_node& r) const; 534 | bool operator<(const xml_node& r) const; 535 | bool operator>(const xml_node& r) const; 536 | bool operator<=(const xml_node& r) const; 537 | bool operator>=(const xml_node& r) const; 538 | 539 | // Check if node is empty (null) 540 | bool empty() const; 541 | 542 | // Get node type 543 | xml_node_type type() const; 544 | 545 | // Get node name, or "" if node is empty or it has no name 546 | const char_t* name() const; 547 | 548 | // Get node value, or "" if node is empty or it has no value 549 | // Note: for an element that contains text, node.value() does not return that text! Use child_value() or text() methods to access text inside nodes.
550 | const char_t* value() const; 551 | 552 | // Get attribute list 553 | xml_attribute first_attribute() const; 554 | xml_attribute last_attribute() const; 555 | 556 | // Get children list 557 | xml_node first_child() const; 558 | xml_node last_child() const; 559 | 560 | // Get next/previous sibling in the children list of the parent node 561 | xml_node next_sibling() const; 562 | xml_node previous_sibling() const; 563 | 564 | // Get parent node 565 | xml_node parent() const; 566 | 567 | // Get root of DOM tree this node belongs to 568 | xml_node root() const; 569 | 570 | // Get text object for the current node 571 | xml_text text() const; 572 | 573 | // Get child, attribute or next/previous sibling with the specified name 574 | xml_node child(const char_t* name) const; 575 | xml_attribute attribute(const char_t* name) const; 576 | xml_node next_sibling(const char_t* name) const; 577 | xml_node previous_sibling(const char_t* name) const; 578 | #ifdef PUGIXML_HAS_STRING_VIEW 579 | xml_node child(string_view_t name) const; 580 | xml_attribute attribute(string_view_t name) const; 581 | xml_node next_sibling(string_view_t name) const; 582 | xml_node previous_sibling(string_view_t name) const; 583 | #endif 584 | 585 | // Get attribute, starting the search from a hint (and updating hint so that searching for a sequence of attributes is fast) 586 | xml_attribute attribute(const char_t* name, xml_attribute& hint) const; 587 | #ifdef PUGIXML_HAS_STRING_VIEW 588 | xml_attribute attribute(string_view_t name, xml_attribute& hint) const; 589 | #endif 590 | 591 | // Get child value of current node; that is, value of the first child node of type PCDATA/CDATA 592 | const char_t* child_value() const; 593 | 594 | // Get child value of child with specified name. Equivalent to child(name).child_value(). 
595 | const char_t* child_value(const char_t* name) const; 596 | 597 | // Set node name/value (returns false if node is empty, there is not enough memory, or node can not have name/value) 598 | bool set_name(const char_t* rhs); 599 | bool set_name(const char_t* rhs, size_t size); 600 | #ifdef PUGIXML_HAS_STRING_VIEW 601 | bool set_name(string_view_t rhs); 602 | #endif 603 | bool set_value(const char_t* rhs); 604 | bool set_value(const char_t* rhs, size_t size); 605 | #ifdef PUGIXML_HAS_STRING_VIEW 606 | bool set_value(string_view_t rhs); 607 | #endif 608 | 609 | // Add attribute with specified name. Returns added attribute, or empty attribute on errors. 610 | xml_attribute append_attribute(const char_t* name); 611 | xml_attribute prepend_attribute(const char_t* name); 612 | xml_attribute insert_attribute_after(const char_t* name, const xml_attribute& attr); 613 | xml_attribute insert_attribute_before(const char_t* name, const xml_attribute& attr); 614 | #ifdef PUGIXML_HAS_STRING_VIEW 615 | xml_attribute append_attribute(string_view_t name); 616 | xml_attribute prepend_attribute(string_view_t name); 617 | xml_attribute insert_attribute_after(string_view_t name, const xml_attribute& attr); 618 | xml_attribute insert_attribute_before(string_view_t name, const xml_attribute& attr); 619 | #endif 620 | 621 | // Add a copy of the specified attribute. Returns added attribute, or empty attribute on errors. 622 | xml_attribute append_copy(const xml_attribute& proto); 623 | xml_attribute prepend_copy(const xml_attribute& proto); 624 | xml_attribute insert_copy_after(const xml_attribute& proto, const xml_attribute& attr); 625 | xml_attribute insert_copy_before(const xml_attribute& proto, const xml_attribute& attr); 626 | 627 | // Add child node with specified type. Returns added node, or empty node on errors. 
628 | xml_node append_child(xml_node_type type = node_element); 629 | xml_node prepend_child(xml_node_type type = node_element); 630 | xml_node insert_child_after(xml_node_type type, const xml_node& node); 631 | xml_node insert_child_before(xml_node_type type, const xml_node& node); 632 | 633 | // Add child element with specified name. Returns added node, or empty node on errors. 634 | xml_node append_child(const char_t* name); 635 | xml_node prepend_child(const char_t* name); 636 | xml_node insert_child_after(const char_t* name, const xml_node& node); 637 | xml_node insert_child_before(const char_t* name, const xml_node& node); 638 | #ifdef PUGIXML_HAS_STRING_VIEW 639 | xml_node append_child(string_view_t name); 640 | xml_node prepend_child(string_view_t name); 641 | xml_node insert_child_after(string_view_t name, const xml_node& node); 642 | xml_node insert_child_before(string_view_t name, const xml_node& node); 643 | #endif 644 | 645 | // Add a copy of the specified node as a child. Returns added node, or empty node on errors. 646 | xml_node append_copy(const xml_node& proto); 647 | xml_node prepend_copy(const xml_node& proto); 648 | xml_node insert_copy_after(const xml_node& proto, const xml_node& node); 649 | xml_node insert_copy_before(const xml_node& proto, const xml_node& node); 650 | 651 | // Move the specified node to become a child of this node. Returns moved node, or empty node on errors.
652 | xml_node append_move(const xml_node& moved); 653 | xml_node prepend_move(const xml_node& moved); 654 | xml_node insert_move_after(const xml_node& moved, const xml_node& node); 655 | xml_node insert_move_before(const xml_node& moved, const xml_node& node); 656 | 657 | // Remove specified attribute 658 | bool remove_attribute(const xml_attribute& a); 659 | bool remove_attribute(const char_t* name); 660 | #ifdef PUGIXML_HAS_STRING_VIEW 661 | bool remove_attribute(string_view_t name); 662 | #endif 663 | 664 | // Remove all attributes 665 | bool remove_attributes(); 666 | 667 | // Remove specified child 668 | bool remove_child(const xml_node& n); 669 | bool remove_child(const char_t* name); 670 | #ifdef PUGIXML_HAS_STRING_VIEW 671 | bool remove_child(string_view_t name); 672 | #endif 673 | 674 | // Remove all children 675 | bool remove_children(); 676 | 677 | // Parses buffer as an XML document fragment and appends all nodes as children of the current node. 678 | // Copies/converts the buffer, so it may be deleted or changed after the function returns. 679 | // Note: append_buffer allocates memory that has the lifetime of the owning document; removing the appended nodes does not immediately reclaim that memory. 680 | xml_parse_result append_buffer(const void* contents, size_t size, unsigned int options = parse_default, xml_encoding encoding = encoding_auto); 681 | 682 | // Find attribute using predicate. Returns first attribute for which predicate returned true. 683 | template <typename Predicate> xml_attribute find_attribute(Predicate pred) const 684 | { 685 | if (!_root) return xml_attribute(); 686 | 687 | for (xml_attribute attrib = first_attribute(); attrib; attrib = attrib.next_attribute()) 688 | if (pred(attrib)) 689 | return attrib; 690 | 691 | return xml_attribute(); 692 | } 693 | 694 | // Find child node using predicate. Returns first child for which predicate returned true.
695 | template <typename Predicate> xml_node find_child(Predicate pred) const 696 | { 697 | if (!_root) return xml_node(); 698 | 699 | for (xml_node node = first_child(); node; node = node.next_sibling()) 700 | if (pred(node)) 701 | return node; 702 | 703 | return xml_node(); 704 | } 705 | 706 | // Find node from subtree using predicate. Returns first node from subtree (depth-first), for which predicate returned true. 707 | template <typename Predicate> xml_node find_node(Predicate pred) const 708 | { 709 | if (!_root) return xml_node(); 710 | 711 | xml_node cur = first_child(); 712 | 713 | while (cur._root && cur._root != _root) 714 | { 715 | if (pred(cur)) return cur; 716 | 717 | if (cur.first_child()) cur = cur.first_child(); 718 | else if (cur.next_sibling()) cur = cur.next_sibling(); 719 | else 720 | { 721 | while (!cur.next_sibling() && cur._root != _root) cur = cur.parent(); 722 | 723 | if (cur._root != _root) cur = cur.next_sibling(); 724 | } 725 | } 726 | 727 | return xml_node(); 728 | } 729 | 730 | // Find child node by attribute name/value 731 | xml_node find_child_by_attribute(const char_t* name, const char_t* attr_name, const char_t* attr_value) const; 732 | xml_node find_child_by_attribute(const char_t* attr_name, const char_t* attr_value) const; 733 | 734 | #ifndef PUGIXML_NO_STL 735 | // Get the absolute node path from root as a text string. 736 | string_t path(char_t delimiter = '/') const; 737 | #endif 738 | 739 | // Search for a node by path consisting of node names and . or .. elements. 740 | xml_node first_element_by_path(const char_t* path, char_t delimiter = '/') const; 741 | 742 | // Recursively traverse subtree with xml_tree_walker 743 | bool traverse(xml_tree_walker& walker); 744 | 745 | #ifndef PUGIXML_NO_XPATH 746 | // Select single node by evaluating XPath query. Returns first node from the resulting node set.
747 | xpath_node select_node(const char_t* query, xpath_variable_set* variables = PUGIXML_NULL) const; 748 | xpath_node select_node(const xpath_query& query) const; 749 | 750 | // Select node set by evaluating XPath query 751 | xpath_node_set select_nodes(const char_t* query, xpath_variable_set* variables = PUGIXML_NULL) const; 752 | xpath_node_set select_nodes(const xpath_query& query) const; 753 | 754 | // (deprecated: use select_node instead) Select single node by evaluating XPath query. 755 | PUGIXML_DEPRECATED xpath_node select_single_node(const char_t* query, xpath_variable_set* variables = PUGIXML_NULL) const; 756 | PUGIXML_DEPRECATED xpath_node select_single_node(const xpath_query& query) const; 757 | 758 | #endif 759 | 760 | // Print subtree using a writer object 761 | void print(xml_writer& writer, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto, unsigned int depth = 0) const; 762 | 763 | #ifndef PUGIXML_NO_STL 764 | // Print subtree to stream 765 | void print(std::basic_ostream<char>& os, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto, unsigned int depth = 0) const; 766 | void print(std::basic_ostream<wchar_t>& os, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, unsigned int depth = 0) const; 767 | #endif 768 | 769 | // Child nodes iterators 770 | typedef xml_node_iterator iterator; 771 | 772 | iterator begin() const; 773 | iterator end() const; 774 | 775 | // Attribute iterators 776 | typedef xml_attribute_iterator attribute_iterator; 777 | 778 | attribute_iterator attributes_begin() const; 779 | attribute_iterator attributes_end() const; 780 | 781 | // Range-based for support 782 | xml_object_range<xml_node_iterator> children() const; 783 | xml_object_range<xml_attribute_iterator> attributes() const; 784 | 785 | // Range-based for support for all children with the specified name 786 | // Note: name pointer must have a longer lifetime than
the returned object; be careful with passing temporaries! 787 | xml_object_range<xml_named_node_iterator> children(const char_t* name) const; 788 | 789 | // Get node offset in parsed file/string (in char_t units) for debugging purposes 790 | ptrdiff_t offset_debug() const; 791 | 792 | // Get hash value (unique for handles to the same object) 793 | size_t hash_value() const; 794 | 795 | // Get internal pointer 796 | xml_node_struct* internal_object() const; 797 | }; 798 | 799 | #ifdef __BORLANDC__ 800 | // Borland C++ workaround 801 | bool PUGIXML_FUNCTION operator&&(const xml_node& lhs, bool rhs); 802 | bool PUGIXML_FUNCTION operator||(const xml_node& lhs, bool rhs); 803 | #endif 804 | 805 | // A helper for working with text inside PCDATA nodes 806 | class PUGIXML_CLASS xml_text 807 | { 808 | friend class xml_node; 809 | 810 | xml_node_struct* _root; 811 | 812 | typedef void (*unspecified_bool_type)(xml_text***); 813 | 814 | explicit xml_text(xml_node_struct* root); 815 | 816 | xml_node_struct* _data_new(); 817 | xml_node_struct* _data() const; 818 | 819 | public: 820 | // Default constructor. Constructs an empty object.
821 | xml_text(); 822 | 823 | // Safe bool conversion operator 824 | operator unspecified_bool_type() const; 825 | 826 | // Borland C++ workaround 827 | bool operator!() const; 828 | 829 | // Check if text object is empty (null) 830 | bool empty() const; 831 | 832 | // Get text, or "" if object is empty 833 | const char_t* get() const; 834 | 835 | // Get text, or the default value if object is empty 836 | const char_t* as_string(const char_t* def = PUGIXML_TEXT("")) const; 837 | 838 | // Get text as a number, or the default value if conversion did not succeed or object is empty 839 | int as_int(int def = 0) const; 840 | unsigned int as_uint(unsigned int def = 0) const; 841 | double as_double(double def = 0) const; 842 | float as_float(float def = 0) const; 843 | 844 | #ifdef PUGIXML_HAS_LONG_LONG 845 | long long as_llong(long long def = 0) const; 846 | unsigned long long as_ullong(unsigned long long def = 0) const; 847 | #endif 848 | 849 | // Get text as bool (returns true if first character is in '1tTyY' set), or the default value if object is empty 850 | bool as_bool(bool def = false) const; 851 | 852 | // Set text (returns false if object is empty or there is not enough memory) 853 | bool set(const char_t* rhs); 854 | bool set(const char_t* rhs, size_t size); 855 | #ifdef PUGIXML_HAS_STRING_VIEW 856 | bool set(string_view_t rhs); 857 | #endif 858 | 859 | // Set text with type conversion (numbers are converted to strings, boolean is converted to "true"/"false") 860 | bool set(int rhs); 861 | bool set(unsigned int rhs); 862 | bool set(long rhs); 863 | bool set(unsigned long rhs); 864 | bool set(double rhs); 865 | bool set(double rhs, int precision); 866 | bool set(float rhs); 867 | bool set(float rhs, int precision); 868 | bool set(bool rhs); 869 | 870 | #ifdef PUGIXML_HAS_LONG_LONG 871 | bool set(long long rhs); 872 | bool set(unsigned long long rhs); 873 | #endif 874 | 875 | // Set text (equivalent to set without error checking) 876 | xml_text& operator=(const 
char_t* rhs); 877 | xml_text& operator=(int rhs); 878 | xml_text& operator=(unsigned int rhs); 879 | xml_text& operator=(long rhs); 880 | xml_text& operator=(unsigned long rhs); 881 | xml_text& operator=(double rhs); 882 | xml_text& operator=(float rhs); 883 | xml_text& operator=(bool rhs); 884 | 885 | #ifdef PUGIXML_HAS_STRING_VIEW 886 | xml_text& operator=(string_view_t rhs); 887 | #endif 888 | 889 | #ifdef PUGIXML_HAS_LONG_LONG 890 | xml_text& operator=(long long rhs); 891 | xml_text& operator=(unsigned long long rhs); 892 | #endif 893 | 894 | // Get the data node (node_pcdata or node_cdata) for this object 895 | xml_node data() const; 896 | }; 897 | 898 | #ifdef __BORLANDC__ 899 | // Borland C++ workaround 900 | bool PUGIXML_FUNCTION operator&&(const xml_text& lhs, bool rhs); 901 | bool PUGIXML_FUNCTION operator||(const xml_text& lhs, bool rhs); 902 | #endif 903 | 904 | // Child node iterator (a bidirectional iterator over a collection of xml_node) 905 | class PUGIXML_CLASS xml_node_iterator 906 | { 907 | friend class xml_node; 908 | 909 | private: 910 | mutable xml_node _wrap; 911 | xml_node _parent; 912 | 913 | xml_node_iterator(xml_node_struct* ref, xml_node_struct* parent); 914 | 915 | public: 916 | // Iterator traits 917 | typedef ptrdiff_t difference_type; 918 | typedef xml_node value_type; 919 | typedef xml_node* pointer; 920 | typedef xml_node& reference; 921 | 922 | #ifndef PUGIXML_NO_STL 923 | typedef std::bidirectional_iterator_tag iterator_category; 924 | #endif 925 | 926 | // Default constructor 927 | xml_node_iterator(); 928 | 929 | // Construct an iterator which points to the specified node 930 | xml_node_iterator(const xml_node& node); 931 | 932 | // Iterator operators 933 | bool operator==(const xml_node_iterator& rhs) const; 934 | bool operator!=(const xml_node_iterator& rhs) const; 935 | 936 | xml_node& operator*() const; 937 | xml_node* operator->() const; 938 | 939 | xml_node_iterator& operator++(); 940 | xml_node_iterator operator++(int); 
941 | 942 | xml_node_iterator& operator--(); 943 | xml_node_iterator operator--(int); 944 | }; 945 | 946 | // Attribute iterator (a bidirectional iterator over a collection of xml_attribute) 947 | class PUGIXML_CLASS xml_attribute_iterator 948 | { 949 | friend class xml_node; 950 | 951 | private: 952 | mutable xml_attribute _wrap; 953 | xml_node _parent; 954 | 955 | xml_attribute_iterator(xml_attribute_struct* ref, xml_node_struct* parent); 956 | 957 | public: 958 | // Iterator traits 959 | typedef ptrdiff_t difference_type; 960 | typedef xml_attribute value_type; 961 | typedef xml_attribute* pointer; 962 | typedef xml_attribute& reference; 963 | 964 | #ifndef PUGIXML_NO_STL 965 | typedef std::bidirectional_iterator_tag iterator_category; 966 | #endif 967 | 968 | // Default constructor 969 | xml_attribute_iterator(); 970 | 971 | // Construct an iterator which points to the specified attribute 972 | xml_attribute_iterator(const xml_attribute& attr, const xml_node& parent); 973 | 974 | // Iterator operators 975 | bool operator==(const xml_attribute_iterator& rhs) const; 976 | bool operator!=(const xml_attribute_iterator& rhs) const; 977 | 978 | xml_attribute& operator*() const; 979 | xml_attribute* operator->() const; 980 | 981 | xml_attribute_iterator& operator++(); 982 | xml_attribute_iterator operator++(int); 983 | 984 | xml_attribute_iterator& operator--(); 985 | xml_attribute_iterator operator--(int); 986 | }; 987 | 988 | // Named node range helper 989 | class PUGIXML_CLASS xml_named_node_iterator 990 | { 991 | friend class xml_node; 992 | 993 | public: 994 | // Iterator traits 995 | typedef ptrdiff_t difference_type; 996 | typedef xml_node value_type; 997 | typedef xml_node* pointer; 998 | typedef xml_node& reference; 999 | 1000 | #ifndef PUGIXML_NO_STL 1001 | typedef std::bidirectional_iterator_tag iterator_category; 1002 | #endif 1003 | 1004 | // Default constructor 1005 | xml_named_node_iterator(); 1006 | 1007 | // Construct an iterator which points to the 
specified node 1008 | // Note: name pointer is stored in the iterator and must have a longer lifetime than iterator itself 1009 | xml_named_node_iterator(const xml_node& node, const char_t* name); 1010 | 1011 | // Iterator operators 1012 | bool operator==(const xml_named_node_iterator& rhs) const; 1013 | bool operator!=(const xml_named_node_iterator& rhs) const; 1014 | 1015 | xml_node& operator*() const; 1016 | xml_node* operator->() const; 1017 | 1018 | xml_named_node_iterator& operator++(); 1019 | xml_named_node_iterator operator++(int); 1020 | 1021 | xml_named_node_iterator& operator--(); 1022 | xml_named_node_iterator operator--(int); 1023 | 1024 | private: 1025 | mutable xml_node _wrap; 1026 | xml_node _parent; 1027 | const char_t* _name; 1028 | 1029 | xml_named_node_iterator(xml_node_struct* ref, xml_node_struct* parent, const char_t* name); 1030 | }; 1031 | 1032 | // Abstract tree walker class (see xml_node::traverse) 1033 | class PUGIXML_CLASS xml_tree_walker 1034 | { 1035 | friend class xml_node; 1036 | 1037 | private: 1038 | int _depth; 1039 | 1040 | protected: 1041 | // Get current traversal depth 1042 | int depth() const; 1043 | 1044 | public: 1045 | xml_tree_walker(); 1046 | virtual ~xml_tree_walker(); 1047 | 1048 | // Callback that is called when traversal begins 1049 | virtual bool begin(xml_node& node); 1050 | 1051 | // Callback that is called for each node traversed 1052 | virtual bool for_each(xml_node& node) = 0; 1053 | 1054 | // Callback that is called when traversal ends 1055 | virtual bool end(xml_node& node); 1056 | }; 1057 | 1058 | // Parsing status, returned as part of xml_parse_result object 1059 | enum xml_parse_status 1060 | { 1061 | status_ok = 0, // No error 1062 | 1063 | status_file_not_found, // File was not found during load_file() 1064 | status_io_error, // Error reading from file/stream 1065 | status_out_of_memory, // Could not allocate memory 1066 | status_internal_error, // Internal error occurred 1067 | 1068 | 
status_unrecognized_tag, // Parser could not determine tag type 1069 | 1070 | status_bad_pi, // Parsing error occurred while parsing document declaration/processing instruction 1071 | status_bad_comment, // Parsing error occurred while parsing comment 1072 | status_bad_cdata, // Parsing error occurred while parsing CDATA section 1073 | status_bad_doctype, // Parsing error occurred while parsing document type declaration 1074 | status_bad_pcdata, // Parsing error occurred while parsing PCDATA section 1075 | status_bad_start_element, // Parsing error occurred while parsing start element tag 1076 | status_bad_attribute, // Parsing error occurred while parsing element attribute 1077 | status_bad_end_element, // Parsing error occurred while parsing end element tag 1078 | status_end_element_mismatch,// There was a mismatch of start-end tags (closing tag had incorrect name, some tag was not closed or there was an excessive closing tag) 1079 | 1080 | status_append_invalid_root, // Unable to append nodes since root type is not node_element or node_document (exclusive to xml_node::append_buffer) 1081 | 1082 | status_no_document_element // Parsing resulted in a document without element nodes 1083 | }; 1084 | 1085 | // Parsing result 1086 | struct PUGIXML_CLASS xml_parse_result 1087 | { 1088 | // Parsing status (see xml_parse_status) 1089 | xml_parse_status status; 1090 | 1091 | // Last parsed offset (in char_t units from start of input data) 1092 | ptrdiff_t offset; 1093 | 1094 | // Source document encoding 1095 | xml_encoding encoding; 1096 | 1097 | // Default constructor, initializes object to failed state 1098 | xml_parse_result(); 1099 | 1100 | // Cast to bool operator 1101 | operator bool() const; 1102 | 1103 | // Get error description 1104 | const char* description() const; 1105 | }; 1106 | 1107 | // Document class (DOM tree root) 1108 | class PUGIXML_CLASS xml_document: public xml_node 1109 | { 1110 | private: 1111 | char_t* _buffer; 1112 | 1113 | char _memory[192]; 
1114 | 1115 | // Non-copyable semantics 1116 | xml_document(const xml_document&); 1117 | xml_document& operator=(const xml_document&); 1118 | 1119 | void _create(); 1120 | void _destroy(); 1121 | void _move(xml_document& rhs) PUGIXML_NOEXCEPT_IF_NOT_COMPACT; 1122 | 1123 | public: 1124 | // Default constructor, makes empty document 1125 | xml_document(); 1126 | 1127 | // Destructor, invalidates all node/attribute handles to this document 1128 | ~xml_document(); 1129 | 1130 | #ifdef PUGIXML_HAS_MOVE 1131 | // Move semantics support 1132 | xml_document(xml_document&& rhs) PUGIXML_NOEXCEPT_IF_NOT_COMPACT; 1133 | xml_document& operator=(xml_document&& rhs) PUGIXML_NOEXCEPT_IF_NOT_COMPACT; 1134 | #endif 1135 | 1136 | // Removes all nodes, leaving the empty document 1137 | void reset(); 1138 | 1139 | // Removes all nodes, then copies the entire contents of the specified document 1140 | void reset(const xml_document& proto); 1141 | 1142 | #ifndef PUGIXML_NO_STL 1143 | // Load document from stream. 1144 | xml_parse_result load(std::basic_istream<char>& stream, unsigned int options = parse_default, xml_encoding encoding = encoding_auto); 1145 | xml_parse_result load(std::basic_istream<wchar_t>& stream, unsigned int options = parse_default); 1146 | #endif 1147 | 1148 | // (deprecated: use load_string instead) Load document from zero-terminated string. No encoding conversions are applied. 1149 | PUGIXML_DEPRECATED xml_parse_result load(const char_t* contents, unsigned int options = parse_default); 1150 | 1151 | // Load document from zero-terminated string. No encoding conversions are applied.
1152 | xml_parse_result load_string(const char_t* contents, unsigned int options = parse_default); 1153 | 1154 | // Load document from file 1155 | xml_parse_result load_file(const char* path, unsigned int options = parse_default, xml_encoding encoding = encoding_auto); 1156 | xml_parse_result load_file(const wchar_t* path, unsigned int options = parse_default, xml_encoding encoding = encoding_auto); 1157 | 1158 | // Load document from buffer. Copies/converts the buffer, so it may be deleted or changed after the function returns. 1159 | xml_parse_result load_buffer(const void* contents, size_t size, unsigned int options = parse_default, xml_encoding encoding = encoding_auto); 1160 | 1161 | // Load document from buffer, using the buffer for in-place parsing (the buffer is modified and used for storage of document data). 1162 | // You should ensure that buffer data will persist throughout the document's lifetime, and free the buffer memory manually once document is destroyed. 1163 | xml_parse_result load_buffer_inplace(void* contents, size_t size, unsigned int options = parse_default, xml_encoding encoding = encoding_auto); 1164 | 1165 | // Load document from buffer, using the buffer for in-place parsing (the buffer is modified and used for storage of document data). 1166 | // You should allocate the buffer with pugixml allocation function; document will free the buffer when it is no longer needed (you can't use it anymore). 1167 | xml_parse_result load_buffer_inplace_own(void* contents, size_t size, unsigned int options = parse_default, xml_encoding encoding = encoding_auto); 1168 | 1169 | // Save XML document to writer (semantics is slightly different from xml_node::print, see documentation for details). 
1170 | void save(xml_writer& writer, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const; 1171 | 1172 | #ifndef PUGIXML_NO_STL 1173 | // Save XML document to stream (semantics is slightly different from xml_node::print, see documentation for details). 1174 | void save(std::basic_ostream& stream, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const; 1175 | void save(std::basic_ostream& stream, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default) const; 1176 | #endif 1177 | 1178 | // Save XML to file 1179 | bool save_file(const char* path, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const; 1180 | bool save_file(const wchar_t* path, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const; 1181 | 1182 | // Get document element 1183 | xml_node document_element() const; 1184 | }; 1185 | 1186 | #ifndef PUGIXML_NO_XPATH 1187 | // XPath query return type 1188 | enum xpath_value_type 1189 | { 1190 | xpath_type_none, // Unknown type (query failed to compile) 1191 | xpath_type_node_set, // Node set (xpath_node_set) 1192 | xpath_type_number, // Number 1193 | xpath_type_string, // String 1194 | xpath_type_boolean // Boolean 1195 | }; 1196 | 1197 | // XPath parsing result 1198 | struct PUGIXML_CLASS xpath_parse_result 1199 | { 1200 | // Error message (0 if no error) 1201 | const char* error; 1202 | 1203 | // Last parsed offset (in char_t units from string start) 1204 | ptrdiff_t offset; 1205 | 1206 | // Default constructor, initializes object to failed state 1207 | xpath_parse_result(); 1208 | 1209 | // Cast to bool operator 1210 | operator bool() const; 1211 | 1212 | // Get error description 1213 | const char* description() const; 1214 | }; 1215 | 1216 | // A single 
XPath variable 1217 | class PUGIXML_CLASS xpath_variable 1218 | { 1219 | friend class xpath_variable_set; 1220 | 1221 | protected: 1222 | xpath_value_type _type; 1223 | xpath_variable* _next; 1224 | 1225 | xpath_variable(xpath_value_type type); 1226 | 1227 | // Non-copyable semantics 1228 | xpath_variable(const xpath_variable&); 1229 | xpath_variable& operator=(const xpath_variable&); 1230 | 1231 | public: 1232 | // Get variable name 1233 | const char_t* name() const; 1234 | 1235 | // Get variable type 1236 | xpath_value_type type() const; 1237 | 1238 | // Get variable value; no type conversion is performed, default value (false, NaN, empty string, empty node set) is returned on type mismatch error 1239 | bool get_boolean() const; 1240 | double get_number() const; 1241 | const char_t* get_string() const; 1242 | const xpath_node_set& get_node_set() const; 1243 | 1244 | // Set variable value; no type conversion is performed, false is returned on type mismatch error 1245 | bool set(bool value); 1246 | bool set(double value); 1247 | bool set(const char_t* value); 1248 | bool set(const xpath_node_set& value); 1249 | }; 1250 | 1251 | // A set of XPath variables 1252 | class PUGIXML_CLASS xpath_variable_set 1253 | { 1254 | private: 1255 | xpath_variable* _data[64]; 1256 | 1257 | void _assign(const xpath_variable_set& rhs); 1258 | void _swap(xpath_variable_set& rhs); 1259 | 1260 | xpath_variable* _find(const char_t* name) const; 1261 | 1262 | static bool _clone(xpath_variable* var, xpath_variable** out_result); 1263 | static void _destroy(xpath_variable* var); 1264 | 1265 | public: 1266 | // Default constructor/destructor 1267 | xpath_variable_set(); 1268 | ~xpath_variable_set(); 1269 | 1270 | // Copy constructor/assignment operator 1271 | xpath_variable_set(const xpath_variable_set& rhs); 1272 | xpath_variable_set& operator=(const xpath_variable_set& rhs); 1273 | 1274 | #ifdef PUGIXML_HAS_MOVE 1275 | // Move semantics support 1276 | xpath_variable_set(xpath_variable_set&& 
rhs) PUGIXML_NOEXCEPT; 1277 | xpath_variable_set& operator=(xpath_variable_set&& rhs) PUGIXML_NOEXCEPT; 1278 | #endif 1279 | 1280 | // Add a new variable or get the existing one, if the types match 1281 | xpath_variable* add(const char_t* name, xpath_value_type type); 1282 | 1283 | // Set value of an existing variable; no type conversion is performed, false is returned if there is no such variable or if types mismatch 1284 | bool set(const char_t* name, bool value); 1285 | bool set(const char_t* name, double value); 1286 | bool set(const char_t* name, const char_t* value); 1287 | bool set(const char_t* name, const xpath_node_set& value); 1288 | 1289 | // Get existing variable by name 1290 | xpath_variable* get(const char_t* name); 1291 | const xpath_variable* get(const char_t* name) const; 1292 | }; 1293 | 1294 | // A compiled XPath query object 1295 | class PUGIXML_CLASS xpath_query 1296 | { 1297 | private: 1298 | void* _impl; 1299 | xpath_parse_result _result; 1300 | 1301 | typedef void (*unspecified_bool_type)(xpath_query***); 1302 | 1303 | // Non-copyable semantics 1304 | xpath_query(const xpath_query&); 1305 | xpath_query& operator=(const xpath_query&); 1306 | 1307 | public: 1308 | // Construct a compiled object from XPath expression. 1309 | // If PUGIXML_NO_EXCEPTIONS is not defined, throws xpath_exception on compilation errors. 1310 | explicit xpath_query(const char_t* query, xpath_variable_set* variables = PUGIXML_NULL); 1311 | 1312 | // Constructor 1313 | xpath_query(); 1314 | 1315 | // Destructor 1316 | ~xpath_query(); 1317 | 1318 | #ifdef PUGIXML_HAS_MOVE 1319 | // Move semantics support 1320 | xpath_query(xpath_query&& rhs) PUGIXML_NOEXCEPT; 1321 | xpath_query& operator=(xpath_query&& rhs) PUGIXML_NOEXCEPT; 1322 | #endif 1323 | 1324 | // Get query expression return type 1325 | xpath_value_type return_type() const; 1326 | 1327 | // Evaluate expression as boolean value in the specified context; performs type conversion if necessary. 
1328 | // If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors. 1329 | bool evaluate_boolean(const xpath_node& n) const; 1330 | 1331 | // Evaluate expression as double value in the specified context; performs type conversion if necessary. 1332 | // If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors. 1333 | double evaluate_number(const xpath_node& n) const; 1334 | 1335 | #ifndef PUGIXML_NO_STL 1336 | // Evaluate expression as string value in the specified context; performs type conversion if necessary. 1337 | // If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors. 1338 | string_t evaluate_string(const xpath_node& n) const; 1339 | #endif 1340 | 1341 | // Evaluate expression as string value in the specified context; performs type conversion if necessary. 1342 | // At most capacity characters are written to the destination buffer, full result size is returned (includes terminating zero). 1343 | // If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors. 1344 | // If PUGIXML_NO_EXCEPTIONS is defined, returns empty string instead. 1345 | size_t evaluate_string(char_t* buffer, size_t capacity, const xpath_node& n) const; 1346 | 1347 | // Evaluate expression as node set in the specified context. 1348 | // If PUGIXML_NO_EXCEPTIONS is not defined, throws xpath_exception on type mismatch and std::bad_alloc on out of memory errors. 1349 | // If PUGIXML_NO_EXCEPTIONS is defined, returns empty node set instead. 1350 | xpath_node_set evaluate_node_set(const xpath_node& n) const; 1351 | 1352 | // Evaluate expression as node set in the specified context. 1353 | // Return first node in document order, or empty node if node set is empty. 1354 | // If PUGIXML_NO_EXCEPTIONS is not defined, throws xpath_exception on type mismatch and std::bad_alloc on out of memory errors. 1355 | // If PUGIXML_NO_EXCEPTIONS is defined, returns empty node instead. 
1356 | xpath_node evaluate_node(const xpath_node& n) const; 1357 | 1358 | // Get parsing result (used to get compilation errors in PUGIXML_NO_EXCEPTIONS mode) 1359 | const xpath_parse_result& result() const; 1360 | 1361 | // Safe bool conversion operator 1362 | operator unspecified_bool_type() const; 1363 | 1364 | // Borland C++ workaround 1365 | bool operator!() const; 1366 | }; 1367 | 1368 | #ifndef PUGIXML_NO_EXCEPTIONS 1369 | #if defined(_MSC_VER) 1370 | // C4275 can be ignored in Visual C++ if you are deriving 1371 | // from a type in the Standard C++ Library 1372 | #pragma warning(push) 1373 | #pragma warning(disable: 4275) 1374 | #endif 1375 | // XPath exception class 1376 | class PUGIXML_CLASS xpath_exception: public std::exception 1377 | { 1378 | private: 1379 | xpath_parse_result _result; 1380 | 1381 | public: 1382 | // Construct exception from parse result 1383 | explicit xpath_exception(const xpath_parse_result& result); 1384 | 1385 | // Get error message 1386 | virtual const char* what() const PUGIXML_NOEXCEPT PUGIXML_OVERRIDE; 1387 | 1388 | // Get parse result 1389 | const xpath_parse_result& result() const; 1390 | }; 1391 | #if defined(_MSC_VER) 1392 | #pragma warning(pop) 1393 | #endif 1394 | #endif 1395 | 1396 | // XPath node class (either xml_node or xml_attribute) 1397 | class PUGIXML_CLASS xpath_node 1398 | { 1399 | private: 1400 | xml_node _node; 1401 | xml_attribute _attribute; 1402 | 1403 | typedef void (*unspecified_bool_type)(xpath_node***); 1404 | 1405 | public: 1406 | // Default constructor; constructs empty XPath node 1407 | xpath_node(); 1408 | 1409 | // Construct XPath node from XML node/attribute 1410 | xpath_node(const xml_node& node); 1411 | xpath_node(const xml_attribute& attribute, const xml_node& parent); 1412 | 1413 | // Get node/attribute, if any 1414 | xml_node node() const; 1415 | xml_attribute attribute() const; 1416 | 1417 | // Get parent of contained node/attribute 1418 | xml_node parent() const; 1419 | 1420 | // Safe bool 
conversion operator 1421 | operator unspecified_bool_type() const; 1422 | 1423 | // Borland C++ workaround 1424 | bool operator!() const; 1425 | 1426 | // Comparison operators 1427 | bool operator==(const xpath_node& n) const; 1428 | bool operator!=(const xpath_node& n) const; 1429 | }; 1430 | 1431 | #ifdef __BORLANDC__ 1432 | // Borland C++ workaround 1433 | bool PUGIXML_FUNCTION operator&&(const xpath_node& lhs, bool rhs); 1434 | bool PUGIXML_FUNCTION operator||(const xpath_node& lhs, bool rhs); 1435 | #endif 1436 | 1437 | // A fixed-size collection of XPath nodes 1438 | class PUGIXML_CLASS xpath_node_set 1439 | { 1440 | public: 1441 | // Collection type 1442 | enum type_t 1443 | { 1444 | type_unsorted, // Not ordered 1445 | type_sorted, // Sorted by document order (ascending) 1446 | type_sorted_reverse // Sorted by document order (descending) 1447 | }; 1448 | 1449 | // Constant iterator type 1450 | typedef const xpath_node* const_iterator; 1451 | 1452 | // We define non-constant iterator to be the same as constant iterator so that various generic algorithms (i.e. boost foreach) work 1453 | typedef const xpath_node* iterator; 1454 | 1455 | // Default constructor. Constructs empty set. 
1456 | xpath_node_set(); 1457 | 1458 | // Constructs a set from iterator range; data is not checked for duplicates and is not sorted according to provided type, so be careful 1459 | xpath_node_set(const_iterator begin, const_iterator end, type_t type = type_unsorted); 1460 | 1461 | // Destructor 1462 | ~xpath_node_set(); 1463 | 1464 | // Copy constructor/assignment operator 1465 | xpath_node_set(const xpath_node_set& ns); 1466 | xpath_node_set& operator=(const xpath_node_set& ns); 1467 | 1468 | #ifdef PUGIXML_HAS_MOVE 1469 | // Move semantics support 1470 | xpath_node_set(xpath_node_set&& rhs) PUGIXML_NOEXCEPT; 1471 | xpath_node_set& operator=(xpath_node_set&& rhs) PUGIXML_NOEXCEPT; 1472 | #endif 1473 | 1474 | // Get collection type 1475 | type_t type() const; 1476 | 1477 | // Get collection size 1478 | size_t size() const; 1479 | 1480 | // Indexing operator 1481 | const xpath_node& operator[](size_t index) const; 1482 | 1483 | // Collection iterators 1484 | const_iterator begin() const; 1485 | const_iterator end() const; 1486 | 1487 | // Sort the collection in ascending/descending order by document order 1488 | void sort(bool reverse = false); 1489 | 1490 | // Get first node in the collection by document order 1491 | xpath_node first() const; 1492 | 1493 | // Check if collection is empty 1494 | bool empty() const; 1495 | 1496 | private: 1497 | type_t _type; 1498 | 1499 | xpath_node _storage[1]; 1500 | 1501 | xpath_node* _begin; 1502 | xpath_node* _end; 1503 | 1504 | void _assign(const_iterator begin, const_iterator end, type_t type); 1505 | void _move(xpath_node_set& rhs) PUGIXML_NOEXCEPT; 1506 | }; 1507 | #endif 1508 | 1509 | #ifndef PUGIXML_NO_STL 1510 | // Convert wide string to UTF8 1511 | std::basic_string PUGIXML_FUNCTION as_utf8(const wchar_t* str); 1512 | std::basic_string PUGIXML_FUNCTION as_utf8(const std::basic_string& str); 1513 | 1514 | // Convert UTF8 to wide string 1515 | std::basic_string PUGIXML_FUNCTION as_wide(const char* str); 1516 | 
std::basic_string PUGIXML_FUNCTION as_wide(const std::basic_string& str); 1517 | #endif 1518 | 1519 | // Memory allocation function interface; returns pointer to allocated memory or NULL on failure 1520 | typedef void* (*allocation_function)(size_t size); 1521 | 1522 | // Memory deallocation function interface 1523 | typedef void (*deallocation_function)(void* ptr); 1524 | 1525 | // Override default memory management functions. All subsequent allocations/deallocations will be performed via supplied functions. 1526 | void PUGIXML_FUNCTION set_memory_management_functions(allocation_function allocate, deallocation_function deallocate); 1527 | 1528 | // Get current memory management functions 1529 | allocation_function PUGIXML_FUNCTION get_memory_allocation_function(); 1530 | deallocation_function PUGIXML_FUNCTION get_memory_deallocation_function(); 1531 | } 1532 | 1533 | #if !defined(PUGIXML_NO_STL) && (defined(_MSC_VER) || defined(__ICC)) 1534 | namespace std 1535 | { 1536 | // Workarounds for (non-standard) iterator category detection for older versions (MSVC7/IC8 and earlier) 1537 | std::bidirectional_iterator_tag PUGIXML_FUNCTION _Iter_cat(const pugi::xml_node_iterator&); 1538 | std::bidirectional_iterator_tag PUGIXML_FUNCTION _Iter_cat(const pugi::xml_attribute_iterator&); 1539 | std::bidirectional_iterator_tag PUGIXML_FUNCTION _Iter_cat(const pugi::xml_named_node_iterator&); 1540 | } 1541 | #endif 1542 | 1543 | #if !defined(PUGIXML_NO_STL) && defined(__SUNPRO_CC) 1544 | namespace std 1545 | { 1546 | // Workarounds for (non-standard) iterator category detection 1547 | std::bidirectional_iterator_tag PUGIXML_FUNCTION __iterator_category(const pugi::xml_node_iterator&); 1548 | std::bidirectional_iterator_tag PUGIXML_FUNCTION __iterator_category(const pugi::xml_attribute_iterator&); 1549 | std::bidirectional_iterator_tag PUGIXML_FUNCTION __iterator_category(const pugi::xml_named_node_iterator&); 1550 | } 1551 | #endif 1552 | 1553 | #endif 1554 | 1555 | // Make sure 
implementation is included in header-only mode 1556 | // Use macro expansion in #include to work around QMake (QTBUG-11923) 1557 | #if defined(PUGIXML_HEADER_ONLY) && !defined(PUGIXML_SOURCE) 1558 | # define PUGIXML_SOURCE "pugixml.cpp" 1559 | # include PUGIXML_SOURCE 1560 | #endif 1561 | 1562 | /** 1563 | * Copyright (c) 2006-2025 Arseny Kapoulkine 1564 | * 1565 | * Permission is hereby granted, free of charge, to any person 1566 | * obtaining a copy of this software and associated documentation 1567 | * files (the "Software"), to deal in the Software without 1568 | * restriction, including without limitation the rights to use, 1569 | * copy, modify, merge, publish, distribute, sublicense, and/or sell 1570 | * copies of the Software, and to permit persons to whom the 1571 | * Software is furnished to do so, subject to the following 1572 | * conditions: 1573 | * 1574 | * The above copyright notice and this permission notice shall be 1575 | * included in all copies or substantial portions of the Software. 1576 | * 1577 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 1578 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 1579 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 1580 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 1581 | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 1582 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 1583 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 1584 | * OTHER DEALINGS IN THE SOFTWARE. 
1585 | */ 1586 | -------------------------------------------------------------------------------- /c_src/utf8_cleanup.cc: -------------------------------------------------------------------------------- 1 | #include "utf8_cleanup.h" 2 | 3 | //code from pugixml.cpp 4 | //Copyright (C) 2006-2017, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com) 5 | //modified to strip some invalid unicode intervals 6 | 7 | struct utf8_writer 8 | { 9 | typedef uint8_t* value_type; 10 | 11 | static value_type low(value_type result, uint32_t ch) 12 | { 13 | // U+0000..U+007F 14 | if (ch < 0x80) 15 | { 16 | *result = static_cast(ch); 17 | return result + 1; 18 | } 19 | // U+0080..U+07FF 20 | else if (ch < 0x800) 21 | { 22 | result[0] = static_cast(0xC0 | (ch >> 6)); 23 | result[1] = static_cast(0x80 | (ch & 0x3F)); 24 | return result + 2; 25 | } 26 | // U+0800..U+FFFF (U+0800..U+FDCF, U+FDF0..U+FFFD) 27 | else if (ch < 0xfdd0 || (ch > 0xfdef && ch < 0xfffe)) 28 | { 29 | result[0] = static_cast(0xE0 | (ch >> 12)); 30 | result[1] = static_cast(0x80 | ((ch >> 6) & 0x3F)); 31 | result[2] = static_cast(0x80 | (ch & 0x3F)); 32 | return result + 3; 33 | } 34 | 35 | return result; 36 | } 37 | 38 | static value_type high(value_type result, uint32_t ch) 39 | { 40 | // U+10000..U+10FFFF 41 | result[0] = static_cast(0xF0 | (ch >> 18)); 42 | result[1] = static_cast(0x80 | ((ch >> 12) & 0x3F)); 43 | result[2] = static_cast(0x80 | ((ch >> 6) & 0x3F)); 44 | result[3] = static_cast(0x80 | (ch & 0x3F)); 45 | return result + 4; 46 | } 47 | 48 | static value_type any(value_type result, uint32_t ch) 49 | { 50 | return (ch < 0x10000) ? 
low(result, ch) : high(result, ch); 51 | } 52 | }; 53 | 54 | struct utf8_decoder 55 | { 56 | typedef uint8_t type; 57 | 58 | template static inline typename Traits::value_type process(const uint8_t* data, size_t size, typename Traits::value_type result, Traits) 59 | { 60 | const uint8_t utf8_byte_mask = 0x3f; 61 | 62 | while (size) 63 | { 64 | uint8_t lead = *data; 65 | 66 | // 0xxxxxxx -> U+0000..U+007F 67 | if (lead < 0x80) 68 | { 69 | result = Traits::low(result, lead); 70 | data += 1; 71 | size -= 1; 72 | 73 | // process aligned single-byte (ascii) blocks 74 | if ((reinterpret_cast(data) & 3) == 0) 75 | { 76 | // round-trip through void* to silence 'cast increases required alignment of target type' warnings 77 | while (size >= 4 && (*static_cast(static_cast(data)) & 0x80808080) == 0) 78 | { 79 | result = Traits::low(result, data[0]); 80 | result = Traits::low(result, data[1]); 81 | result = Traits::low(result, data[2]); 82 | result = Traits::low(result, data[3]); 83 | data += 4; 84 | size -= 4; 85 | } 86 | } 87 | } 88 | // 110xxxxx -> U+0080..U+07FF 89 | else if (static_cast(lead - 0xC0) < 0x20 && size >= 2 && (data[1] & 0xc0) == 0x80) 90 | { 91 | result = Traits::low(result, ((lead & ~0xC0) << 6) | (data[1] & utf8_byte_mask)); 92 | data += 2; 93 | size -= 2; 94 | } 95 | // 1110xxxx -> U+0800-U+FFFF 96 | else if (static_cast(lead - 0xE0) < 0x10 && size >= 3 && (data[1] & 0xc0) == 0x80 && (data[2] & 0xc0) == 0x80) 97 | { 98 | result = Traits::low(result, ((lead & ~0xE0) << 12) | ((data[1] & utf8_byte_mask) << 6) | (data[2] & utf8_byte_mask)); 99 | data += 3; 100 | size -= 3; 101 | } 102 | // 11110xxx -> U+10000..U+10FFFF 103 | else if (static_cast(lead - 0xF0) < 0x08 && size >= 4 && (data[1] & 0xc0) == 0x80 && (data[2] & 0xc0) == 0x80 && (data[3] & 0xc0) == 0x80) 104 | { 105 | result = Traits::high(result, ((lead & ~0xF0) << 18) | ((data[1] & utf8_byte_mask) << 12) | ((data[2] & utf8_byte_mask) << 6) | (data[3] & utf8_byte_mask)); 106 | data += 4; 107 | size -= 
4; 108 | } 109 | // 10xxxxxx or 11111xxx -> invalid 110 | else 111 | { 112 | data += 1; 113 | size -= 1; 114 | } 115 | } 116 | 117 | return result; 118 | } 119 | }; 120 | 121 | size_t utf8_cleanup(char* buffer, size_t length) 122 | { 123 | uint8_t* obegin = reinterpret_cast(buffer); 124 | uint8_t* oend = utf8_decoder::process(obegin, length, obegin, utf8_writer()); 125 | return oend - obegin; 126 | } 127 | -------------------------------------------------------------------------------- /c_src/utf8_cleanup.h: -------------------------------------------------------------------------------- 1 | #ifndef ERLXML_C_SRC_UTF8_CLEANUP_H_ 2 | #define ERLXML_C_SRC_UTF8_CLEANUP_H_ 3 | 4 | #include 5 | #include 6 | 7 | size_t utf8_cleanup(char* buffer, size_t length); 8 | 9 | #endif 10 | -------------------------------------------------------------------------------- /c_src/xmlstreamparser.cc: -------------------------------------------------------------------------------- 1 | #include "xmlstreamparser.h" 2 | #include 3 | #include 4 | 5 | //http://pugixml.org/docs/manual.html 6 | //Limitations for stanza detection algorithm (streaming mode): 7 | // 1. not supporting cdata 8 | // 2. not supporting comments with special xml characters inside 9 | // 3. 
not supporting doctype 10 | 11 | const size_t kDefaultBufferSize = 1024; 12 | 13 | // whitespace (space \n \r \t) lookup table 14 | 15 | const uint8_t kLookupWhitespace[256] = { 16 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, // 0 17 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1 18 | 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2 19 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 3 20 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 4 21 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 5 22 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 6 23 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 7 24 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8 25 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9 26 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A 27 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B 28 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // C 29 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // D 30 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // E 31 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // F 32 | }; 33 | 34 | // match - ! ? and / . 
we don't increase the nested level for those in case they are before > 35 | // and also we ignore the stanzas that have only one element of this type (header or comment) 36 | 37 | const uint8_t kLookupSkipTag[256] = { 38 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //0 39 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //1 40 | 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, //2 41 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, //3 42 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //4 43 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //5 44 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //6 45 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //7 46 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //8 47 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //9 48 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //A 49 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //B 50 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //C 51 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //D 52 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //E 53 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //F 54 | }; 55 | 56 | XmlStreamParser::XmlStreamParser(size_t max_stanza, bool strip_invalid_utf8, XmlStartStreamHandler start_h, XmlEndStreamHandler end_h, XmlStreamElementHandler el_h) : 57 | process_root_(true), 58 | max_stanza_bytes_(max_stanza), 59 | strip_invalid_utf8_(strip_invalid_utf8), 60 | start_stream_handler_(start_h), 61 | end_stream_handler_(end_h), 62 | element_handler_(el_h), 63 | nested_level_(-1), 64 | last_start_tag_index_(-1), 65 | first_start_tag_index_(-1) 66 | { 67 | 68 | } 69 | 70 | XmlStreamParser::~XmlStreamParser() 71 | { 72 | 73 | } 74 | 75 | XmlStreamParser::parse_result XmlStreamParser::FeedData(const uint8_t* data, size_t length, void* user_data) 76 | { 77 | size_t last_index = buffer_.Length(); 78 | buffer_.WriteBytes(data, length); 79 | parse_result result = DoProcess(last_index, buffer_.Length(), user_data); 80 | 81 | if(result == 
kParseOk && buffer_.Capacity() > kDefaultBufferSize) 82 | buffer_.Resize(std::max(buffer_.Length(), kDefaultBufferSize)); 83 | 84 | return result; 85 | } 86 | 87 | XmlStreamParser::parse_result XmlStreamParser::DoProcess(size_t start, size_t end, void* user_data) 88 | { 89 | uint8_t* ptr = const_cast(buffer_.Data()); 90 | size_t max_end_position = max_stanza_bytes_ > 0 ? std::min(max_stanza_bytes_, end) : end; 91 | 92 | int64_t end_stanza_index = FindStanzaUpperLimit(ptr, start, max_end_position); 93 | 94 | if(end_stanza_index == -1) 95 | { 96 | Reset(false); 97 | return kParseInvalidXml; 98 | } 99 | 100 | size_t end_stanza_pos = static_cast(end_stanza_index); 101 | 102 | if(end_stanza_pos == max_end_position) 103 | { 104 | if(max_stanza_bytes_ && max_end_position == max_stanza_bytes_) 105 | { 106 | Reset(false); 107 | return kParseStanzaLimitHit; 108 | } 109 | 110 | if(nested_level_ == -1 && process_root_ == false) 111 | { 112 | //finished the stream 113 | end_stream_handler_(user_data, root_name_); 114 | Reset(true); 115 | return kParseOk; 116 | } 117 | 118 | return kParseOk; 119 | } 120 | 121 | end_stanza_pos++; 122 | 123 | if(!PushStanza(ptr, end_stanza_pos, user_data)) 124 | { 125 | Reset(false); 126 | return kParseInvalidXml; 127 | } 128 | 129 | buffer_.Consume(end_stanza_pos); 130 | 131 | size_t remaining = buffer_.Length(); 132 | 133 | if(!remaining) 134 | return kParseOk; 135 | 136 | return DoProcess(0, remaining, user_data); 137 | } 138 | 139 | int64_t XmlStreamParser::FindStanzaUpperLimit(const uint8_t* ptr, size_t start, size_t end) 140 | { 141 | size_t index = start; 142 | 143 | if(last_start_tag_index_ == -1) 144 | { 145 | while (index < end && kLookupWhitespace[ptr[index]]) 146 | index++; 147 | 148 | if(index < end && ptr[index] != '<') 149 | return -1; 150 | } 151 | 152 | for(; index < end; index++) 153 | { 154 | switch (ptr[index]) 155 | { 156 | case '<': 157 | 158 | if(first_start_tag_index_ == -1) 159 | first_start_tag_index_ = index; 160 | 161 | 
last_start_tag_index_ = index; 162 | break; 163 | 164 | case '>': 165 | 166 | if(last_start_tag_index_ == -1) 167 | return -1; 168 | 169 | if(ptr[last_start_tag_index_+1] == '/') 170 | { 171 | nested_level_--; 172 | } 173 | else 174 | { 175 | if(kLookupSkipTag[ptr[index - 1]] == 0) 176 | nested_level_++; 177 | } 178 | 179 | if(nested_level_ == 0) 180 | return index; 181 | 182 | break; 183 | } 184 | } 185 | 186 | return index; 187 | } 188 | 189 | bool XmlStreamParser::PushStanza(uint8_t* buffer, size_t length, void* user_data) 190 | { 191 | if(process_root_) 192 | return ProcessRootElement(buffer, length, user_data); 193 | 194 | // don't parse anything in case we have a header or comment as first element 195 | // for this reason we need to skip all spaces 196 | 197 | if(!kLookupSkipTag[buffer[first_start_tag_index_+1]]) 198 | { 199 | pugi::xml_parse_status result = pugi_doc_.load_buffer_inplace(buffer, length).status; 200 | 201 | if(result != pugi::status_ok) 202 | return false; 203 | 204 | element_handler_(user_data, pugi_doc_, strip_invalid_utf8_); 205 | } 206 | 207 | last_start_tag_index_ = -1; 208 | first_start_tag_index_ = -1; 209 | assert(nested_level_ == 0); 210 | return true; 211 | } 212 | 213 | bool XmlStreamParser::ProcessRootElement(uint8_t* buffer, size_t length, void* user_data) 214 | { 215 | if(!length) 216 | return false; 217 | 218 | ByteBuffer rootbuff(length+1); 219 | rootbuff.WriteBytes(buffer, length-1); 220 | rootbuff.WriteBytes(reinterpret_cast("/>"), 2); 221 | 222 | pugi::xml_parse_status result = pugi_doc_.load_buffer_inplace(const_cast(rootbuff.Data()), rootbuff.Length(), pugi::parse_default, pugi::encoding_utf8).status; 223 | 224 | if(result != pugi::status_ok) 225 | return false; 226 | 227 | if(!start_stream_handler_(user_data, pugi_doc_, strip_invalid_utf8_)) 228 | return false; 229 | 230 | //drop all bytes so far 231 | root_name_ = pugi_doc_.document_element().name(); 232 | process_root_ = false; 233 | last_start_tag_index_ = -1; 234 | 
first_start_tag_index_ = -1; 235 | 236 | return true; 237 | } 238 | 239 | void XmlStreamParser::Reset(bool cleanup) 240 | { 241 | if(cleanup) 242 | { 243 | buffer_.Clear(); 244 | buffer_.Resize(kDefaultBufferSize); 245 | } 246 | 247 | nested_level_ = -1; 248 | process_root_ = true; 249 | last_start_tag_index_ = -1; 250 | first_start_tag_index_ = -1; 251 | } 252 | -------------------------------------------------------------------------------- /c_src/xmlstreamparser.h: -------------------------------------------------------------------------------- 1 | #ifndef ERLXML_C_SRC_XMLSTREAMPARSER_H_ 2 | #define ERLXML_C_SRC_XMLSTREAMPARSER_H_ 3 | 4 | #include 5 | 6 | #include "pugixml.hpp" 7 | #include "bytebuffer.h" 8 | #include "macros.h" 9 | 10 | typedef bool (*XmlStartStreamHandler) (void* user_data, pugi::xml_document& doc, bool strip_non_utf8); 11 | typedef void (*XmlEndStreamHandler) (void* user_data, const std::string& name); 12 | typedef void (*XmlStreamElementHandler) (void* user_data, pugi::xml_document& doc, bool strip_non_utf8); 13 | 14 | class XmlStreamParser 15 | { 16 | public: 17 | 18 | enum parse_result { kParseOk = 0, kParseStanzaLimitHit, kParseInvalidXml }; 19 | 20 | XmlStreamParser(size_t max_stanza, bool strip_invalid_utf8, XmlStartStreamHandler start_h, XmlEndStreamHandler end_h, XmlStreamElementHandler el_h); 21 | ~XmlStreamParser(); 22 | 23 | parse_result FeedData(const uint8_t* data, size_t size, void* user_data); 24 | void Reset(bool cleanup); 25 | 26 | const ByteBuffer* GetBufferedData() {return &buffer_;} 27 | 28 | private: 29 | 30 | parse_result DoProcess(size_t start, size_t size, void* user_data); 31 | bool PushStanza(uint8_t* buffer, size_t length, void* user_data); 32 | int64_t FindStanzaUpperLimit(const uint8_t* ptr, size_t start, size_t size); 33 | bool ProcessRootElement(uint8_t* buffer, size_t length, void* user_data); 34 | 35 | bool process_root_; 36 | size_t max_stanza_bytes_; 37 | bool strip_invalid_utf8_; 38 | 39 | ByteBuffer 
buffer_; 40 | XmlStartStreamHandler start_stream_handler_; 41 | XmlEndStreamHandler end_stream_handler_; 42 | XmlStreamElementHandler element_handler_; 43 | int32_t nested_level_; 44 | int64_t last_start_tag_index_; 45 | int64_t first_start_tag_index_; 46 | pugi::xml_document pugi_doc_; 47 | std::string root_name_; 48 | }; 49 | 50 | #endif 51 | -------------------------------------------------------------------------------- /include/erlxml.hrl: -------------------------------------------------------------------------------- 1 | 2 | -author("silviu.caragea"). 3 | 4 | -type xmlattr() :: {binary(), binary()}. 5 | 6 | -record(xmlcdata, {content = [] :: iodata()}). 7 | -record(xmlel, {name :: binary(), attrs = [] :: [xmlattr()], children = [] :: [#xmlel{} | #xmlcdata{}]}). 8 | -record(xmlstreamstart, {name :: binary(), attrs = [] :: [xmlattr()]}). 9 | -record(xmlstreamend, {name :: binary()}). 10 | 11 | -type xmlterm() :: #xmlel{} | xmlattr() | #xmlcdata{}. 12 | -type erlxml_option():: {stanza_limit, non_neg_integer()}. 13 | -type reason() :: invalid_stanza | max_stanza_limit_hit | badarg | binary(). 14 | -------------------------------------------------------------------------------- /rebar.config: -------------------------------------------------------------------------------- 1 | {pre_hooks, [{"(linux|darwin)", compile, "make compile"}]}. 2 | {post_hooks, [{"(linux|darwin)", clean, "make clean"}]}. 3 | 4 | {artifacts, ["priv/erlxml_nif.so"]}. 5 | 6 | {project_plugins, [rebar3_hex]}. 7 | 8 | {erl_opts, [ 9 | warn_unused_vars, 10 | warn_shadow_vars, 11 | warn_unused_import, 12 | warn_unused_function, 13 | warn_bif_clash, 14 | warn_unused_record, 15 | warn_deprecated_function, 16 | warn_obsolete_guard, 17 | strict_validation, 18 | warn_export_vars, 19 | warn_exported_vars, 20 | warn_export_all, 21 | warnings_as_errors 22 | ]}. 23 | 24 | {cover_enabled, false}. 

%% Extra rebar3 profiles. The `bench` profile also compiles the sources under
%% benchmark/ and adds fast_xml and exml as dependencies.
{profiles, [
    {bench, [
        {src_dirs, ["src", "benchmark"]},
        {deps, [
            {fast_xml, ".*", {git, "https://github.com/processone/fast_xml.git", {tag, "1.1.55"}}},
            {exml, ".*", {git, "https://github.com/esl/exml.git", {tag, "3.4.1"}}}
        ]}
    ]}
]}.

--------------------------------------------------------------------------------
/rebar.lock:
--------------------------------------------------------------------------------
[].

--------------------------------------------------------------------------------
/src/erlxml.app.src:
--------------------------------------------------------------------------------
%% OTP application resource file for erlxml.
{application, erlxml, [
    {description, "erlxml - Erlang XML parsing library based on pugixml."},
    {licenses, ["MIT"]},
    {links,[{"Github","https://github.com/silviucpp/erlxml"}]},
    {vsn, "2.1.1"},
    {registered, []},
    {applications, [
        kernel,
        stdlib
    ]},
    {pkg_name, erlxml2},
    {env, []},
    %% Files shipped in the published package.
    {files, [
        "LICENSE*",
        "*.MD",
        "Makefile",
        "rebar.config",
        "rebar.lock",
        "include/*.hrl",
        "src/*.erl",
        "src/*.src",
        "benchmark/*.erl",
        "c_src/pugixml/*.hpp",
        "c_src/pugixml/*.cpp",
        "c_src/*.h",
        "c_src/*.cc",
        "c_src/Makefile",
        "c_src/nif.mk",
        "test/*.erl",
        "test/data/*.txt"
    ]}
]}.

--------------------------------------------------------------------------------
/src/erlxml.erl:
--------------------------------------------------------------------------------
%% Public API of erlxml. Every function here delegates to erlxml_nif.
-module(erlxml).
-author("silviu.caragea").

-include("erlxml.hrl").

-export([
    new_stream/0,
    new_stream/1,
    parse_stream/2,
    reset_stream/1,
    parse/1,
    to_binary/1
]).

%% Creates a streaming parser with an empty option list.
-spec new_stream() ->
    {ok, reference()} | {error, reason()}.

new_stream() ->
    new_stream([]).

-spec new_stream([erlxml_option()]) ->
    {ok, reference()} | {error, reason()}.

%% Creates a streaming parser configured by Options (delegates to the NIF module).
new_stream(Options) ->
    erlxml_nif:new_stream(Options).

%% Feeds Data to a streaming parser and returns the elements produced by this
%% call. The test suite matches failures shaped as {error, {Reason, Binary}}
%% (for example {error, {invalid_stanza, Bin}}), so that shape is listed here.
%% The 3-tuple form from the previous spec is kept so existing callers that
%% pattern-match on it still type-check.
-spec parse_stream(reference(), iolist() | binary()) ->
    {ok, [#xmlstreamstart{} | #xmlel{} | #xmlstreamend{}]} |
    {error, reason()} |
    {error, {reason(), binary()}} |
    {error, reason(), binary()}.

parse_stream(Parser, Data) ->
    erlxml_nif:chunk_feed_stream(Parser, Data).

%% Resets a streaming parser so it can be reused for a new stream.
-spec reset_stream(reference()) ->
    ok | {error, reason()}.

reset_stream(Parser) ->
    erlxml_nif:reset_stream(Parser).

%% Parses a complete XML document in one call.
-spec parse(iolist() | binary()) ->
    {ok, #xmlel{}} | {error, reason()}.

parse(Data) ->
    erlxml_nif:dom_parse(Data).

%% Serialises an #xmlel{} tree to a binary.
-spec to_binary(#xmlel{}) ->
    binary() | {error, reason()}.

to_binary(Data) ->
    erlxml_nif:to_binary(Data).

--------------------------------------------------------------------------------
/src/erlxml_nif.erl:
--------------------------------------------------------------------------------
-module(erlxml_nif).
-author("silviu.caragea").

-define(NOT_LOADED, not_loaded(?LINE)).
%% Maximum bytes passed to the NIF handler at once (20Kb)
-define(MAX_BYTES_TO_NIF, 20000).

-on_load(load_nif/0).

-export([
    new_stream/1,
    chunk_feed_stream/2,
    reset_stream/1,
    dom_parse/1,
    to_binary/1
]).

%% nif functions

%% on_load hook: loads the native library. Any result other than `ok` from
%% erlang:load_nif/2 raises badmatch.
load_nif() ->
    ok = erlang:load_nif(get_nif_library_path(), 0).

%% Path of the NIF library without extension. Uses the application's priv
%% directory when the code server knows the application, otherwise falls back
%% to a relative priv directory.
get_nif_library_path() ->
    case code:priv_dir(erlxml) of
        {error, bad_name} ->
            filename:join(fallback_priv_components() ++ [?MODULE]);
        Dir ->
            filename:join(Dir, ?MODULE)
    end.

%% Path components of the relative priv directory: "../priv" when that
%% directory exists, plain "priv" otherwise.
fallback_priv_components() ->
    case filelib:is_dir(filename:join(["..", priv])) of
        true ->
            ["..", priv];
        false ->
            [priv]
    end.

%% Raised by every stub below when the native implementation is not loaded.
not_loaded(Line) ->
    erlang:nif_error({not_loaded, [{module, ?MODULE}, {line, Line}]}).

new_stream(_Opts) ->
    ?NOT_LOADED.

feed_stream(_Parser, _Data) ->
    ?NOT_LOADED.

reset_stream(_Parser) ->
    ?NOT_LOADED.

dom_parse(_Data) ->
    ?NOT_LOADED.

to_binary(_Data) ->
    ?NOT_LOADED.

%% Feeds Data to the stream parser in slices of at most ?MAX_BYTES_TO_NIF bytes
%% per NIF call. Iolists are flattened to a binary first.
%% Returns {ok, Elements} with the elements of all slices in input order, or
%% the first non-ok result returned by feed_stream/2 (elements collected from
%% earlier slices of the same call are not returned in that case).
chunk_feed_stream(Parser, Data) when is_binary(Data) ->
    chunk_feed_stream(Parser, Data, byte_size(Data), null);
chunk_feed_stream(Parser, Data) ->
    chunk_feed_stream(Parser, iolist_to_binary(Data)).

%% Size is the number of bytes left in Data; Acc holds the elements gathered
%% so far (null before the first slice has been parsed).
chunk_feed_stream(Parser, Data, Size, Acc) when Size > ?MAX_BYTES_TO_NIF ->
    %% Peel off one maximum-size slice and keep the remainder for the next round.
    <<Chunk:?MAX_BYTES_TO_NIF/binary, Rest/binary>> = Data,
    case feed_stream(Parser, Chunk) of
        {ok, Elements} ->
            chunk_feed_stream(Parser, Rest, Size - ?MAX_BYTES_TO_NIF, aggregate_els(Acc, Elements));
        Error ->
            Error
    end;
chunk_feed_stream(Parser, Data, _Size, Acc) ->
    %% Last (or only) slice.
    case feed_stream(Parser, Data) of
        {ok, Elements} ->
            {ok, aggregate_els(Acc, Elements)};
        Error ->
            Error
    end.

%% Appends the elements of the newest slice after those already collected so
%% the combined list keeps the order in which the elements were parsed.
aggregate_els(null, Els) ->
    Els;
aggregate_els(Acc, Els) ->
    Acc ++ Els.

--------------------------------------------------------------------------------
/src/erlxml_utils.erl:
--------------------------------------------------------------------------------
%% Small helpers for navigating #xmlel{} trees.
-module(erlxml_utils).
-author("byron.wang").
-include("erlxml.hrl").

-export([
    cdata/1,
    subel/2,
    subel_cdata/2
]).

%% Returns the content of the first #xmlcdata{} child of the element, or <<>>
%% when there is none.
-spec cdata(#xmlel{}) ->
    binary().

cdata(#xmlel{children = Children}) ->
    case lists:keyfind(xmlcdata, 1, Children) of
        #xmlcdata{content = Content} -> Content;
        _ -> <<>>
    end.

%% Returns the first child element named Name, or undefined when no such
%% element exists. Only #xmlel{} children are considered, so a character-data
%% child whose content happens to equal Name can no longer hide a real match.
-spec subel(#xmlel{}, binary()) ->
    #xmlel{} | undefined.

subel(#xmlel{children = Children}, Name) ->
    find_subel(Children, Name).

%% Linear scan for the first #xmlel{} whose name equals Name.
find_subel([#xmlel{name = Name} = El | _], Name) ->
    El;
find_subel([_ | Rest], Name) ->
    find_subel(Rest, Name);
find_subel([], _Name) ->
    undefined.

%% Returns the character data of the first child element named Name, or
%% undefined when there is no such child.
-spec subel_cdata(#xmlel{}, binary()) ->
    binary() | undefined.

subel_cdata(#xmlel{} = Xml, Name) ->
    case subel(Xml, Name) of
        #xmlel{} = X -> cdata(X);
        _ -> undefined
    end.
37 | -------------------------------------------------------------------------------- /test/.gitignore: -------------------------------------------------------------------------------- 1 | *.beam 2 | -------------------------------------------------------------------------------- /test/data/invalid_token_EF_B7_90.txt: -------------------------------------------------------------------------------- 1 | 123﷐456 -------------------------------------------------------------------------------- /test/data/invalid_token_EF_B7_9F.txt: -------------------------------------------------------------------------------- 1 | 123﷟456 -------------------------------------------------------------------------------- /test/data/invalid_token_EF_B7_A4.txt: -------------------------------------------------------------------------------- 1 | 123﷤456 -------------------------------------------------------------------------------- /test/data/invalid_token_EF_B7_AF.txt: -------------------------------------------------------------------------------- 1 | 123﷯456 -------------------------------------------------------------------------------- /test/data/invalid_token_EF_BF_BE.txt: -------------------------------------------------------------------------------- 1 | 123￾456 -------------------------------------------------------------------------------- /test/data/invalid_token_EF_BF_BF.txt: -------------------------------------------------------------------------------- 1 | 123￿456 -------------------------------------------------------------------------------- /test/data/stream.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | PLAIN 5 | SCRAM-SHA-1 6 | 7 | 8 | zlib 9 | 10 | 11 | 12 | 13 | 14 | 15 | user1 16 | password1 17 | 18 | 19 | 20 | 21 | user2 22 | password2 23 | 24 | 25 | 26 | 27 | user3 28 | password3 29 | 30 | 31 | 32 | 33 | user4 34 | password4 35 | 36 | 37 | 38 | 39 | user5 40 | password5 41 | 42 | 43 | 44 | 45 | user6 46 | 
password6 47 | 48 | 49 | 50 | 51 | user7 52 | password7 53 | 54 | 55 | 56 | 57 | user8 58 | password8 59 | 60 | 61 | 62 | 63 | user9 64 | password9 65 | 66 | 67 | 68 | 69 | user10 70 | password10 71 | 72 | 73 | 74 | 75 | 76 | Hello, user2! 77 | 78 | 79 | Hello, user3! 80 | 81 | 82 | Hello, user4! 83 | 84 | 85 | Hello, user5! 86 | 87 | 88 | Hello, user6! 89 | 90 | 91 | Hello, user7! 92 | 93 | 94 | Hello, user8! 95 | 96 | 97 | Hello, user9! 98 | 99 | 100 | Hello, user10! 101 | 102 | 103 | Hello, user11! 104 | 105 | 106 | Hello, user12! 107 | 108 | 109 | Hello, user13! 110 | 111 | 112 | Hello, user14! 113 | 114 | 115 | Hello, user15! 116 | 117 | 118 | Hello, user16! 119 | 120 | 121 | Hello, user17! 122 | 123 | 124 | Hello, user18! 125 | 126 | 127 | Hello, user19! 128 | 129 | 130 | Hello, user20! 131 | 132 | 133 | Hello, user21! 134 | 135 | 136 | 137 | 138 | chat 139 | Available 140 | 141 | 142 | chat 143 | Available 144 | 145 | 146 | chat 147 | Available 148 | 149 | 150 | chat 151 | Available 152 | 153 | 154 | chat 155 | Available 156 | 157 | 158 | chat 159 | Available 160 | 161 | 162 | chat 163 | Available 164 | 165 | 166 | chat 167 | Available 168 | 169 | 170 | chat 171 | Available 172 | 173 | 174 | chat 175 | Available 176 | 177 | 178 | chat 179 | Available 180 | 181 | 182 | chat 183 | Available 184 | 185 | 186 | chat 187 | Available 188 | 189 | 190 | chat 191 | Available 192 | 193 | 194 | chat 195 | Available 196 | 197 | 198 | chat 199 | Available 200 | 201 | 202 | chat 203 | Available 204 | 205 | 206 | chat 207 | Available 208 | 209 | 210 | chat 211 | Available 212 | 213 | 214 | 215 | -------------------------------------------------------------------------------- /test/data/succeeded_C3_AF__C2_BF__C2_B0.txt: -------------------------------------------------------------------------------- 1 | 123ï¿°456 -------------------------------------------------------------------------------- /test/data/succeeded_C6_87.txt: 
-------------------------------------------------------------------------------- 1 | 123Ƈ456 -------------------------------------------------------------------------------- /test/data/succeeded_EF_B7_89.txt: -------------------------------------------------------------------------------- 1 | 123﷉456 -------------------------------------------------------------------------------- /test/data/succeeded_EF_B7_B0.txt: -------------------------------------------------------------------------------- 1 | 123ﷰ456 -------------------------------------------------------------------------------- /test/data/succeeded_EF_B8_80.txt: -------------------------------------------------------------------------------- 1 | 123︀456 -------------------------------------------------------------------------------- /test/data/succeeded_EF_BF_AE.txt: -------------------------------------------------------------------------------- 1 | 123○456 -------------------------------------------------------------------------------- /test/data/succeeded_F0_90_8C_88.txt: -------------------------------------------------------------------------------- 1 | 123𐌈456 -------------------------------------------------------------------------------- /test/integrity_test.erl: -------------------------------------------------------------------------------- 1 | -module(integrity_test). 2 | 3 | -include_lib("eunit/include/eunit.hrl"). 4 | 5 | -define(ROOT_DATA, "test/data"). 6 | 7 | bad_options_test() -> 8 | {error,{options,{unavailable_option,1}}} = erlxml:new_stream([{unavailable_option, 1}]), 9 | true. 10 | 11 | to_binary_ok_test() -> 12 | Xml = {xmlel,<<"foo">>, [{<<"attr1">>,<<"bar">>}], [{xmlcdata,<<"Some Value">>}]}, 13 | <<"Some Value">> = erlxml:to_binary(Xml), 14 | true. 15 | 16 | to_binary_error_test() -> 17 | Xml = {axmlel,<<"foo">>, [{<<"attr1">>,<<"bar">>}], [{xmlcdata,<<"Some Value">>}]}, 18 | {error, badarg} = erlxml:to_binary(Xml), 19 | true. 
20 | 21 | dom_parsing_ok_test() -> 22 | {ok,{xmlel,<<"foo">>, [{<<"attr1">>,<<"bar">>}], [{xmlcdata,<<"Some Value">>}]}} = 23 | erlxml:parse(<<"Some Value">>), 24 | true. 25 | 26 | dom_parsing_error_test() -> 27 | InvalidStaza = <<"Some Value>, 28 | {error,invalid_stanza} = erlxml:parse(InvalidStaza), 29 | true. 30 | 31 | stream_parsing_error_test() -> 32 | InvalidStaza = <<"foo attr1='bar'>Some Value>, 33 | {ok, Parser} = erlxml:new_stream(), 34 | {error, {invalid_stanza, InvalidStaza}} = erlxml:parse_stream(Parser, InvalidStaza), 35 | true. 36 | 37 | stream_parsing_invalid_stanza_start_error_test() -> 38 | {ok, Parser} = erlxml:new_stream(), 39 | {ok,[{xmlstreamstart,<<"stream">>,[]}]} = erlxml:parse_stream(Parser, <<"">>), 40 | {ok,[{xmlel,<<"tag1">>,[], [ 41 | {xmlel,<<"g">>,[],[{xmlcdata,<<"sss">>}]}]}]} = erlxml:parse_stream(Parser, <<" sss">>), 42 | {error,{invalid_stanza,<<" tag1">>}} = erlxml:parse_stream(Parser, <<" tag1">>), 43 | true. 44 | 45 | max_stanza_limit_hit_test() -> 46 | Data = <<"1">>, 47 | {ok, Parser} = erlxml:new_stream([{stanza_limit, 11}]), 48 | {ok, Parser2} = erlxml:new_stream([{stanza_limit, 12}]), 49 | {error, {max_stanza_limit_hit, <<"1">>}} = erlxml:parse_stream(Parser, Data), 50 | {ok, _} = erlxml:parse_stream(Parser2, Data), 51 | true. 52 | 53 | max_stanza_limit_hit_cdata_test() -> 54 | MaxLimit = 65536, 55 | Overflow = 1, 56 | 57 | Head = <<"">>, 58 | Tail = <<"">>, 59 | Body = binary:copy(<<"1">>, (MaxLimit - (byte_size(Head) + byte_size(Tail)))+Overflow), 60 | PendingBuffer = <">>, 61 | Stanza = <<"", PendingBuffer/binary>>, 62 | {ok, Parser} = erlxml:new_stream([{stanza_limit, MaxLimit}]), 63 | {error, {max_stanza_limit_hit, PendingBuffer}} = erlxml:parse_stream(Parser, Stanza), 64 | true. 
65 | 66 | chunks_test() -> 67 | Chunk1 = <<"\n\r >, 68 | Chunk2 = <<"\">Some Value">>, 69 | 70 | {ok, Parser} = erlxml:new_stream(), 71 | {ok,[{xmlstreamstart,<<"stream">>,[{<<"ss">>,<<"aa">>}]}]} = erlxml:parse_stream(Parser, Chunk1), 72 | {ok,[{xmlel,<<"foo">>, 73 | [{<<"attr1">>,<<"bar">>}], 74 | [{xmlcdata,<<"Some Value">>}]}, 75 | {xmlel,<<"el2">>,[{<<"ss">>,<<"asd">>}],[]}, 76 | {xmlstreamend,<<"stream">>}]} = erlxml:parse_stream(Parser, Chunk2), 77 | true. 78 | 79 | skip_header_and_comments_test() -> 80 | Data = <<" 81 | 82 | 83 | 84 | 1 85 | 2 86 | 3 87 | ">>, 88 | 89 | {ok, Parser} = erlxml:new_stream(), 90 | {ok,[{xmlstreamstart,<<"stream">>,[]}, 91 | {xmlel,<<"tag">>,[],[{xmlcdata,<<"1">>}]}, 92 | {xmlel,<<"tag">>,[],[{xmlcdata,<<"2">>}]}, 93 | {xmlel,<<"tag">>,[],[{xmlcdata,<<"3">>}]}, 94 | {xmlstreamend,<<"stream">>}]} = erlxml:parse_stream(Parser, Data), 95 | 96 | ok = erlxml:reset_stream(Parser), 97 | 98 | {ok,[{xmlstreamstart,<<"stream">>,[]}, 99 | {xmlel,<<"tag">>,[],[{xmlcdata,<<"1">>}]}, 100 | {xmlel,<<"tag">>,[],[{xmlcdata,<<"2">>}]}, 101 | {xmlel,<<"tag">>,[],[{xmlcdata,<<"3">>}]}, 102 | {xmlstreamend,<<"stream">>}]} = erlxml:parse_stream(Parser, binary_to_list(Data)), 103 | true. 104 | 105 | one_by_one_char_test() -> 106 | Data = <<" 107 | 108 | 109 | 110 | 1 111 | 2 112 | 3 113 | ">>, 114 | 115 | {ok, Parser} = erlxml:new_stream(), 116 | [{ok, _} = erlxml:parse_stream(Parser, [X]) || <> <= Data], 117 | true. 118 | 119 | strip_invalid_utf8_test() -> 120 | Data0 = <<"123🏇4567">>, 121 | Length = byte_size(Data0) -1, 122 | <> = Data0, 123 | Msg= <<"", Data/binary, "">>, 124 | {ok, Parser} = erlxml:new_stream([{strip_non_utf8, true}]), 125 | {ok,[{xmlstreamstart,<<"stream">>,[]}, 126 | {xmlel,<<"node">>, 127 | [{<<"a">>,<<"123456">>}], 128 | [{xmlcdata,<<"123456">>}]}, 129 | {xmlstreamend,<<"stream">>}]} = erlxml:parse_stream(Parser, Msg), 130 | true. 
131 | 132 | strip_invalid_token_EF_B7_9F_test() -> 133 | {ok, InvalidToken} = file:read_file(<>), 134 | true = test_strip_invalid_token(InvalidToken, <<"123456">>). 135 | 136 | strip_invalid_token_EF_B7_90_test() -> 137 | {ok, InvalidToken} = file:read_file(<>), 138 | true = test_strip_invalid_token(InvalidToken, <<"123456">>). 139 | 140 | strip_invalid_token_EF_B7_A4_test() -> 141 | {ok, InvalidToken} = file:read_file(<>), 142 | true = test_strip_invalid_token(InvalidToken, <<"123456">>). 143 | 144 | strip_invalid_token_EF_B7_AF_test() -> 145 | {ok, InvalidToken} = file:read_file(<>), 146 | true = test_strip_invalid_token(InvalidToken, <<"123456">>). 147 | 148 | strip_invalid_token_EF_BF_BE_test() -> 149 | {ok, InvalidToken} = file:read_file(<>), 150 | true = test_strip_invalid_token(InvalidToken, <<"123456">>). 151 | 152 | strip_invalid_token_EF_BF_BF_test() -> 153 | {ok, InvalidToken} = file:read_file(<>), 154 | true = test_strip_invalid_token(InvalidToken, <<"123456">>). 155 | 156 | succeeded_C3_AF__C2_BF__C2_B0_test() -> 157 | {ok, Token} = file:read_file(<>), 158 | true = test_strip_invalid_token(Token, Token). 159 | 160 | succeeded_C6_87_test() -> 161 | {ok, Token} = file:read_file(<>), 162 | true = test_strip_invalid_token(Token, Token). 163 | 164 | succeeded_EF_B7_89_test() -> 165 | {ok, Token} = file:read_file(<>), 166 | true = test_strip_invalid_token(Token, Token). 167 | 168 | succeeded_EF_B7_B0_test() -> 169 | {ok, Token} = file:read_file(<>), 170 | true = test_strip_invalid_token(Token, Token). 171 | 172 | succeeded_EF_B8_80_test() -> 173 | {ok, Token} = file:read_file(<>), 174 | true = test_strip_invalid_token(Token, Token). 175 | 176 | succeeded_EF_BF_AE_test() -> 177 | {ok, Token} = file:read_file(<>), 178 | true = test_strip_invalid_token(Token, Token). 179 | 180 | succeeded_F0_90_8C_88_test() -> 181 | {ok, Token} = file:read_file(<>), 182 | true = test_strip_invalid_token(Token, Token). 
183 | 184 | % internals 185 | 186 | test_strip_invalid_token(InvalidToken, ExpectedResult) -> 187 | Data = <<"", InvalidToken/binary,"">>, 188 | {ok, Parser} = erlxml:new_stream([{strip_non_utf8, true}]), 189 | {ok,[{xmlstreamstart,<<"stream">>,[]}]} = erlxml:parse_stream(Parser, <<"">>), 190 | {ok,[{xmlel,<<"iq">>, 191 | [{<<"xmlns">>,<<"namespace">>}], 192 | [{xmlel,<<"body">>,[],[{xmlcdata, ExpectedResult}]}]}]} = erlxml:parse_stream(Parser, Data), 193 | true. 194 | --------------------------------------------------------------------------------