├── .gitignore
├── .travis.yml
├── LICENSE
├── Makefile
├── README.MD
├── benchmark
├── benchmark.erl
└── benchmark_stream.erl
├── c_src
├── .gitattributes
├── .gitignore
├── Makefile
├── allocators.cc
├── allocators.h
├── bytebuffer.cc
├── bytebuffer.h
├── element_encoder.cc
├── element_encoder.h
├── erlxml.cc
├── erlxml.h
├── erlxml_nif.cc
├── erlxml_nif.h
├── macros.h
├── nif.mk
├── nif_utils.cc
├── nif_utils.h
├── pugixml
│ ├── .gitignore
│ ├── pugiconfig.hpp
│ ├── pugixml.cpp
│ └── pugixml.hpp
├── utf8_cleanup.cc
├── utf8_cleanup.h
├── xmlstreamparser.cc
└── xmlstreamparser.h
├── include
└── erlxml.hrl
├── rebar.config
├── rebar.lock
├── src
├── erlxml.app.src
├── erlxml.erl
├── erlxml_nif.erl
└── erlxml_utils.erl
└── test
├── .gitignore
├── data
├── invalid_token_EF_B7_90.txt
├── invalid_token_EF_B7_9F.txt
├── invalid_token_EF_B7_A4.txt
├── invalid_token_EF_B7_AF.txt
├── invalid_token_EF_BF_BE.txt
├── invalid_token_EF_BF_BF.txt
├── stream.txt
├── succeeded_C3_AF__C2_BF__C2_B0.txt
├── succeeded_C6_87.txt
├── succeeded_EF_B7_89.txt
├── succeeded_EF_B7_B0.txt
├── succeeded_EF_B8_80.txt
├── succeeded_EF_BF_AE.txt
└── succeeded_F0_90_8C_88.txt
└── integrity_test.erl
/.gitignore:
--------------------------------------------------------------------------------
1 | /.idea/
2 | /.rebar/
3 | /deps/
4 | /ebin/
5 | /erlxml_nif.xcodeproj/
6 | /priv/
7 | *.iml
8 | *.DS_Store
9 | /log/
10 | /DerivedData/
11 | /_build/
12 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 |
2 | language: erlang
3 |
4 | matrix:
5 |
6 | include:
7 | - os: linux
8 | dist: bionic
9 | otp_release: 25.3.2.6
10 |
11 | - os: linux
12 | dist: focal
13 | otp_release: 27.0
14 |
15 | - os: linux
16 | dist: jammy
17 | otp_release: 26.1.1
18 |
19 | - os: osx
20 | osx_image: xcode13.4
21 | language: generic
22 | env:
23 | - HOMEBREW_NO_INSTALLED_DEPENDENTS_CHECK=true
24 | - HOMEBREW_NO_INSTALL_UPGRADE=true
25 | - HOMEBREW_NO_INSTALL_CLEANUP=true
26 | - HOMEBREW_NO_AUTO_UPDATE=true
27 | cache:
28 | directories:
29 | - $HOME/Library/Caches/Homebrew
30 | - /usr/local/Homebrew
31 |
32 | before_script:
33 |
34 | - if [[ $TRAVIS_OS_NAME == osx ]]; then brew install --force-bottle erlang || true; fi
35 |
36 | - curl https://s3.amazonaws.com/rebar3/rebar3 --output rebar3 && chmod +x rebar3
37 |
# Run the unit tests as part of `script`: Travis ignores the exit status of
# `after_success` commands, so a failing eunit run there would never fail the build.
script:
  - ./rebar3 compile
  - ./rebar3 eunit
43 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2025 Silviu Caragea
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | REBAR ?= rebar3
2 |
# Neither target produces a file of the same name.
.PHONY: compile clean

# Build the NIF in c_src. $(MAKE) (instead of a literal `make`) lets the
# sub-make inherit command-line variables and flags such as -n or -k.
compile:
	@$(MAKE) V=0 -C c_src -j 8

# Remove the NIF build artifacts.
clean:
	@$(MAKE) -C c_src clean
8 |
9 | bench_encoding:
10 | $(REBAR) as bench compile
11 | erl -pa _build/bench/lib/*/ebin -noshell \
12 | -eval "lists:foreach( \
13 | fun(Module) -> \
14 | lists:foreach(fun(N) -> benchmark:bench_encoding(Module, 600000, N) end, [1, 5, 10]) \
15 | end, [erlxml, exml, fast_xml])" \
16 | -eval "init:stop()."
17 |
18 | bench_parsing:
19 | $(REBAR) as bench compile
20 | erl -pa _build/bench/lib/*/ebin -noshell \
21 | -eval "lists:foreach( \
22 | fun(Module) -> \
23 | lists:foreach(fun(N) -> benchmark:bench_parsing(Module, 600000, N) end, [1, 5, 10]) \
24 | end, [erlxml, exml, fast_xml])" \
25 | -eval "init:stop()."
26 |
27 | bench_streaming:
28 | $(REBAR) as bench compile
29 | erl -pa _build/bench/lib/*/ebin -noshell \
30 | -eval "lists:foreach( \
31 | fun(Module) -> \
32 | lists:foreach(fun(N) -> benchmark_stream:bench(Module, \"test/data/stream.txt\", 60000, N) end, [1, 5, 10]) \
33 | end, [erlxml, exml, fast_xml])" \
34 | -eval "init:stop()."
35 |
--------------------------------------------------------------------------------
/README.MD:
--------------------------------------------------------------------------------
1 | # erlxml
2 |
3 | *erlxml - Erlang XML parsing library based on pugixml*
4 |
5 | [Build Status](https://travis-ci.com/github/silviucpp/erlxml)
6 | [License](https://github.com/silviucpp/erlxml/blob/master/LICENSE)
7 | [Hex.pm](https://hex.pm/packages/erlxml2)
8 |
9 | # Implementation notes
10 |
11 | [pugixml][1] is the fastest DOM parser available in C++ based on the benchmarks available [here][2]. The streaming parser works by dividing the
12 | stream into independent stanzas, which are then processed using pugixml. While the splitting algorithm is quite fast, it is designed for simplicity,
13 | which currently imposes some limitations on the streaming mode:
14 |
15 | - Does not support `CDATA`
16 | - Does not support comments containing special XML characters
17 | - Does not support `DOCTYPE` declarations
18 |
19 | All of the above limitations apply only to streaming mode and not to DOM parsing mode.
20 |
21 | ### Getting started:
22 |
23 | ##### DOM parsing
24 |
25 | ```erlang
26 | erlxml:parse(<<"<foo attr1='bar'>Some Value</foo>">>).
27 | ```
28 |
29 | Which results in
30 |
31 | ```erlang
32 | {ok,{xmlel,<<"foo">>,
33 | [{<<"attr1">>,<<"bar">>}],
34 | [{xmlcdata,<<"Some Value">>}]}}
35 | ```
36 |
37 | ##### Generate an XML document from Erlang terms
38 |
39 | ```erlang
40 | Xml = {xmlel,<<"foo">>,
41 | [{<<"attr1">>,<<"bar">>}], % Attributes
42 | [{xmlcdata,<<"Some Value">>}] % Elements
43 | },
44 | erlxml:to_binary(Xml).
45 | ```
46 |
47 | Which results in
48 |
49 | ```erlang
50 | <<"<foo attr1=\"bar\">Some Value</foo>">>
51 | ```
52 |
53 | ##### Streaming parsing
54 |
55 | ```erlang
56 | Chunk1 = <<"<stream><foo attr1=\"bar">>,
57 | Chunk2 = <<"\">Some Value</foo></stream>">>,
58 | {ok, Parser} = erlxml:new_stream(),
59 | {ok,[{xmlstreamstart,<<"stream">>,[]}]} = erlxml:parse_stream(Parser, Chunk1),
60 | Rs = erlxml:parse_stream(Parser, Chunk2),
61 | {ok,[{xmlel,<<"foo">>,
62 | [{<<"attr1">>,<<"bar">>}],
63 | [{xmlcdata,<<"Some Value">>}]},
64 | {xmlstreamend,<<"stream">>}]} = Rs.
65 | ```
66 |
67 | ### Options
68 |
69 | When you create a stream using `new_stream/1` you can specify the following options:
70 |
71 | - `stanza_limit` - Specifies the maximum size a stanza can have. If the library parses more than this number of bytes
72 | without finding a complete stanza, it returns the error `{error, {max_stanza_limit_hit, binary()}}`. Example: `{stanza_limit, 65000}`. The default is 0, which means unlimited.
73 |
74 | - `strip_non_utf8` - Strips all invalid UTF-8 characters from attribute values and node values. These are considered
75 | user input and might contain malformed characters. Default is `false`.
76 |
77 | ### Benchmarks
78 |
79 | The benchmark code is inside the `benchmark` folder. Performance is compared against:
80 |
81 | - [exml][3] version used: 3.4.1
82 | - [fast_xml][4] version used: 1.1.55
83 |
84 | All tests run with three different concurrency levels (how many Erlang processes are spawned):
85 |
86 | - C1 (concurrency level 1)
87 | - C5 (concurrency level 5)
88 | - C10 (concurrency level 10)
89 |
90 | ##### DOM parsing
91 |
92 | Parse the same stanza defined in `benchmark/benchmark.erl` 600000 times:
93 |
94 | ```sh
95 | make bench_parsing
96 | ```
97 |
98 | | Library | C1 (ms) | C5 (ms) | C10 (ms) |
99 | |:----------:|:------------:|:---------:|:---------:|
100 | | erlxml | 1875.128 | 417.368 | 315.65 |
101 | | exml | 2417.334 | 578.226 | 407.516 |
102 | | fast_xml | 24159.517 | 5854.817 | 4007.837 |
103 |
104 | Note:
105 |
106 | - Starting with version 3.0.0, [exml][3] saw significant improvements by replacing Expat with RapidXML.
107 | - `erlxml` delivers the best performance, followed by `exml`, while `fast_xml` performs the worst (huge difference).
108 |
109 | ##### Generate an XML document from Erlang terms
110 |
111 | Encode the same Erlang term defined in `benchmark/benchmark.erl` 600000 times:
112 |
113 | ```sh
114 | make bench_encoding
115 | ```
116 |
117 | | Library | C1 (ms) | C5 (ms) | C10 (ms) |
118 | |:-----------:|:--------:|:-------:|:--------:|
119 | | `erlxml` | 1381.338 | 322.851 | 251.936 |
120 | | `exml` | 1333.54 | 301.625 | 234.295 |
121 | | `fast_xml` | 1019.238 | 238.676 | 198.69 |
122 |
123 | Note:
124 |
125 | - `fast_xml` delivers the best performance, followed by `exml`, and `erlxml` with almost the same performance.
126 | - `erlxml` improved encoding performance in version `2.1.0` by removing unnecessary memory copy and string length computing.
127 |
128 | ##### Streaming parsing
129 |
130 | The test is located in `benchmark/benchmark_stream.erl`. It loads all stanzas from `test/data/stream.txt` and runs the streaming parser over those stanzas 60000 times:
131 |
132 | ```sh
133 | make bench_streaming
134 | ```
135 |
136 | ```sh
137 | ### engine: erlxml concurrency: 1 -> 2337.112 ms 193.81 MB/sec total bytes processed: 452.96 MB
138 | ### engine: erlxml concurrency: 5 -> 598.737 ms 756.52 MB/sec total bytes processed: 452.96 MB
139 | ### engine: erlxml concurrency: 10 -> 407.379 ms 1.09 GB/sec total bytes processed: 452.96 MB
140 | ### engine: exml concurrency: 1 -> 11790.975 ms 38.42 MB/sec total bytes processed: 452.96 MB
141 | ### engine: exml concurrency: 5 -> 2552.339 ms 177.47 MB/sec total bytes processed: 452.96 MB
142 | ### engine: exml concurrency: 10 -> 1840.267 ms 246.14 MB/sec total bytes processed: 452.96 MB
143 | ### engine: fast_xml concurrency: 1 -> 22677.758 ms 19.97 MB/sec total bytes processed: 452.96 MB
144 | ### engine: fast_xml concurrency: 5 -> 5184.096 ms 87.37 MB/sec total bytes processed: 452.96 MB
145 | ### engine: fast_xml concurrency: 10 -> 3854.402 ms 117.52 MB/sec total bytes processed: 452.96 MB
146 | ```
147 |
148 | | Library | C1 (MB/s) | C5 (MB/s) | C10 (MB/s) |
149 | |:-----------:|:--------------:|:---------:|:----------:|
150 | | erlxml | 193.81 | 756.52 | 1090 |
151 | | exml | 38.42 | 177.47 | 246 |
152 | | fast_xml | 19.97 | 87.37 | 117 |
153 |
154 | Notes:
155 |
156 | - `erlxml` is the clear winner.
157 |
158 | [1]:http://pugixml.org
159 | [2]:http://pugixml.org/benchmark.html
160 | [3]:https://github.com/esl/exml
161 | [4]:https://github.com/processone/fast_xml
162 |
--------------------------------------------------------------------------------
/benchmark/benchmark.erl:
--------------------------------------------------------------------------------
1 | -module(benchmark).
2 |
3 | -export([
4 | bench_encoding/3,
5 | bench_parsing/3
6 | ]).
7 |
8 | -define(ELEMENT, {xmlel,<<"iq">>,
9 | [{<<"type">>,<<"get">>}],
10 | [{xmlel,<<"query">>,
11 | [{<<"xmlns">>,<<"jabber:iq:bulk">>}],
12 | [{xmlel,<<"r">>, [{<<"ver">>,<<"1489702723756">>},{<<"client">>,<<"0.41.16">>}], []},
13 | {xmlel,<<"b">>,[{<<"ver">>,<<"1470998323471">>}],[]},
14 | {xmlel,<<"pf">>, [{<<"type">>,<<"roster">>},{<<"ver">>,<<"1473925451360">>}], []},
15 | {xmlel,<<"pf">>, [{<<"type">>,<<"addressbook">>},{<<"ver">>,<<"1410177959174">>}], []}]},
16 | {xmlel,<<"message">>,
17 | [{<<"from">>,<<"user@wdomain/resource-TMnXEhgkGN">>},
18 | {<<"to">>,<<"user2@domain/mac70c36269">>},
19 | {<<"ts">>,<<"1490255206161">>},
20 | {<<"id">>,<<"YgP1z-18834681">>},
21 | {<<"type">>,<<"headline">>}],
22 | [{xmlel,<<"backendmessage">>,
23 | [{<<"xmlns">>,<<"notification">>},{<<"push">>,<<"0">>}],
24 | [{xmlel,<<"resreceived">>,[],
25 | [{xmlel,<<"accountId">>,[],
26 | [{xmlcdata,<<"23423534534">>}]},
27 | {xmlel,<<"amount">>,[],
28 | [{xmlcdata,
29 | <<"0.200000000000000">>}]},
30 | {xmlel,<<"type">>,[],
31 | [{xmlcdata,<<"AEW-12">>}]},
32 | {xmlel,<<"description">>,[],
33 | [{xmlcdata,<<"884340">>}]}]}]}]}]}
34 | ).
35 |
%% XML text parsed by bench_parsing/3; it describes the same document as ?ELEMENT.
%% NOTE(review): the markup was missing from this listing and has been rebuilt
%% from ?ELEMENT (element names, attributes and text values). Attribute quoting
%% and whitespace are a best guess - confirm against the repository.
-define(STANZA, <<"<iq type='get'>
    <query xmlns='jabber:iq:bulk'>
        <r ver='1489702723756' client='0.41.16'/>
        <b ver='1470998323471'/>
        <pf type='roster' ver='1473925451360'/>
        <pf type='addressbook' ver='1410177959174'/>
    </query>
    <message from='user@wdomain/resource-TMnXEhgkGN' to='user2@domain/mac70c36269' ts='1490255206161' id='YgP1z-18834681' type='headline'>
        <backendmessage xmlns='notification' push='0'>
            <resreceived>
                <accountId>
                    23423534534
                </accountId>
                <amount>
                    0.200000000000000
                </amount>
                <type>
                    AEW-12
                </type>
                <description>
                    884340
                </description>
            </resreceived>
        </backendmessage>
    </message>
</iq>">>).
62 |
%% Benchmarks encoding of ?ELEMENT with `Engine`.
%% The engine is initialised, one encode is performed up front and handed to
%% to_binary_ok/2, then the encode call is timed by bench/4 with `Number`
%% iterations spread over `Concurrency` processes.
bench_encoding(Engine, Number, Concurrency) ->
    init(Engine),
    FirstResult = to_binary(Engine),
    to_binary_ok(Engine, FirstResult),
    EncodeOnce = fun() -> to_binary(Engine) end,
    bench(Engine, EncodeOnce, Number, Concurrency).
67 |
%% Benchmarks DOM parsing of ?STANZA with `Engine`.
%% The engine is initialised, one parse is performed up front and handed to
%% parse_ok/2, then the parse call is timed by bench/4 with `Number`
%% iterations spread over `Concurrency` processes.
bench_parsing(Engine, Number, Concurrency) ->
    init(Engine),
    FirstResult = parse(Engine, ?STANZA),
    parse_ok(Engine, FirstResult),
    ParseOnce = fun() -> parse(Engine, ?STANZA) end,
    bench(Engine, ParseOnce, Number, Concurrency).
72 |
73 | % internals
74 |
%% Runs `Fun` `Number div Concurrency` times in each of `Concurrency` linked
%% processes, waits until every worker has reported `done` and prints the
%% elapsed time together with the resulting rate.
bench(Engine, Fun, Number, Concurrency) ->
    Self = self(),
    List = lists:seq(1, Concurrency),
    LoopNumbers = Number div Concurrency,

    %% Monotonic time never jumps when the wall clock is adjusted, which makes
    %% it the right clock for measuring an interval.
    A = erlang:monotonic_time(microsecond),
    Pids = [spawn_link(fun() -> loop(LoopNumbers, Fun), Self ! {self(), done} end) || _ <- List],
    [receive {Pid, done} -> ok end || Pid <- Pids],
    B = erlang:monotonic_time(microsecond),

    print(Engine, Concurrency, Number, A, B).

%% Prints one result line. `A` and `B` are the start and end of the run as
%% monotonic timestamps in microseconds; `Num` is the number of operations.
print(Engine, Concurrency, Num, A, B) ->
    Microsecs = B - A,
    Milliseconds = Microsecs/1000,
    Secs = Milliseconds/1000,
    StanzaPerSec = Num/Secs,
    io:format("### engine: ~p concurrency ~p -> ~p ms ~.2f stanza/sec ~n", [Engine, Concurrency, Milliseconds, StanzaPerSec]).
93 |
%% Calls `Fun` sequentially `Nr` times and returns `ok`.
loop(0, _Fun) -> ok;
loop(Remaining, Fun) ->
    Fun(),
    loop(Remaining - 1, Fun).
99 |
%% Per-engine setup: fast_xml has its application (and dependencies) started,
%% every other engine needs nothing and yields `ok`.
init(fast_xml) -> application:ensure_all_started(fast_xml);
init(_Engine)  -> ok.
104 |
%% Encodes ?ELEMENT with the encoder of the given engine and returns whatever
%% that encoder returns.
to_binary(erlxml)   -> erlxml:to_binary(?ELEMENT);
to_binary(exml)     -> exml:to_binary(?ELEMENT);
to_binary(fast_xml) -> fxml:element_to_binary(?ELEMENT).
111 |
%% Parses the XML binary `Xml` with the DOM parser of the given engine and
%% returns that parser's result unchanged.
parse(erlxml, Xml)   -> erlxml:parse(Xml);
parse(exml, Xml)     -> exml:parse(Xml);
parse(fast_xml, Xml) -> fxml_stream:parse_element(Xml).
118 |
%% Asserts that an encode result is a binary.
%% Returns `true` on success and crashes with `{badmatch, false}` otherwise.
%% The caller ignores the return value, so a plain boolean comparison here
%% would never stop a broken benchmark run; matching makes the check effective.
to_binary_ok(_Engine, Value) ->
    true = is_binary(Value).
121 |
%% Asserts that a parse result signals success and returns it.
%% fast_xml's fxml_stream:parse_element/1 returns the `xmlel` tuple directly
%% (or `{error, _}`), never a binary, so success is recognised by the tuple
%% shape. The other engines return `{ok, Element}`.
%% Crashes with `badmatch` on any other result.
parse_ok(fast_xml, Data) ->
    {xmlel, _Name, _Attrs, _Children} = Data;
parse_ok(_, Data) ->
    {ok, _} = Data.
126 |
127 |
--------------------------------------------------------------------------------
/benchmark/benchmark_stream.erl:
--------------------------------------------------------------------------------
1 | -module(benchmark_stream).
2 |
3 | -define(CHUNK_SIZE, 1024).
4 |
5 | -export([
6 | bench/4
7 | ]).
8 |
%% Streams the contents of `File` through the streaming parser of `Module`.
%% The file is cut into ?CHUNK_SIZE-byte chunks; each of the `Concurrency`
%% linked workers creates its own parser, opens the stream, feeds all chunks
%% `Number div Concurrency` times, closes the stream and releases the parser.
%% Elapsed time and throughput are printed once every worker has finished.
bench(Module, File, Number, Concurrency) ->
    {Chunks, BinarySize} = readlines(File, ?CHUNK_SIZE),

    Self = self(),
    List = lists:seq(1, Concurrency),
    LoopNumbers = Number div Concurrency,

    ProcFun = fun() ->
        {ok, Parser} = new_parser(Module),
        %% The stanzas must be wrapped in a root element: without the closing
        %% tag no xmlstreamend event is produced and close/2 would block
        %% forever for fast_xml.
        %% NOTE(review): the two tags were empty in this listing; the root
        %% name `stream` is taken from the README example - confirm.
        NewParser1 = run_parser([<<"<stream>">>], Module, Parser),
        NewParser2 = loop(LoopNumbers, Chunks, Module, NewParser1),
        NewParser3 = run_parser([<<"</stream>">>], Module, NewParser2),
        close(Module, NewParser3),
        Self ! {self(), done}
    end,

    A = os:timestamp(),
    Pids = [spawn_link(ProcFun) || _ <- List],
    [receive {Pid, done} -> ok end || Pid <- Pids],
    B = os:timestamp(),

    print(Module, Concurrency, BinarySize*Number, A, B).
31 |
%% Feeds the complete chunk list to the parser `Nr` times, threading the parser
%% state through every pass, and returns the final parser state.
loop(0, _Chunks, _Module, Parser) -> Parser;
loop(Remaining, Chunks, Module, Parser) ->
    Next = run_parser(Chunks, Module, Parser),
    loop(Remaining - 1, Chunks, Module, Next).
37 |
%% Feeds every chunk, in order, to the stream parser and returns the parser
%% state after the last chunk. A `{ok, _}` result keeps the current state;
%% a `{ok, NewParser, _}` result replaces it.
run_parser(Chunks, Module, Parser) ->
    Step = fun(Chunk, Current) ->
        case stream_parse(Module, Current, Chunk) of
            {ok, _} -> Current;
            {ok, Updated, _} -> Updated
        end
    end,
    lists:foldl(Step, Parser, Chunks).
47 |
%% Creates a streaming parser for the given engine; every clause yields
%% `{ok, Parser}` on success.
new_parser(erlxml) ->
    erlxml:new_stream([{stanza_limit, 65000}]);
new_parser(exml) ->
    exml_stream:new_parser();
new_parser(fast_xml) ->
    %% A linked consumer process receives the parser events and notifies this
    %% process with {fxml_completed, Pid} once it has seen the end of the stream.
    Owner = self(),
    Consumer = spawn_link(fun() ->
        ok = fxml_receive_till_end(),
        Owner ! {fxml_completed, self()}
    end),
    {ok, fxml_stream:new(Consumer)};
new_parser(dummy) ->
    %% No-op engine.
    {ok, null}.
58 |
%% Pushes one chunk of data into the engine's stream parser.
%% erlxml and exml results are returned as they are; the fast_xml result is
%% wrapped as `{ok, Result}`; the dummy engine ignores its input.
stream_parse(erlxml, Parser, Chunk)   -> erlxml:parse_stream(Parser, Chunk);
stream_parse(exml, Parser, Chunk)     -> exml_stream:parse(Parser, Chunk);
stream_parse(fast_xml, Parser, Chunk) -> {ok, fxml_stream:parse(Parser, Chunk)};
stream_parse(dummy, _Parser, _Chunk)  -> {ok, []}.
67 |
%% Releases the parser of the given engine.
%% fast_xml: closes the stream, then blocks until the consumer process started
%% by new_parser/1 reports {fxml_completed, _}.
%% exml: frees the parser. All other engines need no cleanup.
close(fast_xml, Parser) ->
    fxml_stream:close(Parser),
    receive
        {fxml_completed, _ConsumerPid} -> ok
    end;
close(exml, Parser) ->
    exml_stream:free_parser(Parser);
close(_Module, _Parser) ->
    ok.
78 |
%% Reads `FileName` line by line, joins the lines into one binary and returns
%% `{Chunks, Size}`: the binary split into chunks of `LengthChunks` bytes and
%% its total size in bytes.
readlines(FileName, LengthChunks) ->
    {ok, Device} = file:open(FileName, [read]),
    Binary = binary_join(get_lines(Device)),
    Chunks = build_chunks(Binary, LengthChunks, []),
    {Chunks, byte_size(Binary)}.
85 |
%% Splits `Binary` into consecutive chunks of `Length` bytes. The last chunk
%% holds the remainder (between 0 and `Length` bytes). `Acc` collects the
%% chunks in reverse order; callers start with [].
%% NOTE(review): the split pattern was missing from this listing; it is
%% restored from the `Chunk`, `Rest` and `Length` names the body uses.
build_chunks(Binary, Length, Acc) when byte_size(Binary) > Length ->
    <<Chunk:Length/binary, Rest/binary>> = Binary,
    build_chunks(Rest, Length, [Chunk | Acc]);
build_chunks(Binary, _Length, Acc) ->
    lists:reverse([Binary | Acc]).
94 |
%% Returns all lines of `Device`, in file order, as a list of binaries.
get_lines(Device) ->
    Reversed = get_lines(Device, []),
    lists:reverse(Reversed).

%% Reads lines until eof, pushing each one onto `Accum` (newest first).
%% The device is closed when eof is reached.
get_lines(Device, Accum) ->
    case io:get_line(Device, "") of
        eof ->
            file:close(Device),
            Accum;
        Line ->
            LineBin = list_to_binary(Line),
            get_lines(Device, [LineBin | Accum])
    end.
105 |
%% Concatenates a list of binaries, in order, into a single binary.
%% An empty list yields <<>> (for example when the input file is empty)
%% instead of crashing with function_clause.
binary_join(Parts) ->
    iolist_to_binary(Parts).
110 |
%% Prints one result line: elapsed milliseconds, throughput per second and the
%% total number of bytes processed. `A` and `B` are os:timestamp/0-style
%% tuples taken at the start and at the end of the run.
print(Module, Concurrency, Bytes, A, B) ->
    ElapsedMs = timer:now_diff(B, A) / 1000,
    ElapsedSecs = ElapsedMs / 1000,
    Throughput = format_size(Bytes / ElapsedSecs),
    Total = format_size(Bytes),
    io:format("### engine: ~p concurrency: ~p -> ~p ms ~s/sec total bytes processed: ~s ~n", [Module, Concurrency, ElapsedMs, Throughput, Total]).
117 |
%% Formats a byte count as chardata with two decimals and a unit, for example
%% "1.50 MB". Units grow by a factor of 1024, starting at B and capped at PB.
format_size(Size) ->
    Units = ["B","KB","MB","GB","TB","PB"],
    format_size(Size, Units).

%% Moves to the next unit while the value is at least 1024 and a larger unit exists.
format_size(Value, [_Unit | [_ | _] = LargerUnits]) when Value >= 1024 ->
    format_size(Value / 1024, LargerUnits);
format_size(Value, [Unit | _]) ->
    io_lib:format("~.2f ~s", [float(Value), Unit]).
124 |
%% Consumes '$gen_event' messages until an `{xmlstreamend, _}` event arrives,
%% then returns `ok`. Every other event is discarded.
fxml_receive_till_end() ->
    receive
        {'$gen_event', {xmlstreamend, _}} ->
            ok;
        {'$gen_event', _Other} ->
            fxml_receive_till_end()
    end.
135 |
--------------------------------------------------------------------------------
/c_src/.gitattributes:
--------------------------------------------------------------------------------
1 | *.cpp linguist-language=Erlang
2 | *.hpp linguist-language=Erlang
3 |
--------------------------------------------------------------------------------
/c_src/.gitignore:
--------------------------------------------------------------------------------
1 | *.o
2 | /env.mk
3 |
--------------------------------------------------------------------------------
/c_src/Makefile:
--------------------------------------------------------------------------------
1 |
2 | PROJECT_NIF_NAME=erlxml_nif
3 |
4 | include nif.mk
5 |
6 | ifeq ($(UNAME_SYS), darwin)
7 | LDFLAGS += -Wl,-U,_enif_alloc \
8 | -Wl,-U,_enif_free \
9 | -Wl,-U,_enif_alloc_resource \
10 | -Wl,-U,_enif_open_resource_type \
11 | -Wl,-U,_enif_release_resource \
12 | -Wl,-U,_enif_priv_data \
13 | -Wl,-U,_enif_self \
14 | -Wl,-U,_enif_consume_timeslice \
15 | -Wl,-U,_enif_inspect_binary \
16 | -Wl,-U,_enif_inspect_iolist_as_binary \
17 | -Wl,-U,_enif_is_binary \
18 | -Wl,-U,_enif_is_identical \
19 | -Wl,-U,_enif_is_list \
20 | -Wl,-U,_enif_get_int \
21 | -Wl,-U,_enif_get_list_cell \
22 | -Wl,-U,_enif_get_resource \
23 | -Wl,-U,_enif_get_tuple \
24 | -Wl,-U,_enif_make_atom \
25 | -Wl,-U,_enif_make_badarg \
26 | -Wl,-U,_enif_make_existing_atom \
27 | -Wl,-U,_enif_make_double \
28 | -Wl,-U,_enif_make_new_binary \
29 | -Wl,-U,_enif_make_resource \
30 | -Wl,-U,_enif_make_string_len \
31 | -Wl,-U,_enif_make_tuple \
32 | -Wl,-U,_enif_make_list \
33 | -Wl,-U,_enif_make_ulong \
34 | -Wl,-U,_enif_get_ulong \
35 | -Wl,-U,_enif_make_list_cell \
36 | -Wl,-U,_enif_make_reverse_list
37 | endif
38 |
39 | CXXFLAGS += -DNDEBUG -I pugixml \
40 | -g -Wextra -Werror -Wno-ignored-qualifiers -Wno-unused-const-variable -Wno-missing-field-initializers -fno-exceptions -fno-rtti -std=c++17
41 |
42 | LDFLAGS += -lstdc++
43 |
--------------------------------------------------------------------------------
/c_src/allocators.cc:
--------------------------------------------------------------------------------
1 | #include "allocators.h"
2 | #include "erl_nif.h"
3 |
4 | void* erlxml_allocate(size_t size)
5 | {
6 | return enif_alloc(size);
7 | }
8 |
9 | void erlxml_deallocate(void* ptr)
10 | {
11 | enif_free(ptr);
12 | }
13 |
14 |
--------------------------------------------------------------------------------
/c_src/allocators.h:
--------------------------------------------------------------------------------
1 | #ifndef ERLXML_C_SRC_ALLOCATORS_H_
2 | #define ERLXML_C_SRC_ALLOCATORS_H_
3 |
4 | #include
5 |
6 | void* erlxml_allocate(size_t size);
7 | void erlxml_deallocate(void* ptr);
8 |
9 | #endif
10 |
--------------------------------------------------------------------------------
/c_src/bytebuffer.cc:
--------------------------------------------------------------------------------
1 | #include "bytebuffer.h"
2 | #include "allocators.h"
3 |
4 | #include
5 | #include
6 |
7 | ByteBuffer::ByteBuffer()
8 | {
9 | Construct(NULL, 1024);
10 | }
11 |
12 | ByteBuffer::ByteBuffer(size_t size)
13 | {
14 | Construct(NULL, size);
15 | }
16 |
17 | ByteBuffer::ByteBuffer(const uint8_t* bytes, size_t len)
18 | {
19 | Construct(bytes, len);
20 | }
21 |
22 | void ByteBuffer::Construct(const uint8_t* bytes, size_t len)
23 | {
24 | start_ = 0;
25 | size_ = len;
26 | bytes_ = static_cast(erlxml_allocate(size_));
27 |
28 | if (bytes)
29 | {
30 | end_ = len;
31 | memcpy(bytes_, bytes, end_);
32 | }
33 | else
34 | {
35 | end_ = 0;
36 | }
37 | }
38 |
39 | ByteBuffer::~ByteBuffer()
40 | {
41 | erlxml_deallocate(bytes_);
42 | }
43 |
44 | bool ByteBuffer::ReadBytes(uint8_t* val, size_t len)
45 | {
46 | if (len > Length())
47 | return false;
48 |
49 | memcpy(val, bytes_ + start_, len);
50 | start_ += len;
51 | return true;
52 | }
53 |
54 | void ByteBuffer::WriteBytes(const uint8_t* val, size_t len)
55 | {
56 | memcpy(ReserveWriteBuffer(len), val, len);
57 | }
58 |
59 | uint8_t* ByteBuffer::ReserveWriteBuffer(size_t len)
60 | {
61 | if (Length() + len > Capacity())
62 | {
63 | if(!LeftShift())
64 | Resize(Length() + len);
65 | else if (Length() + len > Capacity())
66 | Resize(Length() + len);
67 | }
68 |
69 | uint8_t* start = bytes_ + end_;
70 | end_ += len;
71 | return start;
72 | }
73 |
74 | void ByteBuffer::Resize(size_t size)
75 | {
76 | if(size == size_)
77 | return;
78 |
79 | if (size > size_)
80 | size = std::max(size, 3 * size_ / 2);
81 |
82 | size_t len = std::min(end_ - start_, size);
83 | uint8_t* new_bytes = static_cast(erlxml_allocate(size));
84 | memcpy(new_bytes, bytes_ + start_, len);
85 | erlxml_deallocate(bytes_);
86 | start_ = 0;
87 | end_ = len;
88 | size_ = size;
89 | bytes_ = new_bytes;
90 | }
91 |
92 | bool ByteBuffer::Consume(size_t size)
93 | {
94 | if (size > Length())
95 | return false;
96 |
97 | start_ += size;
98 | return true;
99 | }
100 |
101 | bool ByteBuffer::LeftShift()
102 | {
103 | if(start_ == 0)
104 | return false;
105 |
106 | size_t length = end_ - start_;
107 |
108 | memmove(bytes_, bytes_ + start_, length);
109 | start_ = 0;
110 | end_ = length;
111 | return true;
112 | }
113 |
114 | void ByteBuffer::Clear()
115 | {
116 | start_ = end_ = 0;
117 | }
118 |
--------------------------------------------------------------------------------
/c_src/bytebuffer.h:
--------------------------------------------------------------------------------
1 | #ifndef ERLTLS_C_SRC_BYTEBUFFER_H_
2 | #define ERLTLS_C_SRC_BYTEBUFFER_H_
3 |
4 | #include "macros.h"
5 |
6 | #include
7 | #include
8 |
9 | class ByteBuffer
10 | {
11 |
12 | public:
13 |
14 | ByteBuffer();
15 | ByteBuffer(size_t size);
16 | ByteBuffer(const uint8_t* bytes, size_t len);
17 |
18 | ~ByteBuffer();
19 |
20 | const uint8_t* Data() const { return bytes_ + start_; }
21 | size_t Length() const { return end_ - start_; }
22 | size_t Capacity() const { return size_ - start_; }
23 |
24 | bool ReadBytes(uint8_t* val, size_t len);
25 | void WriteBytes(const uint8_t* val, size_t len);
26 |
27 | uint8_t* ReserveWriteBuffer(size_t len);
28 | void Resize(size_t size);
29 | bool Consume(size_t size);
30 | void Clear();
31 |
32 | private:
33 |
34 | void Construct(const uint8_t* bytes, size_t size);
35 | bool LeftShift();
36 |
37 | uint8_t* bytes_;
38 | size_t size_;
39 | size_t start_;
40 | size_t end_;
41 |
42 | DISALLOW_COPY_AND_ASSIGN(ByteBuffer);
43 | };
44 |
45 | #endif
46 |
--------------------------------------------------------------------------------
/c_src/element_encoder.cc:
--------------------------------------------------------------------------------
1 |
2 | #include "element_encoder.h"
3 |
4 | #include "nif_utils.h"
5 | #include "erlxml_nif.h"
6 | #include "utf8_cleanup.h"
7 | #include "macros.h"
8 |
9 | #include
10 | #include
11 |
12 | static const int kXmlelArity = 4;
13 | static const int kXmlcdataArity = 2;
14 |
15 | ERL_NIF_TERM from_binary(ErlNifEnv* env, const char* data, size_t length, bool strip_non_utf8)
16 | {
17 | if(strip_non_utf8)
18 | {
19 | size_t new_size = utf8_cleanup(const_cast(data), length);
20 | return make_binary(env, data, new_size);
21 | }
22 |
23 | return make_binary(env, data, length);
24 | }
25 |
26 | // all the time we iterate over attributes and childrens in reverse order
27 | // to make sure we don't have to do lists:reverse in erlang
28 |
29 | bool pugi2stream_start(ErlNifEnv*env, const pugi::xml_node& node, bool strip_non_utf8, ERL_NIF_TERM* list)
30 | {
31 | if(node.type() != pugi::node_element)
32 | return false;
33 |
34 | ERL_NIF_TERM name = make_binary(env, node.name(), strlen(node.name()));
35 | ERL_NIF_TERM attrs = enif_make_list(env, 0);
36 |
37 | for (pugi::xml_attribute_iterator ait = node.attributes_end(); ait != node.attributes_begin();)
38 | {
39 | --ait;
40 | ERL_NIF_TERM key = make_binary(env, ait->name(), strlen(ait->name()));
41 | ERL_NIF_TERM value = from_binary(env, ait->value(), strlen(ait->value()), strip_non_utf8);
42 | attrs = enif_make_list_cell(env, enif_make_tuple2(env, key, value), attrs);
43 | }
44 |
45 | ERL_NIF_TERM xmlstreamstart = enif_make_tuple3(env, ATOMS.atomXmlStreamStart, name, attrs);
46 | *list = enif_make_list_cell(env, xmlstreamstart, *list);
47 |
48 | return true;
49 | }
50 |
51 | void pugi2term(ErlNifEnv*env, const pugi::xml_node& node, bool strip_non_utf8, ERL_NIF_TERM* list)
52 | {
53 | switch(node.type())
54 | {
55 | case pugi::node_element:
56 | {
57 | ERL_NIF_TERM name = make_binary(env, node.name(), strlen(node.name()));
58 | ERL_NIF_TERM attrs = enif_make_list(env, 0);
59 | ERL_NIF_TERM childrens = enif_make_list(env, 0);
60 |
61 | for (pugi::xml_attribute_iterator ait = node.attributes_end(); ait != node.attributes_begin();)
62 | {
63 | --ait;
64 | ERL_NIF_TERM key = make_binary(env, ait->name(), strlen(ait->name()));
65 | ERL_NIF_TERM value = from_binary(env, ait->value(), strlen(ait->value()), strip_non_utf8);
66 | attrs = enif_make_list_cell(env, enif_make_tuple2(env, key, value), attrs);
67 | }
68 |
69 | for (pugi::xml_node_iterator nit = node.end(); nit != node.begin();)
70 | {
71 | --nit;
72 | pugi2term(env, *nit, strip_non_utf8, &childrens);
73 | }
74 |
75 | ERL_NIF_TERM xmlel = enif_make_tuple4(env, ATOMS.atomXmlel, name, attrs, childrens);
76 | *list = enif_make_list_cell(env, xmlel, *list);
77 | break;
78 | }
79 |
80 | case pugi::node_pcdata:
81 | {
82 | ERL_NIF_TERM value = from_binary(env, node.value(), strlen(node.value()), strip_non_utf8);
83 | *list = enif_make_list_cell(env, enif_make_tuple2(env, ATOMS.atomXmlcdata, value), *list);
84 | break;
85 | }
86 |
87 | default:;
88 | }
89 | }
90 |
91 | bool parse_attributes(ErlNifEnv* env, ERL_NIF_TERM list, pugi::xml_node& node)
92 | {
93 | ERL_NIF_TERM head;
94 | const ERL_NIF_TERM *items;
95 | int arity;
96 |
97 | while(enif_get_list_cell(env, list, &head, &list))
98 | {
99 | if(!enif_get_tuple(env, head, &arity, &items) || arity != 2)
100 | return false;
101 |
102 | ErlNifBinary key;
103 | ErlNifBinary value;
104 |
105 | if(!get_binary(env, items[0], &key) || !get_binary(env, items[1], &value))
106 | return false;
107 |
108 | node.append_attribute(STRING_VIEW(key)).set_value(STRING_VIEW(value));
109 | }
110 |
111 | return true;
112 | }
113 |
114 | bool parse_childrens(ErlNifEnv* env, ERL_NIF_TERM list, pugi::xml_node& node)
115 | {
116 | ERL_NIF_TERM head;
117 |
118 | while(enif_get_list_cell(env, list, &head, &list))
119 | {
120 | if(!term2pugi(env, head, node))
121 | return false;
122 | }
123 |
124 | return true;
125 | }
126 |
127 | bool term2pugi(ErlNifEnv* env, ERL_NIF_TERM element, pugi::xml_node& node)
128 | {
129 | const ERL_NIF_TERM *items;
130 | int arity;
131 |
132 | if(!enif_get_tuple(env, element, &arity, &items))
133 | return false;
134 |
135 | if(arity == kXmlelArity && enif_is_identical(ATOMS.atomXmlel, items[0]))
136 | {
137 | //parse xmlel
138 | ErlNifBinary name;
139 |
140 | if(!get_binary(env, items[1], &name))
141 | return false;
142 |
143 | pugi::xml_node element = node.append_child(STRING_VIEW(name));
144 |
145 | if(!parse_attributes(env, items[2], element))
146 | return false;
147 |
148 | if(!parse_childrens(env, items[3], element))
149 | return false;
150 | }
151 | else if(arity == kXmlcdataArity && enif_is_identical(ATOMS.atomXmlcdata, items[0]))
152 | {
153 | ErlNifBinary value;
154 |
155 | if(!get_binary(env, items[1], &value))
156 | return false;
157 |
158 | node.append_child(pugi::node_pcdata).set_value(STRING_VIEW(value));
159 | }
160 | else
161 | {
162 | return false;
163 | }
164 |
165 | return true;
166 | }
167 |
--------------------------------------------------------------------------------
/c_src/element_encoder.h:
--------------------------------------------------------------------------------
1 | #ifndef ERLXML_C_SRC_ELEMENTS_ENCODER_H_
2 | #define ERLXML_C_SRC_ELEMENTS_ENCODER_H_
3 |
4 | #include "pugixml.hpp"
5 | #include "erl_nif.h"
6 |
7 | bool pugi2stream_start(ErlNifEnv*env, const pugi::xml_node& node, bool strip_non_utf8, ERL_NIF_TERM* list);
8 | void pugi2term(ErlNifEnv*env, const pugi::xml_node& node, bool strip_non_utf8, ERL_NIF_TERM* list);
9 | bool term2pugi(ErlNifEnv* env, ERL_NIF_TERM element, pugi::xml_node& node);
10 |
11 | #endif
12 |
13 |
--------------------------------------------------------------------------------
/c_src/erlxml.cc:
--------------------------------------------------------------------------------
1 | #include "erlxml.h"
2 | #include "erlxml_nif.h"
3 | #include "nif_utils.h"
4 | #include "xmlstreamparser.h"
5 | #include "element_encoder.h"
6 |
7 | const char kErrorFailedToAllocXmlStream[] = "failed to alloc stream object";
8 | const char kErrorBadOwner[] = "erlxml session was created on a different process";
9 |
10 | struct enif_erlxml_stream
11 | {
12 | XmlStreamParser* parser;
13 | ERL_NIF_TERM owner_pid;
14 | };
15 |
// Options for a stream parser; a default-constructed value has
// stanza_limit == 0 and strip_invalid_utf8 == false.
struct stream_options
{
    stream_options() {}

    size_t stanza_limit = 0;
    bool strip_invalid_utf8 = false;
};
23 |
24 | struct parser_data
25 | {
26 | parser_data(ErlNifEnv* e, ERL_NIF_TERM t) : env(e), term(t) {}
27 |
28 | ErlNifEnv* env;
29 | ERL_NIF_TERM term;
30 | };
31 |
32 | struct xml_string_writer: pugi::xml_writer
33 | {
34 | ByteBuffer buffer;
35 |
36 | void write(const void* data, size_t size)
37 | {
38 | buffer.WriteBytes(reinterpret_cast(data), size);
39 | }
40 | };
41 |
42 | void enif_stream_parser_free(ErlNifEnv* env, void* obj)
43 | {
44 | UNUSED(env);
45 |
46 | enif_erlxml_stream* stream = static_cast(obj);
47 |
48 | if(stream->parser != NULL)
49 | delete stream->parser;
50 | }
51 |
52 | bool handle_start_stream(void* user_data, pugi::xml_document& doc, bool strip_non_utf8)
53 | {
54 | parser_data* wp = reinterpret_cast(user_data);
55 | return pugi2stream_start(wp->env, doc.first_child(), strip_non_utf8, &wp->term);
56 | }
57 |
58 | void handle_stanza(void* user_data, pugi::xml_document& doc, bool strip_non_utf8)
59 | {
60 | parser_data* wp = reinterpret_cast(user_data);
61 | pugi2term(wp->env, doc.first_child(), strip_non_utf8, &wp->term);
62 | }
63 |
64 | void handle_end_stream(void* user_data, const std::string& rootname)
65 | {
66 | parser_data* wp = reinterpret_cast(user_data);
67 | ERL_NIF_TERM name = make_binary(wp->env, rootname.c_str(), rootname.length());
68 | ERL_NIF_TERM xmlstreamstart = enif_make_tuple2(wp->env, ATOMS.atomXmlStreamEnd, name);
69 | wp->term = enif_make_list_cell(wp->env, xmlstreamstart, wp->term);
70 | }
71 |
72 | ERL_NIF_TERM parse_stream_options(ErlNifEnv* env, ERL_NIF_TERM list, stream_options* opts)
73 | {
74 | if(!enif_is_list(env, list))
75 | return make_bad_options(env, list);
76 |
77 | ERL_NIF_TERM head;
78 | const ERL_NIF_TERM *items;
79 | int arity;
80 |
81 | while(enif_get_list_cell(env, list, &head, &list))
82 | {
83 | if(!enif_get_tuple(env, head, &arity, &items) || arity != 2)
84 | return make_bad_options(env, head);
85 |
86 | ERL_NIF_TERM key = items[0];
87 | ERL_NIF_TERM value = items[1];
88 |
89 | if(enif_is_identical(key, ATOMS.atomStanzaLimit))
90 | {
91 | if(!enif_get_uint64(env, value, &opts->stanza_limit))
92 | return make_bad_options(env, head);
93 | }
94 | else if(enif_is_identical(key, ATOMS.atomStripInvalidUtf8))
95 | {
96 | if(!get_boolean(value, &opts->strip_invalid_utf8))
97 | return make_bad_options(env, head);
98 | }
99 | else
100 | {
101 | return make_bad_options(env, head);
102 | }
103 | }
104 |
105 | return ATOMS.atomOk;
106 | }
107 |
108 | ERL_NIF_TERM enif_stream_parser_new(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
109 | {
110 | UNUSED(argc);
111 |
112 | erlxml_data* data = static_cast(enif_priv_data(env));
113 |
114 | stream_options opts;
115 |
116 | ERL_NIF_TERM parse_result = parse_stream_options(env, argv[0], &opts);
117 |
118 | if(!enif_is_identical(parse_result, ATOMS.atomOk))
119 | return parse_result;
120 |
121 | enif_erlxml_stream* nif_stream = static_cast(enif_alloc_resource(data->res_xml_stream_parser, sizeof(enif_erlxml_stream)));
122 |
123 | if(nif_stream == NULL)
124 | return make_error(env, kErrorFailedToAllocXmlStream);
125 |
126 | ErlNifPid current_pid;
127 | enif_self(env, ¤t_pid);
128 |
129 | nif_stream->parser = new XmlStreamParser(opts.stanza_limit, opts.strip_invalid_utf8, handle_start_stream, handle_end_stream, handle_stanza);
130 | nif_stream->owner_pid = enif_make_pid(env, ¤t_pid);
131 |
132 | ERL_NIF_TERM term = enif_make_resource(env, nif_stream);
133 | enif_release_resource(nif_stream);
134 | return enif_make_tuple2(env, ATOMS.atomOk, term);
135 | }
136 |
137 | ERL_NIF_TERM enif_stream_parser_feed(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
138 | {
139 | UNUSED(argc);
140 |
141 | erlxml_data* data = static_cast(enif_priv_data(env));
142 |
143 | enif_erlxml_stream* stream = NULL;
144 | ErlNifBinary bin;
145 |
146 | if(!enif_get_resource(env, argv[0], data->res_xml_stream_parser, (void**) &stream))
147 | return make_badarg(env);
148 |
149 | if(!get_binary(env, argv[1], &bin))
150 | return make_badarg(env);
151 |
152 | ErlNifPid current_pid;
153 |
154 | if(enif_self(env, ¤t_pid) && !enif_is_identical(stream->owner_pid, enif_make_pid(env, ¤t_pid)))
155 | return make_error(env, kErrorBadOwner);
156 |
157 | parser_data parser_data(env, enif_make_list(env, 0));
158 | XmlStreamParser::parse_result result = stream->parser->FeedData(bin.data, bin.size, &parser_data);
159 |
160 | consume_timeslice(env, bin.size);
161 |
162 | switch (result)
163 | {
164 | case XmlStreamParser::kParseOk:
165 | if(!enif_make_reverse_list(env, parser_data.term, &parser_data.term))
166 | return make_error(env, "failed to reverse the element list");
167 |
168 | return make_ok_result(env, parser_data.term);
169 |
170 | case XmlStreamParser::kParseInvalidXml:
171 | case XmlStreamParser::kParseStanzaLimitHit:
172 | {
173 | ERL_NIF_TERM error_tag = (result == XmlStreamParser::kParseInvalidXml ? ATOMS.atomErrorInvalidStanza : ATOMS.atomErrorMaxStanzaLimitHit);
174 | const char* data = reinterpret_cast(stream->parser->GetBufferedData()->Data());
175 | ERL_NIF_TERM binary = make_binary(env, data, stream->parser->GetBufferedData()->Length());
176 | stream->parser->Reset(true);
177 | return make_error(env, enif_make_tuple2(env, error_tag, binary));
178 | }
179 |
180 | default:
181 | return make_error(env, "unknown error");
182 | }
183 | }
184 |
185 | ERL_NIF_TERM enif_stream_parser_reset(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
186 | {
187 | UNUSED(argc);
188 |
189 | erlxml_data* data = static_cast(enif_priv_data(env));
190 |
191 | enif_erlxml_stream* stream = NULL;
192 | ErlNifPid current_pid;
193 |
194 | if(!enif_get_resource(env, argv[0], data->res_xml_stream_parser, (void**) &stream))
195 | return make_badarg(env);
196 |
197 | if(enif_self(env, ¤t_pid) && !enif_is_identical(stream->owner_pid, enif_make_pid(env, ¤t_pid)))
198 | return make_error(env, kErrorBadOwner);
199 |
200 | stream->parser->Reset(true);
201 | return ATOMS.atomOk;
202 | }
203 |
204 | ERL_NIF_TERM enif_dom_parse(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
205 | {
206 | UNUSED(argc);
207 |
208 | ErlNifBinary bin;
209 |
210 | if(!get_binary(env, argv[0], &bin))
211 | return make_badarg(env);
212 |
213 | pugi::xml_document pugi_doc;
214 |
215 | if(pugi_doc.load_buffer(bin.data, bin.size, pugi::parse_default).status != pugi::status_ok)
216 | return make_error(env, ATOMS.atomErrorInvalidStanza);
217 |
218 | ERL_NIF_TERM list = enif_make_list(env, 0);
219 | pugi2term(env, pugi_doc.first_child(), false, &list);
220 |
221 | ERL_NIF_TERM head;
222 | ERL_NIF_TERM tail;
223 |
224 | if(!enif_get_list_cell(env, list, &head, &tail))
225 | return make_error(env, ATOMS.atomErrorInvalidStanza);
226 |
227 | return make_ok_result(env, head);
228 | }
229 |
230 | ERL_NIF_TERM enif_dom_to_binary(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
231 | {
232 | UNUSED(argc);
233 |
234 | pugi::xml_document doc;
235 |
236 | if(!term2pugi(env, argv[0], doc))
237 | return make_badarg(env);
238 |
239 | xml_string_writer w;
240 | doc.document_element().print(w, "\t", pugi::format_raw);
241 | return make_binary(env, reinterpret_cast(w.buffer.Data()), w.buffer.Length());
242 | }
243 |
--------------------------------------------------------------------------------
/c_src/erlxml.h:
--------------------------------------------------------------------------------
1 | #ifndef ERLXML_C_SRC_ERLXML_H_
2 | #define ERLXML_C_SRC_ERLXML_H_
3 |
4 | #include "erl_nif.h"
5 |
6 | ERL_NIF_TERM enif_stream_parser_new(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]);
7 | void enif_stream_parser_free(ErlNifEnv* env, void* obj);
8 |
9 | ERL_NIF_TERM enif_stream_parser_feed(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]);
10 | ERL_NIF_TERM enif_stream_parser_reset(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]);
11 | ERL_NIF_TERM enif_dom_parse(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]);
12 | ERL_NIF_TERM enif_dom_to_binary(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]);
13 |
14 | #endif
15 |
--------------------------------------------------------------------------------
/c_src/erlxml_nif.cc:
--------------------------------------------------------------------------------
1 | #include "erlxml_nif.h"
2 | #include "nif_utils.h"
3 | #include "macros.h"
4 | #include "pugixml.hpp"
5 | #include "allocators.h"
6 | #include "erlxml.h"
7 |
8 | const char kAtomOk[] = "ok";
9 | const char kAtomError[] = "error";
10 | const char kAtomTrue[] = "true";
11 | const char kAtomFalse[] = "false";
12 | const char kAtomBadArg[] = "badarg";
13 | const char kAtomOptions[] = "options";
14 |
15 | const char kAtomStanzaLimit[] = "stanza_limit";
16 | const char kAtomStripInvalidUtf8[] = "strip_non_utf8";
17 |
18 | const char kAtomErrorInvalidStanza[] = "invalid_stanza";
19 | const char kAtomErrorMaxStanzaLimitHit[] = "max_stanza_limit_hit";
20 |
21 | const char kAtomXmlel[] = "xmlel";
22 | const char kAtomXmlcdata[] = "xmlcdata";
23 | const char kAtomXmlStreamStart[] = "xmlstreamstart";
24 | const char kAtomXmlStreamEnd[] = "xmlstreamend";
25 |
26 | atoms ATOMS;
27 |
28 | void open_resources(ErlNifEnv* env, erlxml_data* data)
29 | {
30 | ErlNifResourceFlags flags = static_cast(ERL_NIF_RT_CREATE | ERL_NIF_RT_TAKEOVER);
31 | data->res_xml_stream_parser = enif_open_resource_type(env, NULL, "res_xml_stream_parser", enif_stream_parser_free, flags, NULL);
32 | }
33 |
34 | int on_nif_load(ErlNifEnv* env, void** priv_data, ERL_NIF_TERM load_info)
35 | {
36 | UNUSED(load_info);
37 |
38 | pugi::set_memory_management_functions(erlxml_allocate, erlxml_deallocate);
39 |
40 | ATOMS.atomOk = make_atom(env, kAtomOk);
41 | ATOMS.atomError = make_atom(env, kAtomError);
42 | ATOMS.atomTrue = make_atom(env, kAtomTrue);
43 | ATOMS.atomFalse = make_atom(env, kAtomFalse);
44 | ATOMS.atomOptions = make_atom(env, kAtomOptions);
45 | ATOMS.atomBadArg = make_atom(env, kAtomBadArg);
46 |
47 | ATOMS.atomErrorInvalidStanza = make_atom(env, kAtomErrorInvalidStanza);
48 | ATOMS.atomErrorMaxStanzaLimitHit = make_atom(env, kAtomErrorMaxStanzaLimitHit);
49 |
50 | ATOMS.atomStanzaLimit = make_atom(env, kAtomStanzaLimit);
51 | ATOMS.atomStripInvalidUtf8 = make_atom(env, kAtomStripInvalidUtf8);
52 |
53 | ATOMS.atomXmlel = make_atom(env, kAtomXmlel);
54 | ATOMS.atomXmlcdata = make_atom(env, kAtomXmlcdata);
55 | ATOMS.atomXmlStreamStart = make_atom(env, kAtomXmlStreamStart);
56 | ATOMS.atomXmlStreamEnd = make_atom(env, kAtomXmlStreamEnd);
57 |
58 | erlxml_data* data = static_cast(enif_alloc(sizeof(erlxml_data)));
59 | open_resources(env, data);
60 |
61 | *priv_data = data;
62 | return 0;
63 | }
64 |
65 | void on_nif_unload(ErlNifEnv* env, void* priv_data)
66 | {
67 | UNUSED(env);
68 |
69 | erlxml_data* data = static_cast(priv_data);
70 | enif_free(data);
71 | }
72 |
73 | int on_nif_upgrade(ErlNifEnv* env, void** priv, void** old_priv, ERL_NIF_TERM info)
74 | {
75 | UNUSED(old_priv);
76 | UNUSED(info);
77 |
78 | erlxml_data* data = static_cast(enif_alloc(sizeof(erlxml_data)));
79 | open_resources(env, data);
80 |
81 | *priv = data;
82 | return 0;
83 | }
84 |
85 | static ErlNifFunc nif_funcs[] =
86 | {
87 | {"new_stream", 1, enif_stream_parser_new},
88 | {"feed_stream", 2, enif_stream_parser_feed},
89 | {"reset_stream", 1, enif_stream_parser_reset},
90 | {"dom_parse", 1, enif_dom_parse},
91 | {"to_binary", 1, enif_dom_to_binary}
92 | };
93 |
94 | ERL_NIF_INIT(erlxml_nif, nif_funcs, on_nif_load, NULL, on_nif_upgrade, on_nif_unload)
95 |
--------------------------------------------------------------------------------
/c_src/erlxml_nif.h:
--------------------------------------------------------------------------------
1 | #ifndef ERLXML_C_SRC_ERLXML_NIF_H_
2 | #define ERLXML_C_SRC_ERLXML_NIF_H_
3 |
4 | #include "erl_nif.h"
5 |
6 | struct atoms
7 | {
8 | ERL_NIF_TERM atomOk;
9 | ERL_NIF_TERM atomError;
10 | ERL_NIF_TERM atomTrue;
11 | ERL_NIF_TERM atomFalse;
12 | ERL_NIF_TERM atomBadArg;
13 | ERL_NIF_TERM atomOptions;
14 |
15 | //errors
16 | ERL_NIF_TERM atomErrorInvalidStanza;
17 | ERL_NIF_TERM atomErrorMaxStanzaLimitHit;
18 |
19 | //options
20 | ERL_NIF_TERM atomStanzaLimit;
21 | ERL_NIF_TERM atomStripInvalidUtf8;
22 |
23 | //elements
24 |
25 | ERL_NIF_TERM atomXmlel;
26 | ERL_NIF_TERM atomXmlcdata;
27 | ERL_NIF_TERM atomXmlStreamStart;
28 | ERL_NIF_TERM atomXmlStreamEnd;
29 |
30 | };
31 |
32 | struct erlxml_data
33 | {
34 | ErlNifResourceType* res_xml_stream_parser;
35 | };
36 |
37 | extern atoms ATOMS;
38 |
39 | #endif
40 |
--------------------------------------------------------------------------------
/c_src/macros.h:
--------------------------------------------------------------------------------
#ifndef ERLXML_C_SRC_MACROS_H_
#define ERLXML_C_SRC_MACROS_H_

#include <string_view>

// Marks an expression as intentionally unused (silences compiler warnings).
#define UNUSED(expr) do { (void)(expr); } while (0)

// Views the bytes of an ErlNifBinary-like object (anything with `data` and
// `size` members) as a std::string_view without copying.
#define STRING_VIEW(bin) std::string_view(reinterpret_cast<const char*>((bin).data), (bin).size)

// Place inside a class body to forbid assignment, copying or default
// construction. Deleted members turn misuse into a compile-time error.
#define DISALLOW_ASSIGN(TypeName) void operator=(const TypeName&) = delete
#define DISALLOW_COPY_AND_ASSIGN(TypeName) TypeName(const TypeName&) = delete; DISALLOW_ASSIGN(TypeName)
#define DISALLOW_IMPLICIT_CONSTRUCTORS(TypeName) TypeName() = delete; DISALLOW_COPY_AND_ASSIGN(TypeName)

// ASSERT(x) checks x in debug builds. In release builds x is still evaluated
// (so side effects are kept) but its value is ignored.
#ifdef NDEBUG
#define ASSERT(x) UNUSED(x)
#else
#include <cassert>
#define ASSERT(x) assert(x)
#endif

#endif
19 |
--------------------------------------------------------------------------------
/c_src/nif.mk:
--------------------------------------------------------------------------------
1 | # Based on c_src.mk from erlang.mk by Loic Hoguin
2 | # https://github.com/ninenines/erlang.mk/blob/master/plugins/c_src.mk
3 |
4 | CURDIR := $(shell pwd)
5 | BASEDIR := $(abspath $(CURDIR)/..)
6 |
7 | ifndef REBAR_BARE_COMPILER_OUTPUT_DIR
8 | PRIV_DIR ?= $(BASEDIR)/priv
9 | else
10 | PRIV_DIR ?= $(REBAR_BARE_COMPILER_OUTPUT_DIR)/priv
11 | endif
12 |
13 | C_SRC_DIR = $(CURDIR)
14 | C_SRC_ENV ?= $(C_SRC_DIR)/env.mk
15 | C_SRC_OUTPUT ?= $(PRIV_DIR)/$(PROJECT_NIF_NAME).so
16 |
17 | # Regenerate env.mk on every invocation
18 | ifneq ($(wildcard $(C_SRC_DIR)),)
19 | GEN_ENV ?= $(shell erl -noshell -s init stop -eval "file:write_file(\"$(C_SRC_ENV)\", \
20 | io_lib:format( \
21 | \"ERTS_INCLUDE_DIR ?= ~s/erts-~s/include/~n\" \
22 | \"ERL_INTERFACE_INCLUDE_DIR ?= ~s~n\" \
23 | \"ERL_INTERFACE_LIB_DIR ?= ~s~n\", \
24 | [code:root_dir(), erlang:system_info(version), \
25 | code:lib_dir(erl_interface, include), \
26 | code:lib_dir(erl_interface, lib)])), \
27 | halt().")
28 | $(GEN_ENV)
29 | endif
30 |
31 | include $(C_SRC_ENV)
32 |
33 | # System type and C compiler/flags.
34 |
35 | UNAME_SYS_ORG := $(shell uname -s)
36 | UNAME_SYS = $(shell echo $(UNAME_SYS_ORG) | tr A-Z a-z)
37 |
38 | ifeq ($(UNAME_SYS), darwin)
39 | CC ?= cc
40 | CFLAGS ?= -O3 -std=c99 -finline-functions -Wall
41 | CXXFLAGS ?= -O3 -Wall
42 | LDFLAGS ?=
43 | else ifeq ($(UNAME_SYS), freebsd)
44 | CC ?= cc
45 | CFLAGS ?= -O3 -std=c99 -finline-functions -Wall
46 | CXXFLAGS ?= -O3 -finline-functions -Wall
47 | LDFLAGS ?= -Wl,--exclude-libs=ALL
48 | else ifeq ($(UNAME_SYS), linux)
49 | CC ?= gcc
50 | CFLAGS ?= -O3 -std=c99 -finline-functions -Wall
51 | CXXFLAGS ?= -O3 -finline-functions -Wall
52 | LDFLAGS ?= -Wl,--exclude-libs=ALL
53 | endif
54 |
55 | CFLAGS += -fPIC -I $(ERTS_INCLUDE_DIR) -I $(ERL_INTERFACE_INCLUDE_DIR)
56 | CXXFLAGS += -fPIC -I $(ERTS_INCLUDE_DIR) -I $(ERL_INTERFACE_INCLUDE_DIR)
57 | LDFLAGS += -L $(ERL_INTERFACE_LIB_DIR) -shared -lei
58 |
59 | # Verbosity.
60 |
61 | c_verbose_0 = @echo " C " $(?F);
62 | c_verbose = $(c_verbose_$(V))
63 |
64 | cpp_verbose_0 = @echo " CPP " $(?F);
65 | cpp_verbose = $(cpp_verbose_$(V))
66 |
67 | link_verbose_0 = @echo " LD " $(@F);
68 | link_verbose = $(link_verbose_$(V))
69 |
70 | SOURCES := $(shell find $(C_SRC_DIR) -type f \( -name "*.c" -o -name "*.C" -o -name "*.cc" -o -name "*.cpp" \))
71 | OBJECTS = $(addsuffix .o, $(basename $(SOURCES)))
72 |
73 | COMPILE_C = $(c_verbose) $(CC) $(CFLAGS) $(CPPFLAGS) -c
74 | COMPILE_CPP = $(cpp_verbose) $(CXX) $(CXXFLAGS) $(CPPFLAGS) -c
75 |
76 | $(C_SRC_OUTPUT): $(OBJECTS)
77 | @mkdir -p $(PRIV_DIR)/
78 | $(link_verbose) $(CC) $(OBJECTS) $(LDFLAGS) -o $(C_SRC_OUTPUT)
79 |
80 | %.o: %.c
81 | $(COMPILE_C) $(OUTPUT_OPTION) $<
82 |
83 | %.o: %.cc
84 | $(COMPILE_CPP) $(OUTPUT_OPTION) $<
85 |
86 | %.o: %.C
87 | $(COMPILE_CPP) $(OUTPUT_OPTION) $<
88 |
89 | %.o: %.cpp
90 | $(COMPILE_CPP) $(OUTPUT_OPTION) $<
91 |
# `clean` names an action, not a file: declare it phony so a stray file called
# "clean" in this directory cannot make the rule appear up to date.
.PHONY: clean

clean:
	@rm -f $(C_SRC_OUTPUT) $(OBJECTS); rm -f $(C_SRC_ENV)
94 |
--------------------------------------------------------------------------------
/c_src/nif_utils.cc:
--------------------------------------------------------------------------------
#include "nif_utils.h"
#include "erlxml_nif.h"
#include "macros.h"

#include <stdint.h>
#include <string.h>
6 |
7 | // This should correspond to the similar define in erlxml_nif.erl
8 | #define MAX_BYTES_TO_NIF 20000
9 |
10 | ERL_NIF_TERM make_atom(ErlNifEnv* env, const char* name)
11 | {
12 | ERL_NIF_TERM ret;
13 |
14 | if(enif_make_existing_atom(env, name, &ret, ERL_NIF_LATIN1))
15 | return ret;
16 |
17 | return enif_make_atom(env, name);
18 | }
19 |
20 | ERL_NIF_TERM make_binary(ErlNifEnv* env, const char* buff, size_t length)
21 | {
22 | ERL_NIF_TERM term;
23 | uint8_t *destination_buffer = enif_make_new_binary(env, length, &term);
24 | memcpy(destination_buffer, buff, length);
25 | return term;
26 | }
27 |
28 | ERL_NIF_TERM make_error(ErlNifEnv* env, const char* error)
29 | {
30 | return make_error(env, make_binary(env, error, strlen(error)));
31 | }
32 |
33 | ERL_NIF_TERM make_error(ErlNifEnv* env, ERL_NIF_TERM term)
34 | {
35 | return enif_make_tuple2(env, ATOMS.atomError, term);
36 | }
37 |
38 | ERL_NIF_TERM make_bad_options(ErlNifEnv* env, ERL_NIF_TERM term)
39 | {
40 | return make_error(env, enif_make_tuple(env, 2, ATOMS.atomOptions, term));
41 | }
42 |
43 | ERL_NIF_TERM make_badarg(ErlNifEnv* env)
44 | {
45 | return enif_make_tuple2(env, ATOMS.atomError, ATOMS.atomBadArg);
46 | }
47 |
48 | ERL_NIF_TERM make_ok_result(ErlNifEnv* env, ERL_NIF_TERM term)
49 | {
50 | return enif_make_tuple(env, 2, ATOMS.atomOk, term);
51 | }
52 |
53 | void consume_timeslice(ErlNifEnv *env, size_t bytes)
54 | {
55 | int cost = static_cast((bytes * 100) / MAX_BYTES_TO_NIF);
56 |
57 | if(cost)
58 | enif_consume_timeslice(env, cost > 100 ? 100 : cost);
59 | }
60 |
61 | bool get_binary(ErlNifEnv* env, ERL_NIF_TERM term, ErlNifBinary* bin)
62 | {
63 | if(enif_is_binary(env, term))
64 | return enif_inspect_binary(env, term, bin);
65 |
66 | return enif_inspect_iolist_as_binary(env, term, bin);
67 | }
68 |
69 | bool get_boolean(ERL_NIF_TERM term, bool* val)
70 | {
71 | if(enif_is_identical(term, ATOMS.atomTrue))
72 | {
73 | *val = true;
74 | return true;
75 | }
76 |
77 | if(enif_is_identical(term, ATOMS.atomFalse))
78 | {
79 | *val = false;
80 | return true;
81 | }
82 |
83 | return false;
84 | }
85 |
--------------------------------------------------------------------------------
/c_src/nif_utils.h:
--------------------------------------------------------------------------------
1 | #ifndef ERLTLS_C_SRC_NIF_UTILS_H_
2 | #define ERLTLS_C_SRC_NIF_UTILS_H_
3 |
4 | #include
5 |
6 | #include "erl_nif.h"
7 |
8 | ERL_NIF_TERM make_atom(ErlNifEnv* env, const char* name);
9 | ERL_NIF_TERM make_error(ErlNifEnv* env, const char* error);
10 | ERL_NIF_TERM make_error(ErlNifEnv* env, ERL_NIF_TERM term);
11 | ERL_NIF_TERM make_bad_options(ErlNifEnv* env, ERL_NIF_TERM term);
12 | ERL_NIF_TERM make_badarg(ErlNifEnv* env);
13 | ERL_NIF_TERM make_binary(ErlNifEnv* env, const char* buff, size_t length);
14 | ERL_NIF_TERM make_ok_result(ErlNifEnv* env, ERL_NIF_TERM term);
15 |
16 | void consume_timeslice(ErlNifEnv *env, size_t bytes);
17 |
18 | bool get_binary(ErlNifEnv* env, ERL_NIF_TERM term, ErlNifBinary* bin);
19 | bool get_boolean(ERL_NIF_TERM term, bool* val);
20 |
21 | #endif
22 |
--------------------------------------------------------------------------------
/c_src/pugixml/.gitignore:
--------------------------------------------------------------------------------
1 | *.o
2 |
--------------------------------------------------------------------------------
/c_src/pugixml/pugiconfig.hpp:
--------------------------------------------------------------------------------
1 | /**
2 | * pugixml parser - version 1.15
3 | * --------------------------------------------------------
4 | * Copyright (C) 2006-2025, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
5 | * Report bugs and download new versions at https://pugixml.org/
6 | *
7 | * This library is distributed under the MIT License. See notice at the end
8 | * of this file.
9 | *
10 | * This work is based on the pugxml parser, which is:
11 | * Copyright (C) 2003, by Kristen Wegner (kristen@tima.net)
12 | */
13 |
14 | #ifndef HEADER_PUGICONFIG_HPP
15 | #define HEADER_PUGICONFIG_HPP
16 |
17 | // Uncomment this to enable wchar_t mode
18 | // #define PUGIXML_WCHAR_MODE
19 |
20 | // Uncomment this to enable compact mode
21 | // #define PUGIXML_COMPACT
22 |
23 | // Uncomment this to disable XPath
24 | #define PUGIXML_NO_XPATH
25 |
26 | // Uncomment this to disable STL
27 | // #define PUGIXML_NO_STL
28 |
29 | // Uncomment this to disable exceptions
30 | #define PUGIXML_NO_EXCEPTIONS
31 |
32 | // Set this to control attributes for public classes/functions, i.e.:
33 | // #define PUGIXML_API __declspec(dllexport) // to export all public symbols from DLL
34 | // #define PUGIXML_CLASS __declspec(dllimport) // to import all classes from DLL
35 | // #define PUGIXML_FUNCTION __fastcall // to set calling conventions to all public functions to fastcall
36 | // In absence of PUGIXML_CLASS/PUGIXML_FUNCTION definitions PUGIXML_API is used instead
37 |
38 | // Tune these constants to adjust memory-related behavior
39 | // #define PUGIXML_MEMORY_PAGE_SIZE 32768
40 | // #define PUGIXML_MEMORY_OUTPUT_STACK 10240
41 | // #define PUGIXML_MEMORY_XPATH_PAGE_SIZE 4096
42 |
43 | // Tune this constant to adjust max nesting for XPath queries
44 | // #define PUGIXML_XPATH_DEPTH_LIMIT 1024
45 |
46 | // Uncomment this to switch to header-only version
47 | // #define PUGIXML_HEADER_ONLY
48 |
49 | // Uncomment this to enable long long support (usually enabled automatically)
50 | #define PUGIXML_HAS_LONG_LONG
51 |
52 | // Uncomment this to enable support for std::string_view (usually enabled automatically)
53 | #define PUGIXML_HAS_STRING_VIEW
54 |
55 | #endif
56 |
57 | /**
58 | * Copyright (c) 2006-2025 Arseny Kapoulkine
59 | *
60 | * Permission is hereby granted, free of charge, to any person
61 | * obtaining a copy of this software and associated documentation
62 | * files (the "Software"), to deal in the Software without
63 | * restriction, including without limitation the rights to use,
64 | * copy, modify, merge, publish, distribute, sublicense, and/or sell
65 | * copies of the Software, and to permit persons to whom the
66 | * Software is furnished to do so, subject to the following
67 | * conditions:
68 | *
69 | * The above copyright notice and this permission notice shall be
70 | * included in all copies or substantial portions of the Software.
71 | *
72 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
73 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
74 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
75 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
76 | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
77 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
78 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
79 | * OTHER DEALINGS IN THE SOFTWARE.
80 | */
81 |
--------------------------------------------------------------------------------
/c_src/pugixml/pugixml.hpp:
--------------------------------------------------------------------------------
1 | /**
2 | * pugixml parser - version 1.15
3 | * --------------------------------------------------------
4 | * Copyright (C) 2006-2025, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
5 | * Report bugs and download new versions at https://pugixml.org/
6 | *
7 | * This library is distributed under the MIT License. See notice at the end
8 | * of this file.
9 | *
10 | * This work is based on the pugxml parser, which is:
11 | * Copyright (C) 2003, by Kristen Wegner (kristen@tima.net)
12 | */
13 |
14 | // Define version macro; evaluates to major * 1000 + minor * 10 + patch so that it's safe to use in less-than comparisons
15 | // Note: pugixml used major * 100 + minor * 10 + patch format up until 1.9 (which had version identifier 190); starting from pugixml 1.10, the minor version number is two digits
16 | #ifndef PUGIXML_VERSION
17 | # define PUGIXML_VERSION 1150 // 1.15
18 | #endif
19 |
20 | // Include user configuration file (this can define various configuration macros)
21 | #include "pugiconfig.hpp"
22 |
23 | #ifndef HEADER_PUGIXML_HPP
24 | #define HEADER_PUGIXML_HPP
25 |
26 | // Include stddef.h for size_t and ptrdiff_t
27 | #include
28 |
29 | // Include exception header for XPath
30 | #if !defined(PUGIXML_NO_XPATH) && !defined(PUGIXML_NO_EXCEPTIONS)
31 | # include
32 | #endif
33 |
34 | // Include STL headers
35 | #ifndef PUGIXML_NO_STL
36 | # include
37 | # include
38 | # include
39 | #endif
40 |
41 | // Check if std::string_view is available
42 | #if !defined(PUGIXML_HAS_STRING_VIEW) && !defined(PUGIXML_NO_STL)
43 | # if __cplusplus >= 201703L
44 | # define PUGIXML_HAS_STRING_VIEW
45 | # elif defined(_MSVC_LANG) && _MSVC_LANG >= 201703L
46 | # define PUGIXML_HAS_STRING_VIEW
47 | # endif
48 | #endif
49 |
50 | // Include string_view if appropriate
51 | #ifdef PUGIXML_HAS_STRING_VIEW
52 | # include
53 | #endif
54 |
55 | // Macro for deprecated features
56 | #ifndef PUGIXML_DEPRECATED
57 | # if defined(__GNUC__)
58 | # define PUGIXML_DEPRECATED __attribute__((deprecated))
59 | # elif defined(_MSC_VER) && _MSC_VER >= 1300
60 | # define PUGIXML_DEPRECATED __declspec(deprecated)
61 | # else
62 | # define PUGIXML_DEPRECATED
63 | # endif
64 | #endif
65 |
66 | // If no API is defined, assume default
67 | #ifndef PUGIXML_API
68 | # define PUGIXML_API
69 | #endif
70 |
71 | // If no API for classes is defined, assume default
72 | #ifndef PUGIXML_CLASS
73 | # define PUGIXML_CLASS PUGIXML_API
74 | #endif
75 |
76 | // If no API for functions is defined, assume default
77 | #ifndef PUGIXML_FUNCTION
78 | # define PUGIXML_FUNCTION PUGIXML_API
79 | #endif
80 |
81 | // If the platform is known to have long long support, enable long long functions
82 | #ifndef PUGIXML_HAS_LONG_LONG
83 | # if __cplusplus >= 201103
84 | # define PUGIXML_HAS_LONG_LONG
85 | # elif defined(_MSC_VER) && _MSC_VER >= 1400
86 | # define PUGIXML_HAS_LONG_LONG
87 | # endif
88 | #endif
89 |
90 | // If the platform is known to have move semantics support, compile move ctor/operator implementation
91 | #ifndef PUGIXML_HAS_MOVE
92 | # if __cplusplus >= 201103
93 | # define PUGIXML_HAS_MOVE
94 | # elif defined(_MSC_VER) && _MSC_VER >= 1600
95 | # define PUGIXML_HAS_MOVE
96 | # endif
97 | #endif
98 |
99 | // If C++ is 2011 or higher, use 'noexcept' specifiers
100 | #ifndef PUGIXML_NOEXCEPT
101 | # if __cplusplus >= 201103
102 | # define PUGIXML_NOEXCEPT noexcept
103 | # elif defined(_MSC_VER) && _MSC_VER >= 1900
104 | # define PUGIXML_NOEXCEPT noexcept
105 | # else
106 | # define PUGIXML_NOEXCEPT throw()
107 | # endif
108 | #endif
109 |
110 | // Some functions can not be noexcept in compact mode
111 | #ifdef PUGIXML_COMPACT
112 | # define PUGIXML_NOEXCEPT_IF_NOT_COMPACT
113 | #else
114 | # define PUGIXML_NOEXCEPT_IF_NOT_COMPACT PUGIXML_NOEXCEPT
115 | #endif
116 |
117 | // If C++ is 2011 or higher, add 'override' qualifiers
118 | #ifndef PUGIXML_OVERRIDE
119 | # if __cplusplus >= 201103
120 | # define PUGIXML_OVERRIDE override
121 | # elif defined(_MSC_VER) && _MSC_VER >= 1700
122 | # define PUGIXML_OVERRIDE override
123 | # else
124 | # define PUGIXML_OVERRIDE
125 | # endif
126 | #endif
127 |
128 | // If C++ is 2011 or higher, use 'nullptr'
129 | #ifndef PUGIXML_NULL
130 | # if __cplusplus >= 201103
131 | # define PUGIXML_NULL nullptr
132 | # elif defined(_MSC_VER) && _MSC_VER >= 1600
133 | # define PUGIXML_NULL nullptr
134 | # else
135 | # define PUGIXML_NULL 0
136 | # endif
137 | #endif
138 |
139 | // Character interface macros
140 | #ifdef PUGIXML_WCHAR_MODE
141 | # define PUGIXML_TEXT(t) L ## t
142 | # define PUGIXML_CHAR wchar_t
143 | #else
144 | # define PUGIXML_TEXT(t) t
145 | # define PUGIXML_CHAR char
146 | #endif
147 |
148 | namespace pugi
149 | {
150 | // Character type used for all internal storage and operations; depends on PUGIXML_WCHAR_MODE
151 | typedef PUGIXML_CHAR char_t;
152 |
153 | #ifndef PUGIXML_NO_STL
154 | // String type used for operations that work with STL string; depends on PUGIXML_WCHAR_MODE
155 | typedef std::basic_string string_t;
156 | #endif
157 |
158 | #ifdef PUGIXML_HAS_STRING_VIEW
159 | // String view type used for operations that can work with a length delimited string; depends on PUGIXML_WCHAR_MODE
160 | typedef std::basic_string_view string_view_t;
161 | #endif
162 | }
163 |
164 | // The PugiXML namespace
165 | namespace pugi
166 | {
167 | // Tree node types
168 | enum xml_node_type
169 | {
170 | node_null, // Empty (null) node handle
171 | node_document, // A document tree's absolute root
172 | node_element, // Element tag, i.e. '<node/>'
173 | node_pcdata, // Plain character data, i.e. 'text'
174 | node_cdata, // Character data, i.e. '<![CDATA[text]]>'
175 | node_comment, // Comment tag, i.e. '<!-- text -->'
176 | node_pi, // Processing instruction, i.e. '<?name?>'
177 | node_declaration, // Document declaration, i.e. '<?xml version="1.0"?>'
178 | node_doctype // Document type declaration, i.e. '<!DOCTYPE doc>'
179 | };
180 |
181 | // Parsing options
182 |
183 | // Minimal parsing mode (equivalent to turning all other flags off).
184 | // Only elements and PCDATA sections are added to the DOM tree, no text conversions are performed.
185 | const unsigned int parse_minimal = 0x0000;
186 |
187 | // This flag determines if processing instructions (node_pi) are added to the DOM tree. This flag is off by default.
188 | const unsigned int parse_pi = 0x0001;
189 |
190 | // This flag determines if comments (node_comment) are added to the DOM tree. This flag is off by default.
191 | const unsigned int parse_comments = 0x0002;
192 |
193 | // This flag determines if CDATA sections (node_cdata) are added to the DOM tree. This flag is on by default.
194 | const unsigned int parse_cdata = 0x0004;
195 |
196 | // This flag determines if plain character data (node_pcdata) that consist only of whitespace are added to the DOM tree.
197 | // This flag is off by default; turning it on usually results in slower parsing and more memory consumption.
198 | const unsigned int parse_ws_pcdata = 0x0008;
199 |
200 | // This flag determines if character and entity references are expanded during parsing. This flag is on by default.
201 | const unsigned int parse_escapes = 0x0010;
202 |
203 | // This flag determines if EOL characters are normalized (converted to #xA) during parsing. This flag is on by default.
204 | const unsigned int parse_eol = 0x0020;
205 |
206 | // This flag determines if attribute values are normalized using CDATA normalization rules during parsing. This flag is on by default.
207 | const unsigned int parse_wconv_attribute = 0x0040;
208 |
209 | // This flag determines if attribute values are normalized using NMTOKENS normalization rules during parsing. This flag is off by default.
210 | const unsigned int parse_wnorm_attribute = 0x0080;
211 |
212 | // This flag determines if document declaration (node_declaration) is added to the DOM tree. This flag is off by default.
213 | const unsigned int parse_declaration = 0x0100;
214 |
215 | // This flag determines if document type declaration (node_doctype) is added to the DOM tree. This flag is off by default.
216 | const unsigned int parse_doctype = 0x0200;
217 |
218 | // This flag determines if plain character data (node_pcdata) that is the only child of the parent node and that consists only
219 | // of whitespace is added to the DOM tree.
220 | // This flag is off by default; turning it on may result in slower parsing and more memory consumption.
221 | const unsigned int parse_ws_pcdata_single = 0x0400;
222 |
223 | // This flag determines if leading and trailing whitespace is to be removed from plain character data. This flag is off by default.
224 | const unsigned int parse_trim_pcdata = 0x0800;
225 |
226 | // This flag determines if plain character data that does not have a parent node is added to the DOM tree, and if an empty document
227 | // is a valid document. This flag is off by default.
228 | const unsigned int parse_fragment = 0x1000;
229 |
230 | // This flag determines if plain character data is to be stored in the parent element's value. This significantly changes the structure of
231 | // the document; this flag is only recommended for parsing documents with many PCDATA nodes in memory-constrained environments.
232 | // This flag is off by default.
233 | const unsigned int parse_embed_pcdata = 0x2000;
234 |
235 | // This flag determines whether two PCDATA sections should be merged into one when no intermediate data is parsed between them in the document.
236 | // This flag is off by default.
237 | const unsigned int parse_merge_pcdata = 0x4000;
238 |
239 | // The default parsing mode (a bitwise OR of the flags that are documented above as "on by default").
240 | // Elements, PCDATA and CDATA sections are added to the DOM tree, character/reference entities are expanded,
241 | // End-of-Line characters are normalized, attribute values are normalized using CDATA normalization rules.
242 | const unsigned int parse_default = parse_cdata | parse_escapes | parse_wconv_attribute | parse_eol;
243 |
244 | // The full parsing mode (parse_default plus PI, comment, declaration and doctype nodes).
245 | // Nodes of all types are added to the DOM tree, character/reference entities are expanded,
246 | // End-of-Line characters are normalized, attribute values are normalized using CDATA normalization rules.
247 | const unsigned int parse_full = parse_default | parse_pi | parse_comments | parse_declaration | parse_doctype;
248 |
249 | // These flags determine the encoding of input data for XML document
250 | enum xml_encoding
251 | {
252 | encoding_auto, // Auto-detect input encoding using BOM or < / <? detection; use UTF8 if BOM is not found
253 | encoding_utf8, // UTF8 encoding
254 | encoding_utf16_le, // Little-endian UTF16
255 | encoding_utf16_be, // Big-endian UTF16
256 | encoding_utf16, // UTF16 with native endianness
257 | encoding_utf32_le, // Little-endian UTF32
258 | encoding_utf32_be, // Big-endian UTF32
259 | encoding_utf32, // UTF32 with native endianness
260 | encoding_wchar, // The same encoding wchar_t has (either UTF16 or UTF32)
261 | encoding_latin1 // Latin-1 encoding
262 | };
263 |
264 | // Formatting flags (bit values that can be combined with bitwise OR)
265 |
266 | // Indent the nodes that are written to output stream with as many indentation strings as deep the node is in DOM tree. This flag is on by default.
267 | const unsigned int format_indent = 0x01;
268 |
269 | // Write encoding-specific BOM to the output stream. This flag is off by default.
270 | const unsigned int format_write_bom = 0x02;
271 |
272 | // Use raw output mode (no indentation and no line breaks are written). This flag is off by default.
273 | const unsigned int format_raw = 0x04;
274 |
275 | // Omit default XML declaration even if there is no declaration in the document. This flag is off by default.
276 | const unsigned int format_no_declaration = 0x08;
277 |
278 | // Don't escape attribute values and PCDATA contents. This flag is off by default.
279 | const unsigned int format_no_escapes = 0x10;
280 |
281 | // Open file using text mode in xml_document::save_file. This enables special character (i.e. new-line) conversions on some systems. This flag is off by default.
282 | const unsigned int format_save_file_text = 0x20;
283 |
284 | // Write every attribute on a new line with appropriate indentation. This flag is off by default.
285 | const unsigned int format_indent_attributes = 0x40;
286 |
287 | // Don't output empty element tags, instead writing an explicit start and end tag even if there are no children. This flag is off by default.
288 | const unsigned int format_no_empty_element_tags = 0x80;
289 |
290 | // Skip characters belonging to range [0; 32) instead of "&#NN;" encoding. This flag is off by default.
291 | const unsigned int format_skip_control_chars = 0x100;
292 |
293 | // Use single quotes ' instead of double quotes " for enclosing attribute values. This flag is off by default.
294 | const unsigned int format_attribute_single_quote = 0x200;
295 |
296 | // The default set of formatting flags.
297 | // Nodes are indented depending on their depth in DOM tree, a default declaration is output if document has none.
298 | const unsigned int format_default = format_indent;
299 |
300 | const int default_double_precision = 17; // NOTE(review): presumably the precision used when a double is written without an explicit precision argument - confirm
301 | const int default_float_precision = 9; // NOTE(review): presumably the precision used when a float is written without an explicit precision argument - confirm
302 |
303 | // Forward declarations of the internal structs, iterators and handle classes referenced by the declarations below
304 | struct xml_attribute_struct;
305 | struct xml_node_struct;
306 |
307 | class xml_node_iterator;
308 | class xml_attribute_iterator;
309 | class xml_named_node_iterator;
310 |
311 | class xml_tree_walker;
312 |
313 | struct xml_parse_result;
314 |
315 | class xml_node;
316 |
317 | class xml_text;
318 |
319 | #ifndef PUGIXML_NO_XPATH
320 | class xpath_node;
321 | class xpath_node_set;
322 | class xpath_query;
323 | class xpath_variable_set;
324 | #endif
325 |
326 | // Range-based for loop support: holds a [begin, end) iterator pair and exposes it through begin()/end()
327 | template <typename It> class xml_object_range
328 | {
329 | public:
330 | typedef It const_iterator;
331 | typedef It iterator;
332 |
333 | xml_object_range(It b, It e): _begin(b), _end(e) // Stores copies of both iterators
334 | {
335 | }
336 |
337 | It begin() const { return _begin; }
338 | It end() const { return _end; }
339 |
340 | bool empty() const { return _begin == _end; } // True when the range contains no elements
341 |
342 | private:
343 | It _begin, _end;
344 | };
345 |
346 | // Abstract writer interface for node printing (see xml_node::print); concrete writers override write()
347 | class PUGIXML_CLASS xml_writer
348 | {
349 | public:
350 | virtual ~xml_writer(); // Virtual so that writers can be destroyed through a pointer to this interface
351 |
352 | // Write memory chunk (data, size) into stream/file/whatever
353 | virtual void write(const void* data, size_t size) = 0;
354 | };
355 |
356 | // xml_writer implementation for FILE*
357 | class PUGIXML_CLASS xml_writer_file: public xml_writer
358 | {
359 | public:
360 | // Construct writer from a FILE* object; void* is used to avoid header dependencies on stdio
361 | xml_writer_file(void* file);
362 |
363 | // Overrides xml_writer::write for the file passed to the constructor
364 | virtual void write(const void* data, size_t size) PUGIXML_OVERRIDE;
365 | private:
366 | void* file; // The FILE* object, kept as void* (see the constructor comment)
367 | };
368 |
369 | #ifndef PUGIXML_NO_STL
370 | // xml_writer implementation for streams (narrow char streams and wide wchar_t streams)
371 | class PUGIXML_CLASS xml_writer_stream: public xml_writer
372 | {
373 | public:
374 | // Construct writer from an output stream object (one overload per character type)
375 | xml_writer_stream(std::basic_ostream<char, std::char_traits<char> >& stream);
376 | xml_writer_stream(std::basic_ostream<wchar_t, std::char_traits<wchar_t> >& stream);
377 |
378 | virtual void write(const void* data, size_t size) PUGIXML_OVERRIDE;
379 |
380 | private:
381 | std::basic_ostream<char, std::char_traits<char> >* narrow_stream; // NOTE(review): presumably set by the char constructor - confirm in pugixml.cpp
382 | std::basic_ostream<wchar_t, std::char_traits<wchar_t> >* wide_stream; // NOTE(review): presumably set by the wchar_t constructor - confirm in pugixml.cpp
383 | };
384 | #endif
385 |
386 | // A light-weight handle for manipulating attributes in DOM tree (wraps a single xml_attribute_struct pointer)
387 | class PUGIXML_CLASS xml_attribute
388 | {
389 | friend class xml_attribute_iterator;
390 | friend class xml_node;
391 |
392 | private:
393 | xml_attribute_struct* _attr;
394 |
395 | typedef void (*unspecified_bool_type)(xml_attribute***); // Result type of the safe bool conversion operator below
396 |
397 | public:
398 | // Default constructor. Constructs an empty attribute.
399 | xml_attribute();
400 |
401 | // Constructs attribute from internal pointer
402 | explicit xml_attribute(xml_attribute_struct* attr);
403 |
404 | // Safe bool conversion operator
405 | operator unspecified_bool_type() const;
406 |
407 | // Borland C++ workaround
408 | bool operator!() const;
409 |
410 | // Comparison operators (compares wrapped attribute pointers)
411 | bool operator==(const xml_attribute& r) const;
412 | bool operator!=(const xml_attribute& r) const;
413 | bool operator<(const xml_attribute& r) const;
414 | bool operator>(const xml_attribute& r) const;
415 | bool operator<=(const xml_attribute& r) const;
416 | bool operator>=(const xml_attribute& r) const;
417 |
418 | // Check if attribute is empty (null)
419 | bool empty() const;
420 |
421 | // Get attribute name/value, or "" if attribute is empty
422 | const char_t* name() const;
423 | const char_t* value() const;
424 |
425 | // Get attribute value, or the default value if attribute is empty
426 | const char_t* as_string(const char_t* def = PUGIXML_TEXT("")) const;
427 |
428 | // Get attribute value as a number, or the default value if conversion did not succeed or attribute is empty
429 | int as_int(int def = 0) const;
430 | unsigned int as_uint(unsigned int def = 0) const;
431 | double as_double(double def = 0) const;
432 | float as_float(float def = 0) const;
433 |
434 | #ifdef PUGIXML_HAS_LONG_LONG
435 | long long as_llong(long long def = 0) const;
436 | unsigned long long as_ullong(unsigned long long def = 0) const;
437 | #endif
438 |
439 | // Get attribute value as bool (returns true if first character is in '1tTyY' set), or the default value if attribute is empty
440 | bool as_bool(bool def = false) const;
441 |
442 | // Set attribute name/value (returns false if attribute is empty or there is not enough memory)
443 | bool set_name(const char_t* rhs);
444 | bool set_name(const char_t* rhs, size_t size);
445 | #ifdef PUGIXML_HAS_STRING_VIEW
446 | bool set_name(string_view_t rhs);
447 | #endif
448 | bool set_value(const char_t* rhs);
449 | bool set_value(const char_t* rhs, size_t size);
450 | #ifdef PUGIXML_HAS_STRING_VIEW
451 | bool set_value(string_view_t rhs);
452 | #endif
453 |
454 | // Set attribute value with type conversion (numbers are converted to strings, boolean is converted to "true"/"false")
455 | bool set_value(int rhs);
456 | bool set_value(unsigned int rhs);
457 | bool set_value(long rhs);
458 | bool set_value(unsigned long rhs);
459 | bool set_value(double rhs);
460 | bool set_value(double rhs, int precision);
461 | bool set_value(float rhs);
462 | bool set_value(float rhs, int precision);
463 | bool set_value(bool rhs);
464 |
465 | #ifdef PUGIXML_HAS_LONG_LONG
466 | bool set_value(long long rhs);
467 | bool set_value(unsigned long long rhs);
468 | #endif
469 |
470 | // Set attribute value (equivalent to set_value without error checking)
471 | xml_attribute& operator=(const char_t* rhs);
472 | xml_attribute& operator=(int rhs);
473 | xml_attribute& operator=(unsigned int rhs);
474 | xml_attribute& operator=(long rhs);
475 | xml_attribute& operator=(unsigned long rhs);
476 | xml_attribute& operator=(double rhs);
477 | xml_attribute& operator=(float rhs);
478 | xml_attribute& operator=(bool rhs);
479 |
480 | #ifdef PUGIXML_HAS_STRING_VIEW
481 | xml_attribute& operator=(string_view_t rhs);
482 | #endif
483 |
484 | #ifdef PUGIXML_HAS_LONG_LONG
485 | xml_attribute& operator=(long long rhs);
486 | xml_attribute& operator=(unsigned long long rhs);
487 | #endif
488 |
489 | // Get next/previous attribute in the attribute list of the parent node
490 | xml_attribute next_attribute() const;
491 | xml_attribute previous_attribute() const;
492 |
493 | // Get hash value (unique for handles to the same object)
494 | size_t hash_value() const;
495 |
496 | // Get internal pointer (the xml_attribute_struct this handle wraps)
497 | xml_attribute_struct* internal_object() const;
498 | };
499 |
500 | #ifdef __BORLANDC__
501 | // Borland C++ workaround: explicit && and || overloads for an xml_attribute handle combined with a bool
502 | bool PUGIXML_FUNCTION operator&&(const xml_attribute& lhs, bool rhs);
503 | bool PUGIXML_FUNCTION operator||(const xml_attribute& lhs, bool rhs);
504 | #endif
505 |
506 | // A light-weight handle for manipulating nodes in DOM tree (wraps a single xml_node_struct pointer)
507 | class PUGIXML_CLASS xml_node
508 | {
509 | friend class xml_attribute_iterator;
510 | friend class xml_node_iterator;
511 | friend class xml_named_node_iterator;
512 |
513 | protected:
514 | xml_node_struct* _root;
515 |
516 | typedef void (*unspecified_bool_type)(xml_node***); // Result type of the safe bool conversion operator below
517 |
518 | public:
519 | // Default constructor. Constructs an empty node.
520 | xml_node();
521 |
522 | // Constructs node from internal pointer
523 | explicit xml_node(xml_node_struct* p);
524 |
525 | // Safe bool conversion operator
526 | operator unspecified_bool_type() const;
527 |
528 | // Borland C++ workaround
529 | bool operator!() const;
530 |
531 | // Comparison operators (compares wrapped node pointers)
532 | bool operator==(const xml_node& r) const;
533 | bool operator!=(const xml_node& r) const;
534 | bool operator<(const xml_node& r) const;
535 | bool operator>(const xml_node& r) const;
536 | bool operator<=(const xml_node& r) const;
537 | bool operator>=(const xml_node& r) const;
538 |
539 | // Check if node is empty (null)
540 | bool empty() const;
541 |
542 | // Get node type
543 | xml_node_type type() const;
544 |
545 | // Get node name, or "" if node is empty or it has no name
546 | const char_t* name() const;
547 |
548 | // Get node value, or "" if node is empty or it has no value
549 | // Note: For <node>text</node> node.value() does not return "text"! Use child_value() or text() methods to access text inside nodes.
550 | const char_t* value() const;
551 |
552 | // Get attribute list
553 | xml_attribute first_attribute() const;
554 | xml_attribute last_attribute() const;
555 |
556 | // Get children list
557 | xml_node first_child() const;
558 | xml_node last_child() const;
559 |
560 | // Get next/previous sibling in the children list of the parent node
561 | xml_node next_sibling() const;
562 | xml_node previous_sibling() const;
563 |
564 | // Get parent node
565 | xml_node parent() const;
566 |
567 | // Get root of DOM tree this node belongs to
568 | xml_node root() const;
569 |
570 | // Get text object for the current node
571 | xml_text text() const;
572 |
573 | // Get child, attribute or next/previous sibling with the specified name
574 | xml_node child(const char_t* name) const;
575 | xml_attribute attribute(const char_t* name) const;
576 | xml_node next_sibling(const char_t* name) const;
577 | xml_node previous_sibling(const char_t* name) const;
578 | #ifdef PUGIXML_HAS_STRING_VIEW
579 | xml_node child(string_view_t name) const;
580 | xml_attribute attribute(string_view_t name) const;
581 | xml_node next_sibling(string_view_t name) const;
582 | xml_node previous_sibling(string_view_t name) const;
583 | #endif
584 |
585 | // Get attribute, starting the search from a hint (and updating hint so that searching for a sequence of attributes is fast)
586 | xml_attribute attribute(const char_t* name, xml_attribute& hint) const;
587 | #ifdef PUGIXML_HAS_STRING_VIEW
588 | xml_attribute attribute(string_view_t name, xml_attribute& hint) const;
589 | #endif
590 |
591 | // Get child value of current node; that is, value of the first child node of type PCDATA/CDATA
592 | const char_t* child_value() const;
593 |
594 | // Get child value of child with specified name. Equivalent to child(name).child_value().
595 | const char_t* child_value(const char_t* name) const;
596 |
597 | // Set node name/value (returns false if node is empty, there is not enough memory, or node can not have name/value)
598 | bool set_name(const char_t* rhs);
599 | bool set_name(const char_t* rhs, size_t size);
600 | #ifdef PUGIXML_HAS_STRING_VIEW
601 | bool set_name(string_view_t rhs);
602 | #endif
603 | bool set_value(const char_t* rhs);
604 | bool set_value(const char_t* rhs, size_t size);
605 | #ifdef PUGIXML_HAS_STRING_VIEW
606 | bool set_value(string_view_t rhs);
607 | #endif
608 |
609 | // Add attribute with specified name. Returns added attribute, or empty attribute on errors.
610 | xml_attribute append_attribute(const char_t* name);
611 | xml_attribute prepend_attribute(const char_t* name);
612 | xml_attribute insert_attribute_after(const char_t* name, const xml_attribute& attr);
613 | xml_attribute insert_attribute_before(const char_t* name, const xml_attribute& attr);
614 | #ifdef PUGIXML_HAS_STRING_VIEW
615 | xml_attribute append_attribute(string_view_t name);
616 | xml_attribute prepend_attribute(string_view_t name);
617 | xml_attribute insert_attribute_after(string_view_t name, const xml_attribute& attr);
618 | xml_attribute insert_attribute_before(string_view_t name, const xml_attribute& attr);
619 | #endif
620 |
621 | // Add a copy of the specified attribute. Returns added attribute, or empty attribute on errors.
622 | xml_attribute append_copy(const xml_attribute& proto);
623 | xml_attribute prepend_copy(const xml_attribute& proto);
624 | xml_attribute insert_copy_after(const xml_attribute& proto, const xml_attribute& attr);
625 | xml_attribute insert_copy_before(const xml_attribute& proto, const xml_attribute& attr);
626 |
627 | // Add child node with specified type. Returns added node, or empty node on errors.
628 | xml_node append_child(xml_node_type type = node_element);
629 | xml_node prepend_child(xml_node_type type = node_element);
630 | xml_node insert_child_after(xml_node_type type, const xml_node& node);
631 | xml_node insert_child_before(xml_node_type type, const xml_node& node);
632 |
633 | // Add child element with specified name. Returns added node, or empty node on errors.
634 | xml_node append_child(const char_t* name);
635 | xml_node prepend_child(const char_t* name);
636 | xml_node insert_child_after(const char_t* name, const xml_node& node);
637 | xml_node insert_child_before(const char_t* name, const xml_node& node);
638 | #ifdef PUGIXML_HAS_STRING_VIEW
639 | xml_node append_child(string_view_t name);
640 | xml_node prepend_child(string_view_t name);
641 | xml_node insert_child_after(string_view_t name, const xml_node& node);
642 | xml_node insert_child_before(string_view_t name, const xml_node& node);
643 | #endif
644 |
645 | // Add a copy of the specified node as a child. Returns added node, or empty node on errors.
646 | xml_node append_copy(const xml_node& proto);
647 | xml_node prepend_copy(const xml_node& proto);
648 | xml_node insert_copy_after(const xml_node& proto, const xml_node& node);
649 | xml_node insert_copy_before(const xml_node& proto, const xml_node& node);
650 |
651 | // Move the specified node to become a child of this node. Returns moved node, or empty node on errors.
652 | xml_node append_move(const xml_node& moved);
653 | xml_node prepend_move(const xml_node& moved);
654 | xml_node insert_move_after(const xml_node& moved, const xml_node& node);
655 | xml_node insert_move_before(const xml_node& moved, const xml_node& node);
656 |
657 | // Remove specified attribute
658 | bool remove_attribute(const xml_attribute& a);
659 | bool remove_attribute(const char_t* name);
660 | #ifdef PUGIXML_HAS_STRING_VIEW
661 | bool remove_attribute(string_view_t name);
662 | #endif
663 |
664 | // Remove all attributes
665 | bool remove_attributes();
666 |
667 | // Remove specified child
668 | bool remove_child(const xml_node& n);
669 | bool remove_child(const char_t* name);
670 | #ifdef PUGIXML_HAS_STRING_VIEW
671 | bool remove_child(string_view_t name);
672 | #endif
673 |
674 | // Remove all children
675 | bool remove_children();
676 |
677 | // Parses buffer as an XML document fragment and appends all nodes as children of the current node.
678 | // Copies/converts the buffer, so it may be deleted or changed after the function returns.
679 | // Note: append_buffer allocates memory that has the lifetime of the owning document; removing the appended nodes does not immediately reclaim that memory.
680 | xml_parse_result append_buffer(const void* contents, size_t size, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
681 |
682 | // Find attribute using predicate. Returns first attribute for which predicate returned true, or an empty attribute if there is none.
683 | template <typename Predicate> xml_attribute find_attribute(Predicate pred) const
684 | {
685 | if (!_root) return xml_attribute();
686 |
687 | for (xml_attribute attrib = first_attribute(); attrib; attrib = attrib.next_attribute())
688 | if (pred(attrib))
689 | return attrib;
690 |
691 | return xml_attribute();
692 | }
693 |
694 | // Find child node using predicate. Returns first child for which predicate returned true, or an empty node if there is none.
695 | template <typename Predicate> xml_node find_child(Predicate pred) const
696 | {
697 | if (!_root) return xml_node();
698 |
699 | for (xml_node node = first_child(); node; node = node.next_sibling())
700 | if (pred(node))
701 | return node;
702 |
703 | return xml_node();
704 | }
705 |
706 | // Find node from subtree using predicate. Returns first node from subtree (depth-first), for which predicate returned true, or an empty node if there is none.
707 | template <typename Predicate> xml_node find_node(Predicate pred) const
708 | {
709 | if (!_root) return xml_node();
710 |
711 | xml_node cur = first_child();
712 |
713 | while (cur._root && cur._root != _root) // Stop when there are no more nodes or the walk has climbed back to this node
714 | {
715 | if (pred(cur)) return cur;
716 |
717 | if (cur.first_child()) cur = cur.first_child(); // Descend first
718 | else if (cur.next_sibling()) cur = cur.next_sibling(); // Otherwise move to the next sibling
719 | else
720 | {
721 | while (!cur.next_sibling() && cur._root != _root) cur = cur.parent(); // Climb until a sibling exists or this node is reached
722 |
723 | if (cur._root != _root) cur = cur.next_sibling();
724 | }
725 | }
726 |
727 | return xml_node();
728 | }
729 |
730 | // Find child node by attribute name/value
731 | xml_node find_child_by_attribute(const char_t* name, const char_t* attr_name, const char_t* attr_value) const;
732 | xml_node find_child_by_attribute(const char_t* attr_name, const char_t* attr_value) const;
733 |
734 | #ifndef PUGIXML_NO_STL
735 | // Get the absolute node path from root as a text string.
736 | string_t path(char_t delimiter = '/') const;
737 | #endif
738 |
739 | // Search for a node by path consisting of node names and . or .. elements.
740 | xml_node first_element_by_path(const char_t* path, char_t delimiter = '/') const;
741 |
742 | // Recursively traverse subtree with xml_tree_walker
743 | bool traverse(xml_tree_walker& walker);
744 |
745 | #ifndef PUGIXML_NO_XPATH
746 | // Select single node by evaluating XPath query. Returns first node from the resulting node set.
747 | xpath_node select_node(const char_t* query, xpath_variable_set* variables = PUGIXML_NULL) const;
748 | xpath_node select_node(const xpath_query& query) const;
749 |
750 | // Select node set by evaluating XPath query
751 | xpath_node_set select_nodes(const char_t* query, xpath_variable_set* variables = PUGIXML_NULL) const;
752 | xpath_node_set select_nodes(const xpath_query& query) const;
753 |
754 | // (deprecated: use select_node instead) Select single node by evaluating XPath query.
755 | PUGIXML_DEPRECATED xpath_node select_single_node(const char_t* query, xpath_variable_set* variables = PUGIXML_NULL) const;
756 | PUGIXML_DEPRECATED xpath_node select_single_node(const xpath_query& query) const;
757 |
758 | #endif
759 |
760 | // Print subtree using a writer object
761 | void print(xml_writer& writer, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto, unsigned int depth = 0) const;
762 |
763 | #ifndef PUGIXML_NO_STL
764 | // Print subtree to stream (the wide-stream overload takes no encoding argument)
765 | void print(std::basic_ostream<char, std::char_traits<char> >& os, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto, unsigned int depth = 0) const;
766 | void print(std::basic_ostream<wchar_t, std::char_traits<wchar_t> >& os, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, unsigned int depth = 0) const;
767 | #endif
768 |
769 | // Child nodes iterators
770 | typedef xml_node_iterator iterator;
771 |
772 | iterator begin() const;
773 | iterator end() const;
774 |
775 | // Attribute iterators
776 | typedef xml_attribute_iterator attribute_iterator;
777 |
778 | attribute_iterator attributes_begin() const;
779 | attribute_iterator attributes_end() const;
780 |
781 | // Range-based for support
782 | xml_object_range<xml_node_iterator> children() const;
783 | xml_object_range<xml_attribute_iterator> attributes() const;
784 |
785 | // Range-based for support for all children with the specified name
786 | // Note: name pointer must have a longer lifetime than the returned object; be careful with passing temporaries!
787 | xml_object_range<xml_named_node_iterator> children(const char_t* name) const;
788 |
789 | // Get node offset in parsed file/string (in char_t units) for debugging purposes
790 | ptrdiff_t offset_debug() const;
791 |
792 | // Get hash value (unique for handles to the same object)
793 | size_t hash_value() const;
794 |
795 | // Get internal pointer (the xml_node_struct this handle wraps)
796 | xml_node_struct* internal_object() const;
797 | };
798 |
799 | #ifdef __BORLANDC__
800 | // Borland C++ workaround: explicit && and || overloads for an xml_node handle combined with a bool
801 | bool PUGIXML_FUNCTION operator&&(const xml_node& lhs, bool rhs);
802 | bool PUGIXML_FUNCTION operator||(const xml_node& lhs, bool rhs);
803 | #endif
804 |
805 | // A helper for working with text inside PCDATA nodes
806 | class PUGIXML_CLASS xml_text
807 | {
808 | friend class xml_node;
809 |
810 | xml_node_struct* _root;
811 |
812 | typedef void (*unspecified_bool_type)(xml_text***); // Result type of the safe bool conversion operator below
813 |
814 | explicit xml_text(xml_node_struct* root); // Private; xml_node is a friend and exposes text()
815 |
816 | xml_node_struct* _data_new();
817 | xml_node_struct* _data() const;
818 |
819 | public:
820 | // Default constructor. Constructs an empty object.
821 | xml_text();
822 |
823 | // Safe bool conversion operator
824 | operator unspecified_bool_type() const;
825 |
826 | // Borland C++ workaround
827 | bool operator!() const;
828 |
829 | // Check if text object is empty (null)
830 | bool empty() const;
831 |
832 | // Get text, or "" if object is empty
833 | const char_t* get() const;
834 |
835 | // Get text, or the default value if object is empty
836 | const char_t* as_string(const char_t* def = PUGIXML_TEXT("")) const;
837 |
838 | // Get text as a number, or the default value if conversion did not succeed or object is empty
839 | int as_int(int def = 0) const;
840 | unsigned int as_uint(unsigned int def = 0) const;
841 | double as_double(double def = 0) const;
842 | float as_float(float def = 0) const;
843 |
844 | #ifdef PUGIXML_HAS_LONG_LONG
845 | long long as_llong(long long def = 0) const;
846 | unsigned long long as_ullong(unsigned long long def = 0) const;
847 | #endif
848 |
849 | // Get text as bool (returns true if first character is in '1tTyY' set), or the default value if object is empty
850 | bool as_bool(bool def = false) const;
851 |
852 | // Set text (returns false if object is empty or there is not enough memory)
853 | bool set(const char_t* rhs);
854 | bool set(const char_t* rhs, size_t size);
855 | #ifdef PUGIXML_HAS_STRING_VIEW
856 | bool set(string_view_t rhs);
857 | #endif
858 |
859 | // Set text with type conversion (numbers are converted to strings, boolean is converted to "true"/"false")
860 | bool set(int rhs);
861 | bool set(unsigned int rhs);
862 | bool set(long rhs);
863 | bool set(unsigned long rhs);
864 | bool set(double rhs);
865 | bool set(double rhs, int precision);
866 | bool set(float rhs);
867 | bool set(float rhs, int precision);
868 | bool set(bool rhs);
869 |
870 | #ifdef PUGIXML_HAS_LONG_LONG
871 | bool set(long long rhs);
872 | bool set(unsigned long long rhs);
873 | #endif
874 |
875 | // Set text (equivalent to set without error checking)
876 | xml_text& operator=(const char_t* rhs);
877 | xml_text& operator=(int rhs);
878 | xml_text& operator=(unsigned int rhs);
879 | xml_text& operator=(long rhs);
880 | xml_text& operator=(unsigned long rhs);
881 | xml_text& operator=(double rhs);
882 | xml_text& operator=(float rhs);
883 | xml_text& operator=(bool rhs);
884 |
885 | #ifdef PUGIXML_HAS_STRING_VIEW
886 | xml_text& operator=(string_view_t rhs);
887 | #endif
888 |
889 | #ifdef PUGIXML_HAS_LONG_LONG
890 | xml_text& operator=(long long rhs);
891 | xml_text& operator=(unsigned long long rhs);
892 | #endif
893 |
894 | // Get the data node (node_pcdata or node_cdata) for this object
895 | xml_node data() const;
896 | };
897 |
898 | #ifdef __BORLANDC__
899 | // Borland C++ workaround: explicit && and || overloads for an xml_text object combined with a bool
900 | bool PUGIXML_FUNCTION operator&&(const xml_text& lhs, bool rhs);
901 | bool PUGIXML_FUNCTION operator||(const xml_text& lhs, bool rhs);
902 | #endif
903 |
904 | // Child node iterator (a bidirectional iterator over a collection of xml_node)
905 | class PUGIXML_CLASS xml_node_iterator
906 | {
907 | friend class xml_node;
908 |
909 | private:
910 | mutable xml_node _wrap; // mutable: operator*() and operator->() are const yet give non-const xml_node access (presumably to this member)
911 | xml_node _parent;
912 |
913 | xml_node_iterator(xml_node_struct* ref, xml_node_struct* parent); // Private; xml_node is a friend
914 |
915 | public:
916 | // Iterator traits
917 | typedef ptrdiff_t difference_type;
918 | typedef xml_node value_type;
919 | typedef xml_node* pointer;
920 | typedef xml_node& reference;
921 |
922 | #ifndef PUGIXML_NO_STL
923 | typedef std::bidirectional_iterator_tag iterator_category;
924 | #endif
925 |
926 | // Default constructor
927 | xml_node_iterator();
928 |
929 | // Construct an iterator which points to the specified node
930 | xml_node_iterator(const xml_node& node);
931 |
932 | // Iterator operators
933 | bool operator==(const xml_node_iterator& rhs) const;
934 | bool operator!=(const xml_node_iterator& rhs) const;
935 |
936 | xml_node& operator*() const;
937 | xml_node* operator->() const;
938 |
939 | xml_node_iterator& operator++();
940 | xml_node_iterator operator++(int);
941 |
942 | xml_node_iterator& operator--();
943 | xml_node_iterator operator--(int);
944 | };
945 |
946 | // Attribute iterator (a bidirectional iterator over a collection of xml_attribute)
947 | class PUGIXML_CLASS xml_attribute_iterator
948 | {
949 | friend class xml_node;
950 |
951 | private:
952 | mutable xml_attribute _wrap; // mutable: operator*() and operator->() are const yet give non-const xml_attribute access (presumably to this member)
953 | xml_node _parent;
954 |
955 | xml_attribute_iterator(xml_attribute_struct* ref, xml_node_struct* parent); // Private; xml_node is a friend
956 |
957 | public:
958 | // Iterator traits
959 | typedef ptrdiff_t difference_type;
960 | typedef xml_attribute value_type;
961 | typedef xml_attribute* pointer;
962 | typedef xml_attribute& reference;
963 |
964 | #ifndef PUGIXML_NO_STL
965 | typedef std::bidirectional_iterator_tag iterator_category;
966 | #endif
967 |
968 | // Default constructor
969 | xml_attribute_iterator();
970 |
971 | // Construct an iterator which points to the specified attribute (the owning node is passed explicitly as parent)
972 | xml_attribute_iterator(const xml_attribute& attr, const xml_node& parent);
973 |
974 | // Iterator operators
975 | bool operator==(const xml_attribute_iterator& rhs) const;
976 | bool operator!=(const xml_attribute_iterator& rhs) const;
977 |
978 | xml_attribute& operator*() const;
979 | xml_attribute* operator->() const;
980 |
981 | xml_attribute_iterator& operator++();
982 | xml_attribute_iterator operator++(int);
983 |
984 | xml_attribute_iterator& operator--();
985 | xml_attribute_iterator operator--(int);
986 | };
987 |
988 | // Named node range helper (iterator used by xml_node::children(name))
989 | class PUGIXML_CLASS xml_named_node_iterator
990 | {
991 | friend class xml_node;
992 |
993 | public:
994 | // Iterator traits
995 | typedef ptrdiff_t difference_type;
996 | typedef xml_node value_type;
997 | typedef xml_node* pointer;
998 | typedef xml_node& reference;
999 |
1000 | #ifndef PUGIXML_NO_STL
1001 | typedef std::bidirectional_iterator_tag iterator_category;
1002 | #endif
1003 |
1004 | // Default constructor
1005 | xml_named_node_iterator();
1006 |
1007 | // Construct an iterator which points to the specified node
1008 | // Note: name pointer is stored in the iterator and must have a longer lifetime than iterator itself
1009 | xml_named_node_iterator(const xml_node& node, const char_t* name);
1010 |
1011 | // Iterator operators
1012 | bool operator==(const xml_named_node_iterator& rhs) const;
1013 | bool operator!=(const xml_named_node_iterator& rhs) const;
1014 |
1015 | xml_node& operator*() const;
1016 | xml_node* operator->() const;
1017 |
1018 | xml_named_node_iterator& operator++();
1019 | xml_named_node_iterator operator++(int);
1020 |
1021 | xml_named_node_iterator& operator--();
1022 | xml_named_node_iterator operator--(int);
1023 |
1024 | private:
1025 | mutable xml_node _wrap; // mutable: operator*() and operator->() are const yet give non-const xml_node access (presumably to this member)
1026 | xml_node _parent;
1027 | const char_t* _name; // Non-owning name pointer (see the lifetime note on the public constructor)
1028 |
1029 | xml_named_node_iterator(xml_node_struct* ref, xml_node_struct* parent, const char_t* name); // Private; xml_node is a friend
1030 | };
1031 |
1032 | // Abstract tree walker class (see xml_node::traverse); for_each() is pure virtual and must be overridden
1033 | class PUGIXML_CLASS xml_tree_walker
1034 | {
1035 | friend class xml_node;
1036 |
1037 | private:
1038 | int _depth; // NOTE(review): presumably maintained by xml_node::traverse (xml_node is a friend) - confirm in pugixml.cpp
1039 |
1040 | protected:
1041 | // Get current traversal depth
1042 | int depth() const;
1043 |
1044 | public:
1045 | xml_tree_walker();
1046 | virtual ~xml_tree_walker();
1047 |
1048 | // Callback that is called when traversal begins
1049 | virtual bool begin(xml_node& node);
1050 |
1051 | // Callback that is called for each node traversed
1052 | virtual bool for_each(xml_node& node) = 0;
1053 |
1054 | // Callback that is called when traversal ends
1055 | virtual bool end(xml_node& node);
1056 | };
1057 |
1058 | // Parsing status, returned as part of xml_parse_result object
1059 | enum xml_parse_status
1060 | {
1061 | status_ok = 0, // No error
1062 |
1063 | status_file_not_found, // File was not found during load_file()
1064 | status_io_error, // Error reading from file/stream
1065 | status_out_of_memory, // Could not allocate memory
1066 | status_internal_error, // Internal error occurred
1067 |
1068 | status_unrecognized_tag, // Parser could not determine tag type
1069 |
1070 | status_bad_pi, // Parsing error occurred while parsing document declaration/processing instruction
1071 | status_bad_comment, // Parsing error occurred while parsing comment
1072 | status_bad_cdata, // Parsing error occurred while parsing CDATA section
1073 | status_bad_doctype, // Parsing error occurred while parsing document type declaration
1074 | status_bad_pcdata, // Parsing error occurred while parsing PCDATA section
1075 | status_bad_start_element, // Parsing error occurred while parsing start element tag
1076 | status_bad_attribute, // Parsing error occurred while parsing element attribute
1077 | status_bad_end_element, // Parsing error occurred while parsing end element tag
1078 | status_end_element_mismatch,// There was a mismatch of start-end tags (closing tag had incorrect name, some tag was not closed or there was an excessive closing tag)
1079 |
1080 | status_append_invalid_root, // Unable to append nodes since root type is not node_element or node_document (exclusive to xml_node::append_buffer)
1081 |
1082 | status_no_document_element // Parsing resulted in a document without element nodes
1083 | };
1084 |
1085 | // Parsing result
1086 | struct PUGIXML_CLASS xml_parse_result
1087 | {
1088 | // Parsing status (see xml_parse_status)
1089 | xml_parse_status status;
1090 |
1091 | // Last parsed offset (in char_t units from start of input data)
1092 | ptrdiff_t offset;
1093 |
1094 | // Source document encoding
1095 | xml_encoding encoding;
1096 |
1097 | // Default constructor, initializes object to failed state
1098 | xml_parse_result();
1099 |
1100 | // Cast to bool operator
1101 | operator bool() const;
1102 |
1103 | // Get error description
1104 | const char* description() const;
1105 | };
1106 |
1107 | // Document class (DOM tree root)
1108 | class PUGIXML_CLASS xml_document: public xml_node
1109 | {
1110 | private:
1111 | char_t* _buffer;
1112 |
1113 | char _memory[192];
1114 |
1115 | // Non-copyable semantics
1116 | xml_document(const xml_document&);
1117 | xml_document& operator=(const xml_document&);
1118 |
1119 | void _create();
1120 | void _destroy();
1121 | void _move(xml_document& rhs) PUGIXML_NOEXCEPT_IF_NOT_COMPACT;
1122 |
1123 | public:
1124 | // Default constructor, makes empty document
1125 | xml_document();
1126 |
1127 | // Destructor, invalidates all node/attribute handles to this document
1128 | ~xml_document();
1129 |
1130 | #ifdef PUGIXML_HAS_MOVE
1131 | // Move semantics support
1132 | xml_document(xml_document&& rhs) PUGIXML_NOEXCEPT_IF_NOT_COMPACT;
1133 | xml_document& operator=(xml_document&& rhs) PUGIXML_NOEXCEPT_IF_NOT_COMPACT;
1134 | #endif
1135 |
1136 | // Removes all nodes, leaving the empty document
1137 | void reset();
1138 |
1139 | // Removes all nodes, then copies the entire contents of the specified document
1140 | void reset(const xml_document& proto);
1141 |
1142 | #ifndef PUGIXML_NO_STL
1143 | // Load document from stream.
1144 | xml_parse_result load(std::basic_istream& stream, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
1145 | xml_parse_result load(std::basic_istream& stream, unsigned int options = parse_default);
1146 | #endif
1147 |
1148 | // (deprecated: use load_string instead) Load document from zero-terminated string. No encoding conversions are applied.
1149 | PUGIXML_DEPRECATED xml_parse_result load(const char_t* contents, unsigned int options = parse_default);
1150 |
1151 | // Load document from zero-terminated string. No encoding conversions are applied.
1152 | xml_parse_result load_string(const char_t* contents, unsigned int options = parse_default);
1153 |
1154 | // Load document from file
1155 | xml_parse_result load_file(const char* path, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
1156 | xml_parse_result load_file(const wchar_t* path, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
1157 |
1158 | // Load document from buffer. Copies/converts the buffer, so it may be deleted or changed after the function returns.
1159 | xml_parse_result load_buffer(const void* contents, size_t size, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
1160 |
1161 | // Load document from buffer, using the buffer for in-place parsing (the buffer is modified and used for storage of document data).
1162 | // You should ensure that buffer data will persist throughout the document's lifetime, and free the buffer memory manually once document is destroyed.
1163 | xml_parse_result load_buffer_inplace(void* contents, size_t size, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
1164 |
1165 | // Load document from buffer, using the buffer for in-place parsing (the buffer is modified and used for storage of document data).
1166 | // You should allocate the buffer with pugixml allocation function; document will free the buffer when it is no longer needed (you can't use it anymore).
1167 | xml_parse_result load_buffer_inplace_own(void* contents, size_t size, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
1168 |
1169 | // Save XML document to writer (semantics is slightly different from xml_node::print, see documentation for details).
1170 | void save(xml_writer& writer, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const;
1171 |
1172 | #ifndef PUGIXML_NO_STL
1173 | // Save XML document to stream (semantics is slightly different from xml_node::print, see documentation for details).
1174 | void save(std::basic_ostream& stream, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const;
1175 | void save(std::basic_ostream& stream, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default) const;
1176 | #endif
1177 |
1178 | // Save XML to file
1179 | bool save_file(const char* path, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const;
1180 | bool save_file(const wchar_t* path, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const;
1181 |
1182 | // Get document element
1183 | xml_node document_element() const;
1184 | };
1185 |
1186 | #ifndef PUGIXML_NO_XPATH
1187 | // XPath query return type
1188 | enum xpath_value_type
1189 | {
1190 | xpath_type_none, // Unknown type (query failed to compile)
1191 | xpath_type_node_set, // Node set (xpath_node_set)
1192 | xpath_type_number, // Number
1193 | xpath_type_string, // String
1194 | xpath_type_boolean // Boolean
1195 | };
1196 |
1197 | // XPath parsing result
1198 | struct PUGIXML_CLASS xpath_parse_result
1199 | {
1200 | // Error message (0 if no error)
1201 | const char* error;
1202 |
1203 | // Last parsed offset (in char_t units from string start)
1204 | ptrdiff_t offset;
1205 |
1206 | // Default constructor, initializes object to failed state
1207 | xpath_parse_result();
1208 |
1209 | // Cast to bool operator
1210 | operator bool() const;
1211 |
1212 | // Get error description
1213 | const char* description() const;
1214 | };
1215 |
1216 | // A single XPath variable
1217 | class PUGIXML_CLASS xpath_variable
1218 | {
1219 | friend class xpath_variable_set;
1220 |
1221 | protected:
1222 | xpath_value_type _type;
1223 | xpath_variable* _next;
1224 |
1225 | xpath_variable(xpath_value_type type);
1226 |
1227 | // Non-copyable semantics
1228 | xpath_variable(const xpath_variable&);
1229 | xpath_variable& operator=(const xpath_variable&);
1230 |
1231 | public:
1232 | // Get variable name
1233 | const char_t* name() const;
1234 |
1235 | // Get variable type
1236 | xpath_value_type type() const;
1237 |
1238 | // Get variable value; no type conversion is performed, default value (false, NaN, empty string, empty node set) is returned on type mismatch error
1239 | bool get_boolean() const;
1240 | double get_number() const;
1241 | const char_t* get_string() const;
1242 | const xpath_node_set& get_node_set() const;
1243 |
1244 | // Set variable value; no type conversion is performed, false is returned on type mismatch error
1245 | bool set(bool value);
1246 | bool set(double value);
1247 | bool set(const char_t* value);
1248 | bool set(const xpath_node_set& value);
1249 | };
1250 |
1251 | // A set of XPath variables
1252 | class PUGIXML_CLASS xpath_variable_set
1253 | {
1254 | private:
1255 | xpath_variable* _data[64];
1256 |
1257 | void _assign(const xpath_variable_set& rhs);
1258 | void _swap(xpath_variable_set& rhs);
1259 |
1260 | xpath_variable* _find(const char_t* name) const;
1261 |
1262 | static bool _clone(xpath_variable* var, xpath_variable** out_result);
1263 | static void _destroy(xpath_variable* var);
1264 |
1265 | public:
1266 | // Default constructor/destructor
1267 | xpath_variable_set();
1268 | ~xpath_variable_set();
1269 |
1270 | // Copy constructor/assignment operator
1271 | xpath_variable_set(const xpath_variable_set& rhs);
1272 | xpath_variable_set& operator=(const xpath_variable_set& rhs);
1273 |
1274 | #ifdef PUGIXML_HAS_MOVE
1275 | // Move semantics support
1276 | xpath_variable_set(xpath_variable_set&& rhs) PUGIXML_NOEXCEPT;
1277 | xpath_variable_set& operator=(xpath_variable_set&& rhs) PUGIXML_NOEXCEPT;
1278 | #endif
1279 |
1280 | // Add a new variable or get the existing one, if the types match
1281 | xpath_variable* add(const char_t* name, xpath_value_type type);
1282 |
1283 | // Set value of an existing variable; no type conversion is performed, false is returned if there is no such variable or if types mismatch
1284 | bool set(const char_t* name, bool value);
1285 | bool set(const char_t* name, double value);
1286 | bool set(const char_t* name, const char_t* value);
1287 | bool set(const char_t* name, const xpath_node_set& value);
1288 |
1289 | // Get existing variable by name
1290 | xpath_variable* get(const char_t* name);
1291 | const xpath_variable* get(const char_t* name) const;
1292 | };
1293 |
1294 | // A compiled XPath query object
1295 | class PUGIXML_CLASS xpath_query
1296 | {
1297 | private:
1298 | void* _impl;
1299 | xpath_parse_result _result;
1300 |
1301 | typedef void (*unspecified_bool_type)(xpath_query***);
1302 |
1303 | // Non-copyable semantics
1304 | xpath_query(const xpath_query&);
1305 | xpath_query& operator=(const xpath_query&);
1306 |
1307 | public:
1308 | // Construct a compiled object from XPath expression.
1309 | // If PUGIXML_NO_EXCEPTIONS is not defined, throws xpath_exception on compilation errors.
1310 | explicit xpath_query(const char_t* query, xpath_variable_set* variables = PUGIXML_NULL);
1311 |
1312 | // Constructor
1313 | xpath_query();
1314 |
1315 | // Destructor
1316 | ~xpath_query();
1317 |
1318 | #ifdef PUGIXML_HAS_MOVE
1319 | // Move semantics support
1320 | xpath_query(xpath_query&& rhs) PUGIXML_NOEXCEPT;
1321 | xpath_query& operator=(xpath_query&& rhs) PUGIXML_NOEXCEPT;
1322 | #endif
1323 |
1324 | // Get query expression return type
1325 | xpath_value_type return_type() const;
1326 |
1327 | // Evaluate expression as boolean value in the specified context; performs type conversion if necessary.
1328 | // If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors.
1329 | bool evaluate_boolean(const xpath_node& n) const;
1330 |
1331 | // Evaluate expression as double value in the specified context; performs type conversion if necessary.
1332 | // If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors.
1333 | double evaluate_number(const xpath_node& n) const;
1334 |
1335 | #ifndef PUGIXML_NO_STL
1336 | // Evaluate expression as string value in the specified context; performs type conversion if necessary.
1337 | // If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors.
1338 | string_t evaluate_string(const xpath_node& n) const;
1339 | #endif
1340 |
1341 | // Evaluate expression as string value in the specified context; performs type conversion if necessary.
1342 | // At most capacity characters are written to the destination buffer, full result size is returned (includes terminating zero).
1343 | // If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors.
1344 | // If PUGIXML_NO_EXCEPTIONS is defined, returns empty string instead.
1345 | size_t evaluate_string(char_t* buffer, size_t capacity, const xpath_node& n) const;
1346 |
1347 | // Evaluate expression as node set in the specified context.
1348 | // If PUGIXML_NO_EXCEPTIONS is not defined, throws xpath_exception on type mismatch and std::bad_alloc on out of memory errors.
1349 | // If PUGIXML_NO_EXCEPTIONS is defined, returns empty node set instead.
1350 | xpath_node_set evaluate_node_set(const xpath_node& n) const;
1351 |
1352 | // Evaluate expression as node set in the specified context.
1353 | // Return first node in document order, or empty node if node set is empty.
1354 | // If PUGIXML_NO_EXCEPTIONS is not defined, throws xpath_exception on type mismatch and std::bad_alloc on out of memory errors.
1355 | // If PUGIXML_NO_EXCEPTIONS is defined, returns empty node instead.
1356 | xpath_node evaluate_node(const xpath_node& n) const;
1357 |
1358 | // Get parsing result (used to get compilation errors in PUGIXML_NO_EXCEPTIONS mode)
1359 | const xpath_parse_result& result() const;
1360 |
1361 | // Safe bool conversion operator
1362 | operator unspecified_bool_type() const;
1363 |
1364 | // Borland C++ workaround
1365 | bool operator!() const;
1366 | };
1367 |
1368 | #ifndef PUGIXML_NO_EXCEPTIONS
1369 | #if defined(_MSC_VER)
1370 | // C4275 can be ignored in Visual C++ if you are deriving
1371 | // from a type in the Standard C++ Library
1372 | #pragma warning(push)
1373 | #pragma warning(disable: 4275)
1374 | #endif
1375 | // XPath exception class
1376 | class PUGIXML_CLASS xpath_exception: public std::exception
1377 | {
1378 | private:
1379 | xpath_parse_result _result;
1380 |
1381 | public:
1382 | // Construct exception from parse result
1383 | explicit xpath_exception(const xpath_parse_result& result);
1384 |
1385 | // Get error message
1386 | virtual const char* what() const PUGIXML_NOEXCEPT PUGIXML_OVERRIDE;
1387 |
1388 | // Get parse result
1389 | const xpath_parse_result& result() const;
1390 | };
1391 | #if defined(_MSC_VER)
1392 | #pragma warning(pop)
1393 | #endif
1394 | #endif
1395 |
1396 | // XPath node class (either xml_node or xml_attribute)
1397 | class PUGIXML_CLASS xpath_node
1398 | {
1399 | private:
1400 | xml_node _node;
1401 | xml_attribute _attribute;
1402 |
1403 | typedef void (*unspecified_bool_type)(xpath_node***);
1404 |
1405 | public:
1406 | // Default constructor; constructs empty XPath node
1407 | xpath_node();
1408 |
1409 | // Construct XPath node from XML node/attribute
1410 | xpath_node(const xml_node& node);
1411 | xpath_node(const xml_attribute& attribute, const xml_node& parent);
1412 |
1413 | // Get node/attribute, if any
1414 | xml_node node() const;
1415 | xml_attribute attribute() const;
1416 |
1417 | // Get parent of contained node/attribute
1418 | xml_node parent() const;
1419 |
1420 | // Safe bool conversion operator
1421 | operator unspecified_bool_type() const;
1422 |
1423 | // Borland C++ workaround
1424 | bool operator!() const;
1425 |
1426 | // Comparison operators
1427 | bool operator==(const xpath_node& n) const;
1428 | bool operator!=(const xpath_node& n) const;
1429 | };
1430 |
1431 | #ifdef __BORLANDC__
1432 | // Borland C++ workaround
1433 | bool PUGIXML_FUNCTION operator&&(const xpath_node& lhs, bool rhs);
1434 | bool PUGIXML_FUNCTION operator||(const xpath_node& lhs, bool rhs);
1435 | #endif
1436 |
1437 | // A fixed-size collection of XPath nodes
1438 | class PUGIXML_CLASS xpath_node_set
1439 | {
1440 | public:
1441 | // Collection type
1442 | enum type_t
1443 | {
1444 | type_unsorted, // Not ordered
1445 | type_sorted, // Sorted by document order (ascending)
1446 | type_sorted_reverse // Sorted by document order (descending)
1447 | };
1448 |
1449 | // Constant iterator type
1450 | typedef const xpath_node* const_iterator;
1451 |
1452 | // We define non-constant iterator to be the same as constant iterator so that various generic algorithms (i.e. boost foreach) work
1453 | typedef const xpath_node* iterator;
1454 |
1455 | // Default constructor. Constructs empty set.
1456 | xpath_node_set();
1457 |
1458 | // Constructs a set from iterator range; data is not checked for duplicates and is not sorted according to provided type, so be careful
1459 | xpath_node_set(const_iterator begin, const_iterator end, type_t type = type_unsorted);
1460 |
1461 | // Destructor
1462 | ~xpath_node_set();
1463 |
1464 | // Copy constructor/assignment operator
1465 | xpath_node_set(const xpath_node_set& ns);
1466 | xpath_node_set& operator=(const xpath_node_set& ns);
1467 |
1468 | #ifdef PUGIXML_HAS_MOVE
1469 | // Move semantics support
1470 | xpath_node_set(xpath_node_set&& rhs) PUGIXML_NOEXCEPT;
1471 | xpath_node_set& operator=(xpath_node_set&& rhs) PUGIXML_NOEXCEPT;
1472 | #endif
1473 |
1474 | // Get collection type
1475 | type_t type() const;
1476 |
1477 | // Get collection size
1478 | size_t size() const;
1479 |
1480 | // Indexing operator
1481 | const xpath_node& operator[](size_t index) const;
1482 |
1483 | // Collection iterators
1484 | const_iterator begin() const;
1485 | const_iterator end() const;
1486 |
1487 | // Sort the collection in ascending/descending order by document order
1488 | void sort(bool reverse = false);
1489 |
1490 | // Get first node in the collection by document order
1491 | xpath_node first() const;
1492 |
1493 | // Check if collection is empty
1494 | bool empty() const;
1495 |
1496 | private:
1497 | type_t _type;
1498 |
1499 | xpath_node _storage[1];
1500 |
1501 | xpath_node* _begin;
1502 | xpath_node* _end;
1503 |
1504 | void _assign(const_iterator begin, const_iterator end, type_t type);
1505 | void _move(xpath_node_set& rhs) PUGIXML_NOEXCEPT;
1506 | };
1507 | #endif
1508 |
1509 | #ifndef PUGIXML_NO_STL
1510 | // Convert wide string to UTF8
1511 | std::basic_string PUGIXML_FUNCTION as_utf8(const wchar_t* str);
1512 | std::basic_string PUGIXML_FUNCTION as_utf8(const std::basic_string& str);
1513 |
1514 | // Convert UTF8 to wide string
1515 | std::basic_string PUGIXML_FUNCTION as_wide(const char* str);
1516 | std::basic_string PUGIXML_FUNCTION as_wide(const std::basic_string& str);
1517 | #endif
1518 |
1519 | // Memory allocation function interface; returns pointer to allocated memory or NULL on failure
1520 | typedef void* (*allocation_function)(size_t size);
1521 |
1522 | // Memory deallocation function interface
1523 | typedef void (*deallocation_function)(void* ptr);
1524 |
1525 | // Override default memory management functions. All subsequent allocations/deallocations will be performed via supplied functions.
1526 | void PUGIXML_FUNCTION set_memory_management_functions(allocation_function allocate, deallocation_function deallocate);
1527 |
1528 | // Get current memory management functions
1529 | allocation_function PUGIXML_FUNCTION get_memory_allocation_function();
1530 | deallocation_function PUGIXML_FUNCTION get_memory_deallocation_function();
1531 | }
1532 |
1533 | #if !defined(PUGIXML_NO_STL) && (defined(_MSC_VER) || defined(__ICC))
1534 | namespace std
1535 | {
1536 | // Workarounds for (non-standard) iterator category detection for older versions (MSVC7/IC8 and earlier)
1537 | std::bidirectional_iterator_tag PUGIXML_FUNCTION _Iter_cat(const pugi::xml_node_iterator&);
1538 | std::bidirectional_iterator_tag PUGIXML_FUNCTION _Iter_cat(const pugi::xml_attribute_iterator&);
1539 | std::bidirectional_iterator_tag PUGIXML_FUNCTION _Iter_cat(const pugi::xml_named_node_iterator&);
1540 | }
1541 | #endif
1542 |
1543 | #if !defined(PUGIXML_NO_STL) && defined(__SUNPRO_CC)
1544 | namespace std
1545 | {
1546 | // Workarounds for (non-standard) iterator category detection
1547 | std::bidirectional_iterator_tag PUGIXML_FUNCTION __iterator_category(const pugi::xml_node_iterator&);
1548 | std::bidirectional_iterator_tag PUGIXML_FUNCTION __iterator_category(const pugi::xml_attribute_iterator&);
1549 | std::bidirectional_iterator_tag PUGIXML_FUNCTION __iterator_category(const pugi::xml_named_node_iterator&);
1550 | }
1551 | #endif
1552 |
1553 | #endif
1554 |
1555 | // Make sure implementation is included in header-only mode
1556 | // Use macro expansion in #include to work around QMake (QTBUG-11923)
1557 | #if defined(PUGIXML_HEADER_ONLY) && !defined(PUGIXML_SOURCE)
1558 | # define PUGIXML_SOURCE "pugixml.cpp"
1559 | # include PUGIXML_SOURCE
1560 | #endif
1561 |
1562 | /**
1563 | * Copyright (c) 2006-2025 Arseny Kapoulkine
1564 | *
1565 | * Permission is hereby granted, free of charge, to any person
1566 | * obtaining a copy of this software and associated documentation
1567 | * files (the "Software"), to deal in the Software without
1568 | * restriction, including without limitation the rights to use,
1569 | * copy, modify, merge, publish, distribute, sublicense, and/or sell
1570 | * copies of the Software, and to permit persons to whom the
1571 | * Software is furnished to do so, subject to the following
1572 | * conditions:
1573 | *
1574 | * The above copyright notice and this permission notice shall be
1575 | * included in all copies or substantial portions of the Software.
1576 | *
1577 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
1578 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
1579 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
1580 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
1581 | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
1582 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
1583 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
1584 | * OTHER DEALINGS IN THE SOFTWARE.
1585 | */
1586 |
--------------------------------------------------------------------------------
/c_src/utf8_cleanup.cc:
--------------------------------------------------------------------------------
1 | #include "utf8_cleanup.h"
2 |
3 | //code from pugixml.cpp
4 | //Copyright (C) 2006-2017, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
5 | //modified to strip some invalid unicode intervals
6 |
// Output policy for utf8_decoder: encodes a code point as UTF-8 at `result` and
// returns the advanced write pointer. Code points that must be stripped are simply
// not written, in which case the returned pointer equals `result`.
//
// Stripped code points:
//   - UTF-16 surrogates U+D800..U+DFFF (not encodable in well-formed UTF-8)
//   - noncharacters U+FDD0..U+FDEF, U+FFFE and U+FFFF
//   - anything outside U+0000..U+10FFFF
struct utf8_writer
{
    typedef uint8_t* value_type;

    // Encodes a code point below U+10000 (1 to 3 bytes); writes nothing for a stripped value.
    static value_type low(value_type result, uint32_t ch)
    {
        // U+0000..U+007F
        if (ch < 0x80)
        {
            *result = static_cast<uint8_t>(ch);
            return result + 1;
        }
        // U+0080..U+07FF
        else if (ch < 0x800)
        {
            result[0] = static_cast<uint8_t>(0xC0 | (ch >> 6));
            result[1] = static_cast<uint8_t>(0x80 | (ch & 0x3F));
            return result + 2;
        }
        // U+0800..U+D7FF, U+E000..U+FDCF, U+FDF0..U+FFFD
        else if (ch < 0xd800 || (ch > 0xdfff && ch < 0xfdd0) || (ch > 0xfdef && ch < 0xfffe))
        {
            result[0] = static_cast<uint8_t>(0xE0 | (ch >> 12));
            result[1] = static_cast<uint8_t>(0x80 | ((ch >> 6) & 0x3F));
            result[2] = static_cast<uint8_t>(0x80 | (ch & 0x3F));
            return result + 3;
        }

        // stripped: surrogate, noncharacter, or a value that does not belong in low()
        return result;
    }

    // Encodes a supplementary-plane code point (4 bytes); writes nothing when the
    // value is not in U+10000..U+10FFFF.
    static value_type high(value_type result, uint32_t ch)
    {
        // Values below U+10000 reach this function only from an overlong 4-byte
        // sequence; values above U+10FFFF are not Unicode code points.
        if (ch < 0x10000 || ch > 0x10FFFF)
            return result;

        // U+10000..U+10FFFF
        result[0] = static_cast<uint8_t>(0xF0 | (ch >> 18));
        result[1] = static_cast<uint8_t>(0x80 | ((ch >> 12) & 0x3F));
        result[2] = static_cast<uint8_t>(0x80 | ((ch >> 6) & 0x3F));
        result[3] = static_cast<uint8_t>(0x80 | (ch & 0x3F));
        return result + 4;
    }

    // Dispatches to low() or high() depending on the code point's plane.
    static value_type any(value_type result, uint32_t ch)
    {
        return (ch < 0x10000) ? low(result, ch) : high(result, ch);
    }
};
53 |
// Decodes a UTF-8 byte stream and forwards each accepted code point to a writer
// policy (`Traits::low` for values below U+10000, `Traits::high` otherwise).
//
// Bytes that do not form a well-formed sequence are dropped:
//   - stray continuation bytes and lead bytes 0xF8..0xFF
//   - truncated sequences or sequences with a bad continuation byte
//   - overlong encodings (a code point encoded with more bytes than necessary),
//     so that e.g. C0 BC can never be rewritten into a literal '<'
//   - 4-byte sequences that decode outside U+10000..U+10FFFF
struct utf8_decoder
{
    typedef uint8_t type;

    // data/size : input bytes
    // result    : initial writer position; the final writer position is returned
    // Traits    : writer policy providing value_type, low() and high()
    template <typename Traits> static inline typename Traits::value_type process(const uint8_t* data, size_t size, typename Traits::value_type result, Traits)
    {
        const uint8_t utf8_byte_mask = 0x3f;

        while (size)
        {
            uint8_t lead = *data;

            // 0xxxxxxx -> U+0000..U+007F
            if (lead < 0x80)
            {
                result = Traits::low(result, lead);
                data += 1;
                size -= 1;

                // process aligned single-byte (ascii) blocks
                if ((reinterpret_cast<uintptr_t>(data) & 3) == 0)
                {
                    // round-trip through void* to silence 'cast increases required alignment of target type' warnings
                    while (size >= 4 && (*static_cast<const uint32_t*>(static_cast<const void*>(data)) & 0x80808080) == 0)
                    {
                        result = Traits::low(result, data[0]);
                        result = Traits::low(result, data[1]);
                        result = Traits::low(result, data[2]);
                        result = Traits::low(result, data[3]);
                        data += 4;
                        size -= 4;
                    }
                }
            }
            // 110xxxxx -> U+0080..U+07FF
            // lead bytes 0xC0 and 0xC1 can only start overlong encodings of U+0000..U+007F,
            // so the accepted range starts at 0xC2; rejected leads fall through to the
            // "invalid" branch and their continuation byte is dropped on the next iteration
            else if (static_cast<unsigned int>(lead - 0xC2) < 0x1E && size >= 2 && (data[1] & 0xc0) == 0x80)
            {
                result = Traits::low(result, static_cast<uint32_t>(((lead & ~0xC0) << 6) | (data[1] & utf8_byte_mask)));
                data += 2;
                size -= 2;
            }
            // 1110xxxx -> U+0800-U+FFFF
            else if (static_cast<unsigned int>(lead - 0xE0) < 0x10 && size >= 3 && (data[1] & 0xc0) == 0x80 && (data[2] & 0xc0) == 0x80)
            {
                const uint32_t ch = static_cast<uint32_t>(((lead & ~0xE0) << 12) | ((data[1] & utf8_byte_mask) << 6) | (data[2] & utf8_byte_mask));

                // anything below U+0800 is an overlong encoding: drop the whole sequence
                if (ch >= 0x800)
                    result = Traits::low(result, ch);

                data += 3;
                size -= 3;
            }
            // 11110xxx -> U+10000..U+10FFFF
            else if (static_cast<unsigned int>(lead - 0xF0) < 0x08 && size >= 4 && (data[1] & 0xc0) == 0x80 && (data[2] & 0xc0) == 0x80 && (data[3] & 0xc0) == 0x80)
            {
                const uint32_t ch = static_cast<uint32_t>(((lead & ~0xF0) << 18) | ((data[1] & utf8_byte_mask) << 12) | ((data[2] & utf8_byte_mask) << 6) | (data[3] & utf8_byte_mask));

                // below U+10000 is overlong, above U+10FFFF is not a code point: drop the whole sequence
                if (ch >= 0x10000 && ch <= 0x10FFFF)
                    result = Traits::high(result, ch);

                data += 4;
                size -= 4;
            }
            // 10xxxxxx or 11111xxx -> invalid
            else
            {
                data += 1;
                size -= 1;
            }
        }

        return result;
    }
};
120 |
121 | size_t utf8_cleanup(char* buffer, size_t length)
122 | {
123 | uint8_t* obegin = reinterpret_cast(buffer);
124 | uint8_t* oend = utf8_decoder::process(obegin, length, obegin, utf8_writer());
125 | return oend - obegin;
126 | }
127 |
--------------------------------------------------------------------------------
/c_src/utf8_cleanup.h:
--------------------------------------------------------------------------------
#ifndef ERLXML_C_SRC_UTF8_CLEANUP_H_
#define ERLXML_C_SRC_UTF8_CLEANUP_H_

#include <stddef.h>
#include <stdint.h>

// Rewrites `buffer` (of `length` bytes) in place, removing invalid UTF-8 content,
// and returns the length of the cleaned data at the start of the buffer.
size_t utf8_cleanup(char* buffer, size_t length);

#endif
10 |
--------------------------------------------------------------------------------
/c_src/xmlstreamparser.cc:
--------------------------------------------------------------------------------
1 | #include "xmlstreamparser.h"
2 | #include
3 | #include
4 |
5 | //http://pugixml.org/docs/manual.html
6 | //Limitations for stanza detection algorithm (streaming mode):
7 | // 1. not supporting cdata
8 | // 2. not supporting comments with special xml characters inside
9 | // 3. not supporting doctype
10 |
11 | const size_t kDefaultBufferSize = 1024;
12 |
// Whitespace classification table indexed by byte value: 1 for '\t' (0x09),
// '\n' (0x0A), '\r' (0x0D) and ' ' (0x20), 0 for every other byte.
// Only the prefix up to the last non-zero entry is spelled out; the remaining
// entries of the 256-element array are zero-initialized.

const uint8_t kLookupWhitespace[256] = {
//  0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, // 0x00..0x0F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x10..0x1F
    1                                               // 0x20
};
33 |
// Marks the bytes '!' (0x21), '-' (0x2D), '/' (0x2F) and '?' (0x3F) with 1.
// When one of them is the byte right before '>', the tag does not increase the
// nesting level; when one of them is the byte right after the first '<' of a
// stanza (header or comment), the stanza is not parsed.
// Only the prefix up to the last non-zero entry is spelled out; the remaining
// entries of the 256-element array are zero-initialized.

const uint8_t kLookupSkipTag[256] = {
//  0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x00..0x0F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x10..0x1F
    0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, // 0x20..0x2F: '!' '-' '/'
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1  // 0x30..0x3F: '?'
};
55 |
56 | XmlStreamParser::XmlStreamParser(size_t max_stanza, bool strip_invalid_utf8, XmlStartStreamHandler start_h, XmlEndStreamHandler end_h, XmlStreamElementHandler el_h) :
57 | process_root_(true),
58 | max_stanza_bytes_(max_stanza),
59 | strip_invalid_utf8_(strip_invalid_utf8),
60 | start_stream_handler_(start_h),
61 | end_stream_handler_(end_h),
62 | element_handler_(el_h),
63 | nested_level_(-1),
64 | last_start_tag_index_(-1),
65 | first_start_tag_index_(-1)
66 | {
67 |
68 | }
69 |
70 | XmlStreamParser::~XmlStreamParser()
71 | {
72 |
73 | }
74 |
75 | XmlStreamParser::parse_result XmlStreamParser::FeedData(const uint8_t* data, size_t length, void* user_data)
76 | {
77 | size_t last_index = buffer_.Length();
78 | buffer_.WriteBytes(data, length);
79 | parse_result result = DoProcess(last_index, buffer_.Length(), user_data);
80 |
81 | if(result == kParseOk && buffer_.Capacity() > kDefaultBufferSize)
82 | buffer_.Resize(std::max(buffer_.Length(), kDefaultBufferSize));
83 |
84 | return result;
85 | }
86 |
87 | XmlStreamParser::parse_result XmlStreamParser::DoProcess(size_t start, size_t end, void* user_data)
88 | {
89 | uint8_t* ptr = const_cast(buffer_.Data());
90 | size_t max_end_position = max_stanza_bytes_ > 0 ? std::min(max_stanza_bytes_, end) : end;
91 |
92 | int64_t end_stanza_index = FindStanzaUpperLimit(ptr, start, max_end_position);
93 |
94 | if(end_stanza_index == -1)
95 | {
96 | Reset(false);
97 | return kParseInvalidXml;
98 | }
99 |
100 | size_t end_stanza_pos = static_cast(end_stanza_index);
101 |
102 | if(end_stanza_pos == max_end_position)
103 | {
104 | if(max_stanza_bytes_ && max_end_position == max_stanza_bytes_)
105 | {
106 | Reset(false);
107 | return kParseStanzaLimitHit;
108 | }
109 |
110 | if(nested_level_ == -1 && process_root_ == false)
111 | {
112 | //finished the stream
113 | end_stream_handler_(user_data, root_name_);
114 | Reset(true);
115 | return kParseOk;
116 | }
117 |
118 | return kParseOk;
119 | }
120 |
121 | end_stanza_pos++;
122 |
123 | if(!PushStanza(ptr, end_stanza_pos, user_data))
124 | {
125 | Reset(false);
126 | return kParseInvalidXml;
127 | }
128 |
129 | buffer_.Consume(end_stanza_pos);
130 |
131 | size_t remaining = buffer_.Length();
132 |
133 | if(!remaining)
134 | return kParseOk;
135 |
136 | return DoProcess(0, remaining, user_data);
137 | }
138 |
139 | int64_t XmlStreamParser::FindStanzaUpperLimit(const uint8_t* ptr, size_t start, size_t end)
140 | {
141 | size_t index = start;
142 |
143 | if(last_start_tag_index_ == -1)
144 | {
145 | while (index < end && kLookupWhitespace[ptr[index]])
146 | index++;
147 |
148 | if(index < end && ptr[index] != '<')
149 | return -1;
150 | }
151 |
152 | for(; index < end; index++)
153 | {
154 | switch (ptr[index])
155 | {
156 | case '<':
157 |
158 | if(first_start_tag_index_ == -1)
159 | first_start_tag_index_ = index;
160 |
161 | last_start_tag_index_ = index;
162 | break;
163 |
164 | case '>':
165 |
166 | if(last_start_tag_index_ == -1)
167 | return -1;
168 |
169 | if(ptr[last_start_tag_index_+1] == '/')
170 | {
171 | nested_level_--;
172 | }
173 | else
174 | {
175 | if(kLookupSkipTag[ptr[index - 1]] == 0)
176 | nested_level_++;
177 | }
178 |
179 | if(nested_level_ == 0)
180 | return index;
181 |
182 | break;
183 | }
184 | }
185 |
186 | return index;
187 | }
188 |
189 | bool XmlStreamParser::PushStanza(uint8_t* buffer, size_t length, void* user_data)
190 | {
191 | if(process_root_)
192 | return ProcessRootElement(buffer, length, user_data);
193 |
194 | // don't parse anything in case we have a header or comment as first element
195 | // for this reason we need to skip all spaces
196 |
197 | if(!kLookupSkipTag[buffer[first_start_tag_index_+1]])
198 | {
199 | pugi::xml_parse_status result = pugi_doc_.load_buffer_inplace(buffer, length).status;
200 |
201 | if(result != pugi::status_ok)
202 | return false;
203 |
204 | element_handler_(user_data, pugi_doc_, strip_invalid_utf8_);
205 | }
206 |
207 | last_start_tag_index_ = -1;
208 | first_start_tag_index_ = -1;
209 | assert(nested_level_ == 0);
210 | return true;
211 | }
212 |
213 | bool XmlStreamParser::ProcessRootElement(uint8_t* buffer, size_t length, void* user_data)
214 | {
215 | if(!length)
216 | return false;
217 |
218 | ByteBuffer rootbuff(length+1);
219 | rootbuff.WriteBytes(buffer, length-1);
220 | rootbuff.WriteBytes(reinterpret_cast("/>"), 2);
221 |
222 | pugi::xml_parse_status result = pugi_doc_.load_buffer_inplace(const_cast(rootbuff.Data()), rootbuff.Length(), pugi::parse_default, pugi::encoding_utf8).status;
223 |
224 | if(result != pugi::status_ok)
225 | return false;
226 |
227 | if(!start_stream_handler_(user_data, pugi_doc_, strip_invalid_utf8_))
228 | return false;
229 |
230 | //drop all bytes so far
231 | root_name_ = pugi_doc_.document_element().name();
232 | process_root_ = false;
233 | last_start_tag_index_ = -1;
234 | first_start_tag_index_ = -1;
235 |
236 | return true;
237 | }
238 |
239 | void XmlStreamParser::Reset(bool cleanup)
240 | {
241 | if(cleanup)
242 | {
243 | buffer_.Clear();
244 | buffer_.Resize(kDefaultBufferSize);
245 | }
246 |
247 | nested_level_ = -1;
248 | process_root_ = true;
249 | last_start_tag_index_ = -1;
250 | first_start_tag_index_ = -1;
251 | }
252 |
--------------------------------------------------------------------------------
/c_src/xmlstreamparser.h:
--------------------------------------------------------------------------------
1 | #ifndef ERLXML_C_SRC_XMLSTREAMPARSER_H_
2 | #define ERLXML_C_SRC_XMLSTREAMPARSER_H_
3 |
4 | #include
5 |
6 | #include "pugixml.hpp"
7 | #include "bytebuffer.h"
8 | #include "macros.h"
9 |
10 | typedef bool (*XmlStartStreamHandler) (void* user_data, pugi::xml_document& doc, bool strip_non_utf8);
11 | typedef void (*XmlEndStreamHandler) (void* user_data, const std::string& name);
12 | typedef void (*XmlStreamElementHandler) (void* user_data, pugi::xml_document& doc, bool strip_non_utf8);
13 |
14 | class XmlStreamParser
15 | {
16 | public:
17 |
18 | enum parse_result { kParseOk = 0, kParseStanzaLimitHit, kParseInvalidXml };
19 |
20 | XmlStreamParser(size_t max_stanza, bool strip_invalid_utf8, XmlStartStreamHandler start_h, XmlEndStreamHandler end_h, XmlStreamElementHandler el_h);
21 | ~XmlStreamParser();
22 |
23 | parse_result FeedData(const uint8_t* data, size_t size, void* user_data);
24 | void Reset(bool cleanup);
25 |
26 | const ByteBuffer* GetBufferedData() {return &buffer_;}
27 |
28 | private:
29 |
30 | parse_result DoProcess(size_t start, size_t size, void* user_data);
31 | bool PushStanza(uint8_t* buffer, size_t length, void* user_data);
32 | int64_t FindStanzaUpperLimit(const uint8_t* ptr, size_t start, size_t size);
33 | bool ProcessRootElement(uint8_t* buffer, size_t length, void* user_data);
34 |
35 | bool process_root_;
36 | size_t max_stanza_bytes_;
37 | bool strip_invalid_utf8_;
38 |
39 | ByteBuffer buffer_;
40 | XmlStartStreamHandler start_stream_handler_;
41 | XmlEndStreamHandler end_stream_handler_;
42 | XmlStreamElementHandler element_handler_;
43 | int32_t nested_level_;
44 | int64_t last_start_tag_index_;
45 | int64_t first_start_tag_index_;
46 | pugi::xml_document pugi_doc_;
47 | std::string root_name_;
48 | };
49 |
50 | #endif
51 |
--------------------------------------------------------------------------------
/include/erlxml.hrl:
--------------------------------------------------------------------------------
1 |
2 | -author("silviu.caragea").
3 |
%% An XML attribute as a {Name, Value} pair.
-type xmlattr() :: {binary(), binary()}.

%% Character data found inside an element.
-record(xmlcdata, {content = [] :: iodata()}).
%% An XML element with its attributes and child nodes.
-record(xmlel, {name :: binary(), attrs = [] :: [xmlattr()], children = [] :: [#xmlel{} | #xmlcdata{}]}).
%% Emitted by the stream parser when the root element is opened.
-record(xmlstreamstart, {name :: binary(), attrs = [] :: [xmlattr()]}).
%% Emitted by the stream parser when the root element is closed.
-record(xmlstreamend, {name :: binary()}).

-type xmlterm() :: #xmlel{} | xmlattr() | #xmlcdata{}.
%% Options accepted by erlxml:new_stream/1.
-type erlxml_option() :: {stanza_limit, non_neg_integer()} | {strip_non_utf8, boolean()}.
-type reason() :: invalid_stanza | max_stanza_limit_hit | badarg | binary().
14 |
--------------------------------------------------------------------------------
/rebar.config:
--------------------------------------------------------------------------------
%% The native part of the project is driven by make: `make compile` runs
%% before rebar compiles and `make clean` after rebar cleans, on Linux and
%% macOS only.
{pre_hooks, [{"(linux|darwin)", compile, "make compile"}]}.
{post_hooks, [{"(linux|darwin)", clean, "make clean"}]}.

%% File that must exist once the build has finished.
{artifacts, ["priv/erlxml_nif.so"]}.

{project_plugins, [rebar3_hex]}.

%% Compiler options; warnings_as_errors makes any warning fail the build.
{erl_opts, [
    warn_unused_vars,
    warn_shadow_vars,
    warn_unused_import,
    warn_unused_function,
    warn_bif_clash,
    warn_unused_record,
    warn_deprecated_function,
    warn_obsolete_guard,
    strict_validation,
    warn_export_vars,
    warn_exported_vars,
    warn_export_all,
    warnings_as_errors
]}.

{cover_enabled, false}.

%% The bench profile also compiles the benchmark directory and pulls in the
%% fast_xml and exml dependencies.
{profiles, [
    {bench, [
        {src_dirs, ["src", "benchmark"]},
        {deps, [
            {fast_xml, ".*", {git, "https://github.com/processone/fast_xml.git", {tag, "1.1.55"}}},
            {exml, ".*", {git, "https://github.com/esl/exml.git", {tag, "3.4.1"}}}
        ]}
    ]}
]}.
35 |
--------------------------------------------------------------------------------
/rebar.lock:
--------------------------------------------------------------------------------
1 | [].
2 |
--------------------------------------------------------------------------------
/src/erlxml.app.src:
--------------------------------------------------------------------------------
%% Application resource file for erlxml.
{application, erlxml, [
    {description, "erlxml - Erlang XML parsing library based on pugixml."},
    {licenses, ["MIT"]},
    {links,[{"Github","https://github.com/silviucpp/erlxml"}]},
    {vsn, "2.1.1"},
    {registered, []},
    {applications, [
        kernel,
        stdlib
    ]},
    %% The package name differs from the application name.
    {pkg_name, erlxml2},
    {env, []},
    %% File patterns listed for the package, including the C sources.
    {files, [
        "LICENSE*",
        "*.MD",
        "Makefile",
        "rebar.config",
        "rebar.lock",
        "include/*.hrl",
        "src/*.erl",
        "src/*.src",
        "benchmark/*.erl",
        "c_src/pugixml/*.hpp",
        "c_src/pugixml/*.cpp",
        "c_src/*.h",
        "c_src/*.cc",
        "c_src/Makefile",
        "c_src/nif.mk",
        "test/*.erl",
        "test/data/*.txt"
    ]}
]}.
33 |
--------------------------------------------------------------------------------
/src/erlxml.erl:
--------------------------------------------------------------------------------
1 | -module(erlxml).
2 | -author("silviu.caragea").
3 |
4 | -include("erlxml.hrl").
5 |
6 | -export([
7 | new_stream/0,
8 | new_stream/1,
9 | parse_stream/2,
10 | reset_stream/1,
11 | parse/1,
12 | to_binary/1
13 | ]).
14 |
%% Creates a stream parser with the default options.
-spec new_stream() ->
    {ok, reference()} | {error, reason() | {options, term()}}.

new_stream() ->
    new_stream([]).

%% Creates a stream parser configured by Options.
%% An option that is not recognised is reported as {error, {options, Option}}.
-spec new_stream([erlxml_option()]) ->
    {ok, reference()} | {error, reason() | {options, term()}}.

new_stream(Options) ->
    erlxml_nif:new_stream(Options).
26 |
%% Feeds Data to the stream parser and returns the events that became
%% complete, in document order.
%% A rejected stanza is reported as {error, {Reason, Binary}}; the binary is
%% produced by the NIF (the data it could not process). The other error shapes
%% are kept from the previous spec so existing callers stay well-typed.
-spec parse_stream(reference(), iolist() | binary()) ->
    {ok, [#xmlstreamstart{} | #xmlel{} | #xmlstreamend{}]} |
    {error, reason()} |
    {error, {reason(), binary()}} |
    {error, reason(), binary()}.

parse_stream(Parser, Data) ->
    erlxml_nif:chunk_feed_stream(Parser, Data).
32 |
%% Resets the stream parser so it can be reused for a new stream.
-spec reset_stream(reference()) -> ok | {error, reason()}.

reset_stream(Parser) ->
    erlxml_nif:reset_stream(Parser).
38 |
%% Parses a complete XML document into an #xmlel{} tree.
-spec parse(iolist() | binary()) -> {ok, #xmlel{}} | {error, reason()}.

parse(Data) ->
    erlxml_nif:dom_parse(Data).
44 |
%% Serialises an #xmlel{} tree to a binary.
-spec to_binary(#xmlel{}) -> binary() | {error, reason()}.

to_binary(Data) ->
    erlxml_nif:to_binary(Data).
50 |
--------------------------------------------------------------------------------
/src/erlxml_nif.erl:
--------------------------------------------------------------------------------
1 | -module(erlxml_nif).
2 | -author("silviu.caragea").
3 |
4 | -define(NOT_LOADED, not_loaded(?LINE)).
5 | %% Maximum bytes passed to the NIF handler at once (20Kb)
6 | -define(MAX_BYTES_TO_NIF, 20000).
7 |
8 | -on_load(load_nif/0).
9 |
10 | -export([
11 | new_stream/1,
12 | chunk_feed_stream/2,
13 | reset_stream/1,
14 | dom_parse/1,
15 | to_binary/1
16 | ]).
17 |
18 | %% nif functions
19 |
%% on_load hook: loads the native library; any result other than ok crashes
%% with a badmatch.
load_nif() ->
    LibraryPath = get_nif_library_path(),
    ok = erlang:load_nif(LibraryPath, 0).

%% Path of the native library, without extension.
get_nif_library_path() ->
    nif_library_path(code:priv_dir(erlxml)).

%% When the application's priv dir is unknown, fall back to a priv directory
%% relative to the current working directory, preferring ../priv if it exists.
nif_library_path({error, bad_name}) ->
    case filelib:is_dir(filename:join(["..", priv])) of
        true ->
            filename:join(["..", priv, ?MODULE]);
        false ->
            filename:join([priv, ?MODULE])
    end;
nif_library_path(PrivDir) ->
    filename:join(PrivDir, ?MODULE).
35 |
%% Raised by the stubs below when they are called while the native library is
%% not loaded.
not_loaded(Line) ->
    erlang:nif_error({not_loaded, [{module, ?MODULE}, {line, Line}]}).

%% NIF stubs. Each one is expected to be replaced by its native
%% implementation when load_nif/0 succeeds.

new_stream(_Options) ->
    ?NOT_LOADED.

feed_stream(_Stream, _Chunk) ->
    ?NOT_LOADED.

reset_stream(_Stream) ->
    ?NOT_LOADED.

dom_parse(_Xml) ->
    ?NOT_LOADED.

to_binary(_Element) ->
    ?NOT_LOADED.
53 |
%% Feeds Data to the parser in slices of at most ?MAX_BYTES_TO_NIF bytes, so a
%% single NIF call never receives more than that.
%% Returns {ok, Elements} with the elements of all slices in document order,
%% or the first error returned by the NIF (elements collected from earlier
%% slices are not returned in that case).
chunk_feed_stream(Parser, Data) when is_binary(Data) ->
    chunk_feed_stream(Parser, Data, byte_size(Data), null);
chunk_feed_stream(Parser, Data) ->
    chunk_feed_stream(Parser, iolist_to_binary(Data)).

%% Size is the byte size of Data; Acc holds the elements produced so far
%% (null before the first slice).
chunk_feed_stream(Parser, Data, Size, Acc) when Size > ?MAX_BYTES_TO_NIF ->
    <<Chunk:?MAX_BYTES_TO_NIF/binary, Rest/binary>> = Data,
    case feed_stream(Parser, Chunk) of
        {ok, Elements} ->
            chunk_feed_stream(Parser, Rest, Size - ?MAX_BYTES_TO_NIF, aggregate_els(Acc, Elements));
        Error ->
            Error
    end;
chunk_feed_stream(Parser, Data, _Size, Acc) ->
    case feed_stream(Parser, Data) of
        {ok, Elements} ->
            {ok, aggregate_els(Acc, Elements)};
        Error ->
            Error
    end.

%% Appends the elements of the latest slice after the ones already collected,
%% so the combined list keeps the order in which the elements were parsed.
aggregate_els(null, Els) ->
    Els;
aggregate_els(Acc, Els) ->
    Acc ++ Els.
82 |
--------------------------------------------------------------------------------
/src/erlxml_utils.erl:
--------------------------------------------------------------------------------
1 | -module(erlxml_utils).
2 | -author("byron.wang").
3 | -include("erlxml.hrl").
4 |
5 | -export([
6 | cdata/1,
7 | subel/2,
8 | subel_cdata/2
9 | ]).
10 |
%% Returns the content of the first character-data child of the element, or
%% an empty binary when it has none.
-spec cdata(#xmlel{}) ->
    binary().

cdata(#xmlel{children = Children}) ->
    cdata_content(lists:keyfind(xmlcdata, 1, Children)).

cdata_content(#xmlcdata{content = Content}) ->
    Content;
cdata_content(_NotFound) ->
    <<>>.
19 |
%% Returns the first child element called Name, or undefined when the element
%% has no such child. Only #xmlel{} children are considered, so character data
%% whose content happens to equal Name cannot hide a matching element.
-spec subel(#xmlel{}, binary()) ->
    #xmlel{} | undefined.

subel(#xmlel{children = Children}, Name) ->
    find_subel(Children, Name).

find_subel([#xmlel{name = Name} = Element | _], Name) ->
    Element;
find_subel([_ | Rest], Name) ->
    find_subel(Rest, Name);
find_subel([], _Name) ->
    undefined.
28 |
%% Returns the character data of the child element called Name, or undefined
%% when there is no such child.
-spec subel_cdata(#xmlel{}, binary()) ->
    binary() | undefined.

subel_cdata(#xmlel{} = Xml, Name) ->
    cdata_or_undefined(subel(Xml, Name)).

cdata_or_undefined(#xmlel{} = Child) ->
    cdata(Child);
cdata_or_undefined(_Missing) ->
    undefined.
37 |
--------------------------------------------------------------------------------
/test/.gitignore:
--------------------------------------------------------------------------------
1 | *.beam
2 |
--------------------------------------------------------------------------------
/test/data/invalid_token_EF_B7_90.txt:
--------------------------------------------------------------------------------
1 | 123456
--------------------------------------------------------------------------------
/test/data/invalid_token_EF_B7_9F.txt:
--------------------------------------------------------------------------------
1 | 123456
--------------------------------------------------------------------------------
/test/data/invalid_token_EF_B7_A4.txt:
--------------------------------------------------------------------------------
1 | 123456
--------------------------------------------------------------------------------
/test/data/invalid_token_EF_B7_AF.txt:
--------------------------------------------------------------------------------
1 | 123456
--------------------------------------------------------------------------------
/test/data/invalid_token_EF_BF_BE.txt:
--------------------------------------------------------------------------------
1 | 123456
--------------------------------------------------------------------------------
/test/data/invalid_token_EF_BF_BF.txt:
--------------------------------------------------------------------------------
1 | 123456
--------------------------------------------------------------------------------
/test/data/stream.txt:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | PLAIN
5 | SCRAM-SHA-1
6 |
7 |
8 | zlib
9 |
10 |
11 |
12 |
13 |
14 |
15 | user1
16 | password1
17 |
18 |
19 |
20 |
21 | user2
22 | password2
23 |
24 |
25 |
26 |
27 | user3
28 | password3
29 |
30 |
31 |
32 |
33 | user4
34 | password4
35 |
36 |
37 |
38 |
39 | user5
40 | password5
41 |
42 |
43 |
44 |
45 | user6
46 | password6
47 |
48 |
49 |
50 |
51 | user7
52 | password7
53 |
54 |
55 |
56 |
57 | user8
58 | password8
59 |
60 |
61 |
62 |
63 | user9
64 | password9
65 |
66 |
67 |
68 |
69 | user10
70 | password10
71 |
72 |
73 |
74 |
75 |
76 | Hello, user2!
77 |
78 |
79 | Hello, user3!
80 |
81 |
82 | Hello, user4!
83 |
84 |
85 | Hello, user5!
86 |
87 |
88 | Hello, user6!
89 |
90 |
91 | Hello, user7!
92 |
93 |
94 | Hello, user8!
95 |
96 |
97 | Hello, user9!
98 |
99 |
100 | Hello, user10!
101 |
102 |
103 | Hello, user11!
104 |
105 |
106 | Hello, user12!
107 |
108 |
109 | Hello, user13!
110 |
111 |
112 | Hello, user14!
113 |
114 |
115 | Hello, user15!
116 |
117 |
118 | Hello, user16!
119 |
120 |
121 | Hello, user17!
122 |
123 |
124 | Hello, user18!
125 |
126 |
127 | Hello, user19!
128 |
129 |
130 | Hello, user20!
131 |
132 |
133 | Hello, user21!
134 |
135 |
136 |
137 |
138 | chat
139 | Available
140 |
141 |
142 | chat
143 | Available
144 |
145 |
146 | chat
147 | Available
148 |
149 |
150 | chat
151 | Available
152 |
153 |
154 | chat
155 | Available
156 |
157 |
158 | chat
159 | Available
160 |
161 |
162 | chat
163 | Available
164 |
165 |
166 | chat
167 | Available
168 |
169 |
170 | chat
171 | Available
172 |
173 |
174 | chat
175 | Available
176 |
177 |
178 | chat
179 | Available
180 |
181 |
182 | chat
183 | Available
184 |
185 |
186 | chat
187 | Available
188 |
189 |
190 | chat
191 | Available
192 |
193 |
194 | chat
195 | Available
196 |
197 |
198 | chat
199 | Available
200 |
201 |
202 | chat
203 | Available
204 |
205 |
206 | chat
207 | Available
208 |
209 |
210 | chat
211 | Available
212 |
213 |
214 |
215 |
--------------------------------------------------------------------------------
/test/data/succeeded_C3_AF__C2_BF__C2_B0.txt:
--------------------------------------------------------------------------------
1 | 123ï¿°456
--------------------------------------------------------------------------------
/test/data/succeeded_C6_87.txt:
--------------------------------------------------------------------------------
1 | 123Ƈ456
--------------------------------------------------------------------------------
/test/data/succeeded_EF_B7_89.txt:
--------------------------------------------------------------------------------
1 | 123456
--------------------------------------------------------------------------------
/test/data/succeeded_EF_B7_B0.txt:
--------------------------------------------------------------------------------
1 | 123ﷰ456
--------------------------------------------------------------------------------
/test/data/succeeded_EF_B8_80.txt:
--------------------------------------------------------------------------------
1 | 123︀456
--------------------------------------------------------------------------------
/test/data/succeeded_EF_BF_AE.txt:
--------------------------------------------------------------------------------
1 | 123○456
--------------------------------------------------------------------------------
/test/data/succeeded_F0_90_8C_88.txt:
--------------------------------------------------------------------------------
1 | 123𐌈456
--------------------------------------------------------------------------------
/test/integrity_test.erl:
--------------------------------------------------------------------------------
1 | -module(integrity_test).
2 |
3 | -include_lib("eunit/include/eunit.hrl").
4 |
5 | -define(ROOT_DATA, "test/data").
6 |
%% An option the parser does not know must be rejected and echoed back.
bad_options_test() ->
    UnknownOption = {unavailable_option, 1},
    {error, {options, {unavailable_option, 1}}} = erlxml:new_stream([UnknownOption]),
    true.
10 |
11 | to_binary_ok_test() ->
12 | Xml = {xmlel,<<"foo">>, [{<<"attr1">>,<<"bar">>}], [{xmlcdata,<<"Some Value">>}]},
13 | <<"Some Value">> = erlxml:to_binary(Xml),
14 | true.
15 |
%% A tuple tagged with anything other than xmlel is not a valid element.
to_binary_error_test() ->
    Attrs = [{<<"attr1">>, <<"bar">>}],
    Children = [{xmlcdata, <<"Some Value">>}],
    NotAnElement = {axmlel, <<"foo">>, Attrs, Children},
    {error, badarg} = erlxml:to_binary(NotAnElement),
    true.
20 |
21 | dom_parsing_ok_test() ->
22 | {ok,{xmlel,<<"foo">>, [{<<"attr1">>,<<"bar">>}], [{xmlcdata,<<"Some Value">>}]}} =
23 | erlxml:parse(<<"Some Value">>),
24 | true.
25 |
26 | dom_parsing_error_test() ->
27 | InvalidStaza = <<"Some Value>,
28 | {error,invalid_stanza} = erlxml:parse(InvalidStaza),
29 | true.
30 |
31 | stream_parsing_error_test() ->
32 | InvalidStaza = <<"foo attr1='bar'>Some Value>,
33 | {ok, Parser} = erlxml:new_stream(),
34 | {error, {invalid_stanza, InvalidStaza}} = erlxml:parse_stream(Parser, InvalidStaza),
35 | true.
36 |
37 | stream_parsing_invalid_stanza_start_error_test() ->
38 | {ok, Parser} = erlxml:new_stream(),
39 | {ok,[{xmlstreamstart,<<"stream">>,[]}]} = erlxml:parse_stream(Parser, <<"">>),
40 | {ok,[{xmlel,<<"tag1">>,[], [
41 | {xmlel,<<"g">>,[],[{xmlcdata,<<"sss">>}]}]}]} = erlxml:parse_stream(Parser, <<" sss">>),
42 | {error,{invalid_stanza,<<" tag1">>}} = erlxml:parse_stream(Parser, <<" tag1">>),
43 | true.
44 |
45 | max_stanza_limit_hit_test() ->
46 | Data = <<"1">>,
47 | {ok, Parser} = erlxml:new_stream([{stanza_limit, 11}]),
48 | {ok, Parser2} = erlxml:new_stream([{stanza_limit, 12}]),
49 | {error, {max_stanza_limit_hit, <<"1">>}} = erlxml:parse_stream(Parser, Data),
50 | {ok, _} = erlxml:parse_stream(Parser2, Data),
51 | true.
52 |
53 | max_stanza_limit_hit_cdata_test() ->
54 | MaxLimit = 65536,
55 | Overflow = 1,
56 |
57 | Head = <<"">>,
58 | Tail = <<"">>,
59 | Body = binary:copy(<<"1">>, (MaxLimit - (byte_size(Head) + byte_size(Tail)))+Overflow),
60 | PendingBuffer = <">>,
61 | Stanza = <<"", PendingBuffer/binary>>,
62 | {ok, Parser} = erlxml:new_stream([{stanza_limit, MaxLimit}]),
63 | {error, {max_stanza_limit_hit, PendingBuffer}} = erlxml:parse_stream(Parser, Stanza),
64 | true.
65 |
66 | chunks_test() ->
67 | Chunk1 = <<"\n\r >,
68 | Chunk2 = <<"\">Some Value">>,
69 |
70 | {ok, Parser} = erlxml:new_stream(),
71 | {ok,[{xmlstreamstart,<<"stream">>,[{<<"ss">>,<<"aa">>}]}]} = erlxml:parse_stream(Parser, Chunk1),
72 | {ok,[{xmlel,<<"foo">>,
73 | [{<<"attr1">>,<<"bar">>}],
74 | [{xmlcdata,<<"Some Value">>}]},
75 | {xmlel,<<"el2">>,[{<<"ss">>,<<"asd">>}],[]},
76 | {xmlstreamend,<<"stream">>}]} = erlxml:parse_stream(Parser, Chunk2),
77 | true.
78 |
79 | skip_header_and_comments_test() ->
80 | Data = <<"
81 |
82 |
83 |
84 | 1
85 | 2
86 | 3
87 | ">>,
88 |
89 | {ok, Parser} = erlxml:new_stream(),
90 | {ok,[{xmlstreamstart,<<"stream">>,[]},
91 | {xmlel,<<"tag">>,[],[{xmlcdata,<<"1">>}]},
92 | {xmlel,<<"tag">>,[],[{xmlcdata,<<"2">>}]},
93 | {xmlel,<<"tag">>,[],[{xmlcdata,<<"3">>}]},
94 | {xmlstreamend,<<"stream">>}]} = erlxml:parse_stream(Parser, Data),
95 |
96 | ok = erlxml:reset_stream(Parser),
97 |
98 | {ok,[{xmlstreamstart,<<"stream">>,[]},
99 | {xmlel,<<"tag">>,[],[{xmlcdata,<<"1">>}]},
100 | {xmlel,<<"tag">>,[],[{xmlcdata,<<"2">>}]},
101 | {xmlel,<<"tag">>,[],[{xmlcdata,<<"3">>}]},
102 | {xmlstreamend,<<"stream">>}]} = erlxml:parse_stream(Parser, binary_to_list(Data)),
103 | true.
104 |
105 | one_by_one_char_test() ->
106 | Data = <<"
107 |
108 |
109 |
110 | 1
111 | 2
112 | 3
113 | ">>,
114 |
115 | {ok, Parser} = erlxml:new_stream(),
116 | [{ok, _} = erlxml:parse_stream(Parser, [X]) || <> <= Data],
117 | true.
118 |
119 | strip_invalid_utf8_test() ->
120 | Data0 = <<"123🏇4567">>,
121 | Length = byte_size(Data0) -1,
122 | <> = Data0,
123 | Msg= <<"", Data/binary, "">>,
124 | {ok, Parser} = erlxml:new_stream([{strip_non_utf8, true}]),
125 | {ok,[{xmlstreamstart,<<"stream">>,[]},
126 | {xmlel,<<"node">>,
127 | [{<<"a">>,<<"123456">>}],
128 | [{xmlcdata,<<"123456">>}]},
129 | {xmlstreamend,<<"stream">>}]} = erlxml:parse_stream(Parser, Msg),
130 | true.
131 |
132 | strip_invalid_token_EF_B7_9F_test() ->
133 | {ok, InvalidToken} = file:read_file(<>),
134 | true = test_strip_invalid_token(InvalidToken, <<"123456">>).
135 |
136 | strip_invalid_token_EF_B7_90_test() ->
137 | {ok, InvalidToken} = file:read_file(<>),
138 | true = test_strip_invalid_token(InvalidToken, <<"123456">>).
139 |
140 | strip_invalid_token_EF_B7_A4_test() ->
141 | {ok, InvalidToken} = file:read_file(<>),
142 | true = test_strip_invalid_token(InvalidToken, <<"123456">>).
143 |
144 | strip_invalid_token_EF_B7_AF_test() ->
145 | {ok, InvalidToken} = file:read_file(<>),
146 | true = test_strip_invalid_token(InvalidToken, <<"123456">>).
147 |
148 | strip_invalid_token_EF_BF_BE_test() ->
149 | {ok, InvalidToken} = file:read_file(<>),
150 | true = test_strip_invalid_token(InvalidToken, <<"123456">>).
151 |
152 | strip_invalid_token_EF_BF_BF_test() ->
153 | {ok, InvalidToken} = file:read_file(<>),
154 | true = test_strip_invalid_token(InvalidToken, <<"123456">>).
155 |
156 | succeeded_C3_AF__C2_BF__C2_B0_test() ->
157 | {ok, Token} = file:read_file(<>),
158 | true = test_strip_invalid_token(Token, Token).
159 |
160 | succeeded_C6_87_test() ->
161 | {ok, Token} = file:read_file(<>),
162 | true = test_strip_invalid_token(Token, Token).
163 |
164 | succeeded_EF_B7_89_test() ->
165 | {ok, Token} = file:read_file(<>),
166 | true = test_strip_invalid_token(Token, Token).
167 |
168 | succeeded_EF_B7_B0_test() ->
169 | {ok, Token} = file:read_file(<>),
170 | true = test_strip_invalid_token(Token, Token).
171 |
172 | succeeded_EF_B8_80_test() ->
173 | {ok, Token} = file:read_file(<>),
174 | true = test_strip_invalid_token(Token, Token).
175 |
176 | succeeded_EF_BF_AE_test() ->
177 | {ok, Token} = file:read_file(<>),
178 | true = test_strip_invalid_token(Token, Token).
179 |
180 | succeeded_F0_90_8C_88_test() ->
181 | {ok, Token} = file:read_file(<>),
182 | true = test_strip_invalid_token(Token, Token).
183 |
184 | % internals
185 |
186 | test_strip_invalid_token(InvalidToken, ExpectedResult) ->
187 | Data = <<"", InvalidToken/binary,"">>,
188 | {ok, Parser} = erlxml:new_stream([{strip_non_utf8, true}]),
189 | {ok,[{xmlstreamstart,<<"stream">>,[]}]} = erlxml:parse_stream(Parser, <<"">>),
190 | {ok,[{xmlel,<<"iq">>,
191 | [{<<"xmlns">>,<<"namespace">>}],
192 | [{xmlel,<<"body">>,[],[{xmlcdata, ExpectedResult}]}]}]} = erlxml:parse_stream(Parser, Data),
193 | true.
194 |
--------------------------------------------------------------------------------