├── .gitignore
├── HTMLp.DOMCore.pas
├── HTMLp.Entities.pas
├── HTMLp.Formatter.pas
├── HTMLp.HTMLParser.pas
├── HTMLp.HTMLReader.pas
├── HTMLp.HTMLTags.pas
├── HTMLp.Helper.pas
├── LICENSE
├── README.md
└── example
├── HTMLP.dpr
├── HTMLP.dproj
├── MainForm.dfm
├── MainForm.pas
└── index.html
/.gitignore:
--------------------------------------------------------------------------------
1 | # Uncomment these types if you want even more clean repository. But be careful.
2 | # It can make harm to an existing project source. Read explanations below.
3 | #
4 | # Resource files are binaries containing manifest, project icon and version info.
5 | # They can not be viewed as text or compared by diff-tools. Consider replacing them with .rc files.
6 | #*.res
7 | #
8 | # Type library file (binary). In old Delphi versions it should be stored.
9 | # Since Delphi 2009 it is produced from .ridl file and can safely be ignored.
10 | #*.tlb
11 | #
12 | # Diagram Portfolio file. Used by the diagram editor up to Delphi 7.
13 | # Uncomment this if you are not using diagrams or use newer Delphi version.
14 | #*.ddp
15 | #
16 | # Visual LiveBindings file. Added in Delphi XE2.
17 | # Uncomment this if you are not using LiveBindings Designer.
18 | #*.vlb
19 | #
20 | # Deployment Manager configuration file for your project. Added in Delphi XE2.
21 | # Uncomment this if it is not mobile development and you do not use remote debug feature.
22 | #*.deployproj
23 | #
24 | # C++ object files produced when C/C++ Output file generation is configured.
25 | # Uncomment this if you are not using external objects (zlib library for example).
26 | #*.obj
27 | #
28 |
29 | # Delphi compiler-generated binaries (safe to delete)
30 | *.exe
31 | *.dll
32 | *.bpl
33 | *.bpi
34 | *.dcp
35 | *.so
36 | *.apk
37 | *.drc
38 | *.map
39 | *.dres
40 | *.rsm
41 | *.tds
42 | *.dcu
43 | *.lib
44 | *.a
45 | *.o
46 | *.ocx
47 |
48 | # Delphi autogenerated files (duplicated info)
49 | *.cfg
50 | *.hpp
51 | *Resource.rc
52 |
53 | # Delphi local files (user-specific info)
54 | *.local
55 | *.identcache
56 | *.projdata
57 | *.tvsconfig
58 | *.dsk
59 |
60 | # Delphi history and backups
61 | __history/
62 | __recovery/
63 | *.~*
64 |
65 | # Castalia statistics file (since XE7 Castalia is distributed with Delphi)
66 | *.stat
67 |
--------------------------------------------------------------------------------
/HTMLp.Entities.pas:
--------------------------------------------------------------------------------
1 | unit HTMLp.Entities;
2 |
3 | interface
4 |
5 | const
6 | MaxEntNameLen = 8;
7 |
8 | function GetEntValue(const Name: string): WideChar;
9 | function GetEntName(Code: Word): string;
10 |
11 | implementation
12 |
13 | uses
14 | SysUtils, Classes;
15 |
16 | const
17 | EntCount = 252;
18 |
19 | type
20 | PEntity = ^TEntity;
21 | TEntity = record
22 | Name: String;
23 | Code: Word
24 | end;
25 |
26 | TEntities = array[0..EntCount - 1] of TEntity;
27 |
28 | const
29 | EntTab: TEntities = (
30 | (Name: 'nbsp'; Code: 160),
31 | (Name: 'iexcl'; Code: 161),
32 | (Name: 'cent'; Code: 162),
33 | (Name: 'pound'; Code: 163),
34 | (Name: 'curren'; Code: 164),
35 | (Name: 'yen'; Code: 165),
36 | (Name: 'brvbar'; Code: 166),
37 | (Name: 'sect'; Code: 167),
38 | (Name: 'uml'; Code: 168),
39 | (Name: 'copy'; Code: 169),
40 | (Name: 'ordf'; Code: 170),
41 | (Name: 'laquo'; Code: 171),
42 | (Name: 'not'; Code: 172),
43 | (Name: 'shy'; Code: 173),
44 | (Name: 'reg'; Code: 174),
45 | (Name: 'macr'; Code: 175),
46 | (Name: 'deg'; Code: 176),
47 | (Name: 'plusmn'; Code: 177),
48 | (Name: 'sup2'; Code: 178),
49 | (Name: 'sup3'; Code: 179),
50 | (Name: 'acute'; Code: 180),
51 | (Name: 'micro'; Code: 181),
52 | (Name: 'para'; Code: 182),
53 | (Name: 'middot'; Code: 183),
54 | (Name: 'cedil'; Code: 184),
55 | (Name: 'sup1'; Code: 185),
56 | (Name: 'ordm'; Code: 186),
57 | (Name: 'raquo'; Code: 187),
58 | (Name: 'frac14'; Code: 188),
59 | (Name: 'frac12'; Code: 189),
60 | (Name: 'frac34'; Code: 190),
61 | (Name: 'iquest'; Code: 191),
62 | (Name: 'Agrave'; Code: 192),
63 | (Name: 'Aacute'; Code: 193),
64 | (Name: 'Acirc'; Code: 194),
65 | (Name: 'Atilde'; Code: 195),
66 | (Name: 'Auml'; Code: 196),
67 | (Name: 'Aring'; Code: 197),
68 | (Name: 'AElig'; Code: 198),
69 | (Name: 'Ccedil'; Code: 199),
70 | (Name: 'Egrave'; Code: 200),
71 | (Name: 'Eacute'; Code: 201),
72 | (Name: 'Ecirc'; Code: 202),
73 | (Name: 'Euml'; Code: 203),
74 | (Name: 'Igrave'; Code: 204),
75 | (Name: 'Iacute'; Code: 205),
76 | (Name: 'Icirc'; Code: 206),
77 | (Name: 'Iuml'; Code: 207),
78 | (Name: 'ETH'; Code: 208),
79 | (Name: 'Ntilde'; Code: 209),
80 | (Name: 'Ograve'; Code: 210),
81 | (Name: 'Oacute'; Code: 211),
82 | (Name: 'Ocirc'; Code: 212),
83 | (Name: 'Otilde'; Code: 213),
84 | (Name: 'Ouml'; Code: 214),
85 | (Name: 'times'; Code: 215),
86 | (Name: 'Oslash'; Code: 216),
87 | (Name: 'Ugrave'; Code: 217),
88 | (Name: 'Uacute'; Code: 218),
89 | (Name: 'Ucirc'; Code: 219),
90 | (Name: 'Uuml'; Code: 220),
91 | (Name: 'Yacute'; Code: 221),
92 | (Name: 'THORN'; Code: 222),
93 | (Name: 'szlig'; Code: 223),
94 | (Name: 'agrave'; Code: 224),
95 | (Name: 'aacute'; Code: 225),
96 | (Name: 'acirc'; Code: 226),
97 | (Name: 'atilde'; Code: 227),
98 | (Name: 'auml'; Code: 228),
99 | (Name: 'aring'; Code: 229),
100 | (Name: 'aelig'; Code: 230),
101 | (Name: 'ccedil'; Code: 231),
102 | (Name: 'egrave'; Code: 232),
103 | (Name: 'eacute'; Code: 233),
104 | (Name: 'ecirc'; Code: 234),
105 | (Name: 'euml'; Code: 235),
106 | (Name: 'igrave'; Code: 236),
107 | (Name: 'iacute'; Code: 237),
108 | (Name: 'icirc'; Code: 238),
109 | (Name: 'iuml'; Code: 239),
110 | (Name: 'eth'; Code: 240),
111 | (Name: 'ntilde'; Code: 241),
112 | (Name: 'ograve'; Code: 242),
113 | (Name: 'oacute'; Code: 243),
114 | (Name: 'ocirc'; Code: 244),
115 | (Name: 'otilde'; Code: 245),
116 | (Name: 'ouml'; Code: 246),
117 | (Name: 'divide'; Code: 247),
118 | (Name: 'oslash'; Code: 248),
119 | (Name: 'ugrave'; Code: 249),
120 | (Name: 'uacute'; Code: 250),
121 | (Name: 'ucirc'; Code: 251),
122 | (Name: 'uuml'; Code: 252),
123 | (Name: 'yacute'; Code: 253),
124 | (Name: 'thorn'; Code: 254),
125 | (Name: 'yuml'; Code: 255),
126 | (Name: 'fnof'; Code: 402),
127 | (Name: 'Alpha'; Code: 913),
128 | (Name: 'Beta'; Code: 914),
129 | (Name: 'Gamma'; Code: 915),
130 | (Name: 'Delta'; Code: 916),
131 | (Name: 'Epsilon'; Code: 917),
132 | (Name: 'Zeta'; Code: 918),
133 | (Name: 'Eta'; Code: 919),
134 | (Name: 'Theta'; Code: 920),
135 | (Name: 'Iota'; Code: 921),
136 | (Name: 'Kappa'; Code: 922),
137 | (Name: 'Lambda'; Code: 923),
138 | (Name: 'Mu'; Code: 924),
139 | (Name: 'Nu'; Code: 925),
140 | (Name: 'Xi'; Code: 926),
141 | (Name: 'Omicron'; Code: 927),
142 | (Name: 'Pi'; Code: 928),
143 | (Name: 'Rho'; Code: 929),
144 | (Name: 'Sigma'; Code: 931),
145 | (Name: 'Tau'; Code: 932),
146 | (Name: 'Upsilon'; Code: 933),
147 | (Name: 'Phi'; Code: 934),
148 | (Name: 'Chi'; Code: 935),
149 | (Name: 'Psi'; Code: 936),
150 | (Name: 'Omega'; Code: 937),
151 | (Name: 'alpha'; Code: 945),
152 | (Name: 'beta'; Code: 946),
153 | (Name: 'gamma'; Code: 947),
154 | (Name: 'delta'; Code: 948),
155 | (Name: 'epsilon'; Code: 949),
156 | (Name: 'zeta'; Code: 950),
157 | (Name: 'eta'; Code: 951),
158 | (Name: 'theta'; Code: 952),
159 | (Name: 'iota'; Code: 953),
160 | (Name: 'kappa'; Code: 954),
161 | (Name: 'lambda'; Code: 955),
162 | (Name: 'mu'; Code: 956),
163 | (Name: 'nu'; Code: 957),
164 | (Name: 'xi'; Code: 958),
165 | (Name: 'omicron'; Code: 959),
166 | (Name: 'pi'; Code: 960),
167 | (Name: 'rho'; Code: 961),
168 | (Name: 'sigmaf'; Code: 962),
169 | (Name: 'sigma'; Code: 963),
170 | (Name: 'tau'; Code: 964),
171 | (Name: 'upsilon'; Code: 965),
172 | (Name: 'phi'; Code: 966),
173 | (Name: 'chi'; Code: 967),
174 | (Name: 'psi'; Code: 968),
175 | (Name: 'omega'; Code: 969),
176 | (Name: 'thetasym'; Code: 977),
177 | (Name: 'upsih'; Code: 978),
178 | (Name: 'piv'; Code: 982),
179 | (Name: 'bull'; Code: 8226),
180 | (Name: 'hellip'; Code: 8230),
181 | (Name: 'prime'; Code: 8242),
182 | (Name: 'Prime'; Code: 8243),
183 | (Name: 'oline'; Code: 8254),
184 | (Name: 'frasl'; Code: 8260),
185 | (Name: 'weierp'; Code: 8472),
186 | (Name: 'image'; Code: 8465),
187 | (Name: 'real'; Code: 8476),
188 | (Name: 'trade'; Code: 8482),
189 | (Name: 'alefsym'; Code: 8501),
190 | (Name: 'larr'; Code: 8592),
191 | (Name: 'uarr'; Code: 8593),
192 | (Name: 'rarr'; Code: 8594),
193 | (Name: 'darr'; Code: 8595),
194 | (Name: 'harr'; Code: 8596),
195 | (Name: 'crarr'; Code: 8629),
196 | (Name: 'lArr'; Code: 8656),
197 | (Name: 'uArr'; Code: 8657),
198 | (Name: 'rArr'; Code: 8658),
199 | (Name: 'dArr'; Code: 8659),
200 | (Name: 'hArr'; Code: 8660),
201 | (Name: 'forall'; Code: 8704),
202 | (Name: 'part'; Code: 8706),
203 | (Name: 'exist'; Code: 8707),
204 | (Name: 'empty'; Code: 8709),
205 | (Name: 'nabla'; Code: 8711),
206 | (Name: 'isin'; Code: 8712),
207 | (Name: 'notin'; Code: 8713),
208 | (Name: 'ni'; Code: 8715),
209 | (Name: 'prod'; Code: 8719),
210 | (Name: 'sum'; Code: 8721),
211 | (Name: 'minus'; Code: 8722),
212 | (Name: 'lowast'; Code: 8727),
213 | (Name: 'radic'; Code: 8730),
214 | (Name: 'prop'; Code: 8733),
215 | (Name: 'infin'; Code: 8734),
216 | (Name: 'ang'; Code: 8736),
217 | (Name: 'and'; Code: 8743),
218 | (Name: 'or'; Code: 8744),
219 | (Name: 'cap'; Code: 8745),
220 | (Name: 'cup'; Code: 8746),
221 | (Name: 'int'; Code: 8747),
222 | (Name: 'there4'; Code: 8756),
223 | (Name: 'sim'; Code: 8764),
224 | (Name: 'cong'; Code: 8773),
225 | (Name: 'asymp'; Code: 8776),
226 | (Name: 'ne'; Code: 8800),
227 | (Name: 'equiv'; Code: 8801),
228 | (Name: 'le'; Code: 8804),
229 | (Name: 'ge'; Code: 8805),
230 | (Name: 'sub'; Code: 8834),
231 | (Name: 'sup'; Code: 8835),
232 | (Name: 'nsub'; Code: 8836),
233 | (Name: 'sube'; Code: 8838),
234 | (Name: 'supe'; Code: 8839),
235 | (Name: 'oplus'; Code: 8853),
236 | (Name: 'otimes'; Code: 8855),
237 | (Name: 'perp'; Code: 8869),
238 | (Name: 'sdot'; Code: 8901),
239 | (Name: 'lceil'; Code: 8968),
240 | (Name: 'rceil'; Code: 8969),
241 | (Name: 'lfloor'; Code: 8970),
242 | (Name: 'rfloor'; Code: 8971),
243 | (Name: 'lang'; Code: 9001),
244 | (Name: 'rang'; Code: 9002),
245 | (Name: 'loz'; Code: 9674),
246 | (Name: 'spades'; Code: 9824),
247 | (Name: 'clubs'; Code: 9827),
248 | (Name: 'hearts'; Code: 9829),
249 | (Name: 'diams'; Code: 9830),
250 | (Name: 'quot'; Code: 34),
251 | (Name: 'amp'; Code: 38),
252 | (Name: 'lt'; Code: 60),
253 | (Name: 'gt'; Code: 62),
254 | (Name: 'OElig'; Code: 338),
255 | (Name: 'oelig'; Code: 339),
256 | (Name: 'Scaron'; Code: 352),
257 | (Name: 'scaron'; Code: 353),
258 | (Name: 'Yuml'; Code: 376),
259 | (Name: 'circ'; Code: 710),
260 | (Name: 'tilde'; Code: 732),
261 | (Name: 'ensp'; Code: 8194),
262 | (Name: 'emsp'; Code: 8195),
263 | (Name: 'thinsp'; Code: 8201),
264 | (Name: 'zwnj'; Code: 8204),
265 | (Name: 'zwj'; Code: 8205),
266 | (Name: 'lrm'; Code: 8206),
267 | (Name: 'rlm'; Code: 8207),
268 | (Name: 'ndash'; Code: 8211),
269 | (Name: 'mdash'; Code: 8212),
270 | (Name: 'lsquo'; Code: 8216),
271 | (Name: 'rsquo'; Code: 8217),
272 | (Name: 'sbquo'; Code: 8218),
273 | (Name: 'ldquo'; Code: 8220),
274 | (Name: 'rdquo'; Code: 8221),
275 | (Name: 'bdquo'; Code: 8222),
276 | (Name: 'dagger'; Code: 8224),
277 | (Name: 'Dagger'; Code: 8225),
278 | (Name: 'permil'; Code: 8240),
279 | (Name: 'lsaquo'; Code: 8249),
280 | (Name: 'rsaquo'; Code: 8250),
281 | (Name: 'euro'; Code: 8364)
282 | );
283 |
284 | type
285 | TEntList = class(TList)
286 | private
287 | function GetCode(const Name: string): Integer;
288 | public
289 | constructor Create;
290 | property Code[const Name: string]: Integer read GetCode;
291 | end;
292 |
293 | var
294 | EntityList: TEntList;
295 |
296 | function EntCompare(Ent1, Ent2: Pointer): Integer;
297 | begin
298 | Result := AnsiCompareStr(PEntity(Ent1)^.Name, PEntity(Ent2)^.Name);
299 | end;
300 |
301 | constructor TEntList.Create;
302 | var
303 | I: Integer;
304 | begin
305 | inherited Create;
306 |
307 | Capacity := EntCount;
308 | for I := 0 to EntCount - 1 do Add(@EntTab[I]);
309 | Sort(EntCompare);
310 | end;
311 |
312 | function TEntList.GetCode(const Name: string): Integer;
313 | var
314 | I, L, U, Cmp: Integer;
315 | begin
316 | L := 0;
317 | U := Count - 1;
318 | while L <= U do
319 | begin
320 | I := (L + U) div 2;
321 |
322 | Cmp := AnsiCompareStr(Name, PEntity(Items[I])^.Name);
323 | if Cmp = 0 then
324 | begin
325 | Result := PEntity(Items[I])^.Code;
326 | Exit
327 | end;
328 |
329 | if Cmp < 0 then U := I - 1
330 | else L := I + 1;
331 | end;
332 |
333 | Result := 32;
334 | end;
335 |
336 | function GetEntValue(const Name: string): WideChar;
337 | begin
338 | Result := WideChar(EntityList.Code[Name]);
339 | end;
340 |
341 | function GetEntName(Code: Word): string;
342 | var
343 | I: Integer;
344 | begin
345 | for I := 0 to EntCount - 1 do
346 | begin
347 | if EntTab[I].Code = Code then
348 | begin
349 | Result := EntTab[I].Name;
350 | Exit
351 | end;
352 | end;
353 |
354 | Result := '';
355 | end;
356 |
357 | initialization
358 | EntityList := TEntList.Create;
359 | finalization
360 | EntityList.Free;
361 | end.
362 |
--------------------------------------------------------------------------------
/HTMLp.Formatter.pas:
--------------------------------------------------------------------------------
1 | unit HTMLp.Formatter;
2 |
3 | interface
4 |
5 | uses
6 | HTMLp.DomCore, HTMLp.HtmlTags;
7 |
8 | const
9 | SHOW_ALL = $FFFFFFFF;
10 | SHOW_ELEMENT = $00000001;
11 | SHOW_ATTRIBUTE = $00000002;
12 | SHOW_TEXT = $00000004;
13 | SHOW_CDATA_SECTION = $00000008;
14 | SHOW_ENTITY_REFERENCE = $00000010;
15 | SHOW_ENTITY = $00000020;
16 | SHOW_PROCESSING_INSTRUCTION = $00000040;
17 | SHOW_COMMENT = $00000080;
18 | SHOW_DOCUMENT = $00000100;
19 | SHOW_DOCUMENT_TYPE = $00000200;
20 | SHOW_DOCUMENT_FRAGMENT = $00000400;
21 | SHOW_NOTATION = $00000800;
22 |
23 | type
24 | TStringBuilder = class
25 | private
26 | FCapacity: Integer;
27 | FLength: Integer;
28 | FValue: WideString;
29 | public
30 | constructor Create(ACapacity: Integer);
31 | function EndWithWhiteSpace: Boolean;
32 | function TailMatch(const Tail: WideString): Boolean;
33 | function ToString: WideString;
34 | procedure AppendText(const TextStr: WideString);
35 | property Length: Integer read FLength;
36 | end;
37 |
38 | TBaseFormatter = class
39 | private
40 | procedure ProcessNode(Node: TNode);
41 | protected
42 | FDocument: TDocument;
43 | FStringBuilder: TStringBuilder;
44 | FHTMLTagList: THTMLTagList;
45 | FDepth: Integer;
46 | FWhatToShow: Integer;
47 | FExpandEntities: Boolean;
48 | FPreserveWhiteSpace: Boolean;
49 | FInAttributes: Boolean;
50 | procedure AppendNewLine;
51 | procedure AppendParagraph;
52 | procedure AppendText(const TextStr: WideString); virtual;
53 | procedure ProcessAttribute(Attr: TAttr); virtual;
54 | procedure ProcessAttributes(Element: TElement); virtual;
55 | procedure ProcessCDataSection(CDataSection: TCDataSection); virtual;
56 | procedure ProcessComment(Comment: TComment); virtual;
57 | procedure ProcessDocumentElement; virtual;
58 | procedure ProcessElement(Element: TElement); virtual;
59 | procedure ProcessEntityReference(EntityReference: TEntityReference); virtual;
60 | // procedure ProcessNotation(Notation: TNotation); virtual;
61 | procedure ProcessProcessingInstruction(ProcessingInstruction: TProcessingInstruction); virtual;
62 | procedure ProcessTextNode(TextNode: TTextNode); virtual;
63 | public
64 | constructor Create;
65 | destructor Destroy; override;
66 |
67 | function getText(document: TDocument): WideString;
68 | end;
69 |
70 | THtmlFormatter = class(TBaseFormatter)
71 | private
72 | FIndent: Integer;
73 | function OnlyTextContent(Element: TElement): Boolean;
74 | protected
75 | procedure ProcessAttribute(Attr: TAttr); override;
76 | procedure ProcessComment(Comment: TComment); override;
77 | procedure ProcessElement(Element: TElement); override;
78 | procedure ProcessTextNode(TextNode: TTextNode); override;
79 | public
80 | constructor Create;
81 | property Indent: Integer read FIndent write FIndent;
82 | end;
83 |
84 | TTextFormatter = class(TBaseFormatter)
85 | protected
86 | FInsideAnchor: Boolean;
87 | function GetAnchorText(Node: TElement): WideString; virtual;
88 | function GetImageText(Node: TElement): WideString; virtual;
89 | procedure AppendText(const TextStr: WideString); override;
90 | procedure ProcessElement(Element: TElement); override;
91 | procedure ProcessEntityReference(EntityReference: TEntityReference); override;
92 | procedure ProcessTextNode(TextNode: TTextNode); override;
93 | public
94 | constructor Create;
95 | end;
96 |
97 | implementation
98 |
99 | uses
100 | SysUtils,
101 |
102 | HTMLp.Entities;
103 |
104 | const
105 | CRLF: WideString = #13#10;
106 | PARAGRAPH_SEPARATOR: WideString = #13#10#13#10;
107 |
108 | ViewAsBlockTags: THTMLTagSet = [
109 | ADDRESS_TAG, BLOCKQUOTE_TAG, CAPTION_TAG, CENTER_TAG, DD_TAG, DIV_TAG,
110 | DL_TAG, DT_TAG, FIELDSET_TAG, FORM_TAG, FRAME_TAG, H1_TAG, H2_TAG, H3_TAG,
111 | H4_TAG, H5_TAG, H6_TAG, HR_TAG, IFRAME_TAG, LI_TAG, NOFRAMES_TAG, NOSCRIPT_TAG,
112 | OL_TAG, P_TAG, PRE_TAG, TABLE_TAG, TD_TAG, TH_TAG, TITLE_TAG, UL_TAG
113 | ];
114 |
115 | function IsWhiteSpace(W: WideChar): Boolean;
116 | begin
117 | Result := (Ord(W) in WhiteSpace);
118 | end;
119 |
120 | function normalizeWhiteSpace(const TextStr: WideString): WideString;
121 | var
122 | I, J, Count: Integer;
123 | begin
124 | SetLength(Result, Length(TextStr));
125 |
126 | J := 0;
127 | Count := 0;
128 | for I := 1 to Length(TextStr) do
129 | begin
130 | if IsWhiteSpace(TextStr[I]) then
131 | begin
132 | Inc(Count);
133 | Continue
134 | end;
135 | if Count <> 0 then
136 | begin
137 | Count := 0;
138 | Inc(J);
139 | Result[J] := ' ';
140 | end;
141 |
142 | Inc(J);
143 | Result[J] := TextStr[I];
144 | end;
145 |
146 | if Count <> 0 then
147 | begin
148 | Inc(J);
149 | Result[J] := ' '
150 | end;
151 |
152 | SetLength(Result, J);
153 | end;
154 |
155 | function Spaces(Count: Integer): WideString;
156 | var
157 | I: Integer;
158 | begin
159 | SetLength(Result, Count);
160 | for I := 1 to Count do Result[I] := ' ';
161 | end;
162 |
163 | function TrimLeftSpaces(const S: WideString): WideString;
164 | var
165 | I: Integer;
166 | begin
167 | I := 1;
168 | while (I <= Length(S)) and (Ord(S[I]) = SP) do Inc(I);
169 | Result := Copy(S, I, Length(S) - I + 1);
170 | end;
171 |
172 | constructor TStringBuilder.Create(ACapacity: Integer);
173 | begin
174 | inherited Create;
175 | FCapacity := ACapacity;
176 | SetLength(FValue, FCapacity);
177 | end;
178 |
179 | function TStringBuilder.EndWithWhiteSpace: Boolean;
180 | begin
181 | Result := IsWhiteSpace(FValue[FLength]);
182 | end;
183 |
184 | function TStringBuilder.TailMatch(const Tail: WideString): Boolean;
185 | var
186 | TailLen, I: Integer;
187 | begin
188 | Result := false;
189 | TailLen := System.Length(Tail);
190 | if TailLen > FLength then Exit;
191 |
192 | for I := 1 to TailLen do
193 | begin
194 | if FValue[FLength - TailLen + I] <> Tail[I] then Exit;
195 | end;
196 |
197 | Result := True;
198 | end;
199 |
200 | function TStringBuilder.ToString: WideString;
201 | begin
202 | SetLength(FValue, FLength);
203 | Result := FValue;
204 | end;
205 |
206 | procedure TStringBuilder.AppendText(const TextStr: WideString);
207 | var
208 | TextLen, I: Integer;
209 | begin
210 | if (FLength + System.Length(TextStr)) > FCapacity then
211 | begin
212 | FCapacity := 2 * FCapacity;
213 | SetLength(FValue, FCapacity)
214 | end;
215 | TextLen := System.Length(TextStr);
216 | for I := 1 to TextLen do FValue[FLength + I] := TextStr[I];
217 | Inc(FLength, TextLen);
218 | end;
219 |
220 | constructor TBaseFormatter.Create;
221 | begin
222 | inherited Create;
223 | FWhatToShow := Integer(SHOW_ALL);
224 |
225 | FHTMLTagList := THTMLTagList.Create;
226 | end;
227 |
228 | destructor TBaseFormatter.Destroy;
229 | begin
230 | FreeAndNil(FHTMLTagList);
231 |
232 | inherited Destroy;
233 | end;
234 |
235 | procedure TBaseFormatter.ProcessNode(Node: TNode);
236 | begin
237 | case Node.NodeType of
238 | ELEMENT_NODE: ProcessElement(Node as TElement);
239 | TEXT_NODE: if (FWhatToShow and SHOW_TEXT) <> 0 then ProcessTextNode(Node as TTextNode);
240 | CDATA_SECTION_NODE: if (FWhatToShow and SHOW_CDATA_SECTION) <> 0 then ProcessCDataSection(Node as TCDataSection);
241 | ENTITY_REFERENCE_NODE: if (FWhatToShow and SHOW_ENTITY_REFERENCE) <> 0 then ProcessEntityReference(Node as TEntityReference);
242 | PROCESSING_INSTRUCTION_NODE: if (FWhatToShow and SHOW_PROCESSING_INSTRUCTION) <> 0 then ProcessProcessingInstruction(Node as TProcessingInstruction);
243 | COMMENT_NODE: if (FWhatToShow and SHOW_COMMENT) <> 0 then ProcessComment(Node as TComment);
244 | // NOTATION_NODE: if (FWhatToShow and SHOW_NOTATION) <> 0 then ProcessNotation(Node as Notation)
245 | end
246 | end;
247 |
248 | procedure TBaseFormatter.AppendNewLine;
249 | begin
250 | if FStringBuilder.Length > 0 then
251 | begin
252 | if not FStringBuilder.TailMatch(CRLF) then FStringBuilder.AppendText(CRLF)
253 | end;
254 | end;
255 |
256 | procedure TBaseFormatter.AppendParagraph;
257 | begin
258 | if FStringBuilder.Length > 0 then
259 | begin
260 | if not FStringBuilder.TailMatch(CRLF) then FStringBuilder.AppendText(PARAGRAPH_SEPARATOR)
261 | else if not FStringBuilder.TailMatch(PARAGRAPH_SEPARATOR) then FStringBuilder.AppendText(CRLF);
262 | end
263 | end;
264 |
265 | procedure TBaseFormatter.AppendText(const TextStr: WideString);
266 | begin
267 | FStringBuilder.AppendText(TextStr);
268 | end;
269 |
270 | procedure TBaseFormatter.ProcessAttribute(Attr: TAttr);
271 | var
272 | I: Integer;
273 | begin
274 | for I := 0 to Attr.ChildNodes.Count - 1 do ProcessNode(Attr.ChildNodes.Items[I]);
275 | end;
276 |
277 | procedure TBaseFormatter.ProcessAttributes(Element: TElement);
278 | var
279 | I: Integer;
280 | begin
281 | if (FWhatToShow and SHOW_ATTRIBUTE) <> 0 then
282 | begin
283 | FInAttributes := true;
284 | for I := 0 to Element.Attributes.Count - 1 do ProcessAttribute(Element.Attributes.Items[I] as TAttr);
285 | FInAttributes := False;
286 | end
287 | end;
288 |
289 | procedure TBaseFormatter.ProcessCDataSection(CDataSection: TCDataSection);
290 | begin
291 | // TODO
292 | end;
293 |
294 | procedure TBaseFormatter.ProcessComment(Comment: TComment);
295 | begin
296 | AppendText('');
299 | end;
300 |
301 | procedure TBaseFormatter.ProcessDocumentElement;
302 | begin
303 | if Assigned(FDocument.DocumentElement) then
304 | begin
305 | FDepth := 0;
306 | ProcessElement(FDocument.DocumentElement)
307 | end;
308 | end;
309 |
310 | procedure TBaseFormatter.ProcessElement(Element: TElement);
311 | var
312 | I: Integer;
313 | begin
314 | Inc(FDepth);
315 | for I := 0 to Element.ChildNodes.Count - 1 do ProcessNode(Element.ChildNodes.Items[I]);
316 | Dec(FDepth)
317 | end;
318 |
319 | procedure TBaseFormatter.ProcessEntityReference(EntityReference: TEntityReference);
320 | begin
321 | if FExpandEntities then AppendText(GetEntValue(EntityReference.Name))
322 | else AppendText('&' + EntityReference.Name + ';');
323 | end;
324 | {
325 | procedure TBaseFormatter.ProcessNotation(Notation: TNotation);
326 | begin
327 | // TODO
328 | end;
329 | }
330 | procedure TBaseFormatter.ProcessProcessingInstruction(ProcessingInstruction: TProcessingInstruction);
331 | begin
332 | // TODO
333 | end;
334 |
335 | procedure TBaseFormatter.ProcessTextNode(TextNode: TTextNode);
336 | begin
337 | AppendText(TextNode.Data);
338 | end;
339 |
340 | function TBaseFormatter.getText(document: TDocument): WideString;
341 | begin
342 | FDocument := document;
343 | FStringBuilder := TStringBuilder.Create(65530);
344 | try
345 | ProcessDocumentElement;
346 | Result := FStringBuilder.ToString;
347 | finally
348 | FStringBuilder.Free;
349 | end
350 | end;
351 |
352 | constructor THtmlFormatter.Create;
353 | begin
354 | inherited Create;
355 |
356 | FIndent := 2;
357 | end;
358 |
359 | function THtmlFormatter.OnlyTextContent(Element: TElement): Boolean;
360 | var
361 | I: Integer;
362 | Node: TNode;
363 | begin
364 | Result := False;
365 |
366 | for I := 0 to Element.ChildNodes.Count - 1 do
367 | begin
368 | Node := Element.ChildNodes.Items[I];
369 | if not (Node.NodeType in [TEXT_NODE, ENTITY_REFERENCE_NODE]) then Exit
370 | end;
371 |
372 | Result := True;
373 | end;
374 |
375 | procedure THtmlFormatter.ProcessAttribute(Attr: TAttr);
376 | begin
377 | if Attr.HasChildNodes then
378 | begin
379 | AppendText(' ' + Attr.Name + '="');
380 | inherited ProcessAttribute(Attr);
381 | AppendText('"');
382 | end
383 | else AppendText(' ' + Attr.Name + '="' + Attr.Name + '"');
384 | end;
385 |
386 | procedure THtmlFormatter.ProcessComment(Comment: TComment);
387 | begin
388 | AppendNewLine;
389 | AppendText(Spaces(FIndent * FDepth));
390 |
391 | inherited ProcessComment(Comment)
392 | end;
393 |
394 | procedure THtmlFormatter.ProcessElement(Element: TElement);
395 | var
396 | HTMLTag: THTMLTag;
397 | begin
398 | HTMLTag := FHTMLTagList.GetTagByName(Element.TagName);
399 | AppendNewLine;
400 | AppendText(Spaces(FIndent * FDepth));
401 | AppendText('<' + Element.TagName);
402 | ProcessAttributes(Element);
403 |
404 | if Element.HasChildNodes then
405 | begin
406 | AppendText('>');
407 |
408 | if HTMLTag.Number in PreserveWhiteSpaceTags then FPreserveWhiteSpace := True;
409 | inherited ProcessElement(Element);
410 | FPreserveWhiteSpace := False;
411 |
412 | if not OnlyTextContent(Element) then
413 | begin
414 | AppendNewLine;
415 | AppendText(Spaces(FIndent * FDepth))
416 | end;
417 |
418 | AppendText('' + Element.TagName + '>')
419 | end
420 | else AppendText(' />');
421 | end;
422 |
423 | procedure THtmlFormatter.ProcessTextNode(TextNode: TTextNode);
424 | var
425 | TextStr: WideString;
426 | begin
427 | if FPreserveWhiteSpace then AppendText(TextNode.Data)
428 | else
429 | begin
430 | TextStr := normalizeWhiteSpace(TextNode.Data);
431 | if TextStr <> ' ' then AppendText(TextStr)
432 | end;
433 | end;
434 |
435 | constructor TTextFormatter.Create;
436 | begin
437 | inherited Create;
438 | FWhatToShow := SHOW_ELEMENT or SHOW_TEXT or SHOW_ENTITY_REFERENCE;
439 | FExpandEntities := True;
440 | end;
441 |
442 | function TTextFormatter.GetAnchorText(Node: TElement): WideString;
443 | var
444 | Attr: TAttr;
445 | begin
446 | Result := '';
447 |
448 | if Node.HasAttribute('href') then
449 | begin
450 | Attr := Node.GetAttributeNode('href');
451 | Result := ' ';
452 | if UrlSchemes.GetScheme(Attr.Value) = '' then Result := Result + 'http://';
453 | Result := Result + Attr.Value;
454 | end
455 | end;
456 |
457 | function TTextFormatter.GetImageText(Node: TElement): WideString;
458 | begin
459 | if Node.HasAttribute('alt') then Result := Node.GetAttributeNode('alt').Value
460 | else Result := '';
461 | end;
462 |
463 | procedure TTextFormatter.AppendText(const TextStr: WideString);
464 | begin
465 | if (FStringBuilder.Length = 0)
466 | or FStringBuilder.EndWithWhiteSpace then
467 | begin
468 | inherited AppendText(TrimLeftSpaces(TextStr))
469 | end
470 | else
471 | begin
472 | inherited AppendText(TextStr);
473 | end;
474 | end;
475 |
476 | procedure TTextFormatter.ProcessElement(Element: TElement);
477 | var
478 | HTMLTag: THTMLTag;
479 | begin
480 | HTMLTag := FHTMLTagList.GetTagByName(Element.TagName);
481 | if HTMLTag.Number in ViewAsBlockTags then AppendParagraph;
482 |
483 | case HTMLTag.Number of
484 | A_TAG: FInsideAnchor := True;
485 | LI_TAG: AppendText('* ');
486 | end;
487 |
488 | if HTMLTag.Number in PreserveWhiteSpaceTags then FPreserveWhiteSpace := True;
489 | inherited ProcessElement(Element);
490 | FPreserveWhiteSpace := False;
491 |
492 | case HTMLTag.Number of
493 | BR_TAG:
494 | AppendNewLine;
495 |
496 | A_TAG:
497 | begin
498 | AppendText(GetAnchorText(Element));
499 | FInsideAnchor := false
500 | end;
501 |
502 | IMG_TAG:
503 | begin
504 | if FInsideAnchor then AppendText(GetImageText(Element));
505 | end
506 | end;
507 |
508 | if HTMLTag.Number in ViewAsBlockTags then AppendParagraph;
509 | end;
510 |
511 | procedure TTextFormatter.ProcessEntityReference(EntityReference: TEntityReference);
512 | begin
513 | if EntityReference.Name = 'nbsp' then AppendText(' ')
514 | else inherited ProcessEntityReference(EntityReference);
515 | end;
516 |
517 | procedure TTextFormatter.ProcessTextNode(TextNode: TTextNode);
518 | begin
519 | if FPreserveWhiteSpace then AppendText(TextNode.Data)
520 | else AppendText(normalizeWhiteSpace(TextNode.Data));
521 | end;
522 |
523 | end.
524 |
--------------------------------------------------------------------------------
/HTMLp.HTMLParser.pas:
--------------------------------------------------------------------------------
1 | unit HTMLp.HTMLParser;
2 |
3 | interface
4 |
5 | uses
6 | HTMLp.DomCore, HTMLp.HTMLReader, HTMLp.HTMLTags;
7 |
8 | type
9 | THTMLParser = class
10 | private
11 | FHTMLDocument: TDocument;
12 | FHTMLReader: THTMLReader;
13 | FHTMLTagList: THTMLTagList;
14 | FCurrentNode: TNode;
15 | FCurrentTag: THTMLTag;
16 | function FindDefParent: TElement;
17 | function FindParent: TElement;
18 | function FindParentElement(tagList: THTMLTagSet): TElement;
19 | function FindTableParent: TElement;
20 | function FindThisElement: TElement;
21 | function GetMainElement(const tagName: WideString): TElement;
22 | procedure ProcessAttributeEnd(Sender: TObject);
23 | procedure ProcessAttributeStart(Sender: TObject);
24 | procedure ProcessCDataSection(Sender: TObject);
25 | procedure ProcessComment(Sender: TObject);
26 | procedure ProcessDocType(Sender: TObject);
27 | procedure ProcessElementEnd(Sender: TObject);
28 | procedure ProcessElementStart(Sender: TObject);
29 | procedure ProcessEndElement(Sender: TObject);
30 | procedure ProcessEntityReference(Sender: TObject);
31 | procedure ProcessScript(Sender: TObject);
32 | procedure ProcessTextNode(Sender: TObject);
33 | public
34 | constructor Create;
35 | destructor Destroy; override;
36 |
37 | function ParseString(const htmlStr: WideString): TDocument;
38 | property HTMLDocument: TDocument read FHTMLDocument;
39 | end;
40 |
41 | implementation
42 |
43 | const
44 | htmlTagName = 'html';
45 | headTagName = 'head';
46 | bodyTagName = 'body';
47 |
48 | constructor THTMLParser.Create;
49 | begin
50 | inherited Create;
51 |
52 | FHTMLTagList := THTMLTagList.Create;
53 | FHTMLReader := THTMLReader.Create;
54 |
55 | with FHTMLReader do
56 | begin
57 | OnAttributeEnd := ProcessAttributeEnd;
58 | OnAttributeStart := ProcessAttributeStart;
59 | OnCDataSection := ProcessCDataSection;
60 | OnComment := ProcessComment;
61 | OnDocType := ProcessDocType;
62 | OnElementEnd := ProcessElementEnd;
63 | OnElementStart := ProcessElementStart;
64 | OnEndElement := ProcessEndElement;
65 | OnEntityReference := ProcessEntityReference;
66 | OnScript := ProcessScript;
67 | //OnNotation := ProcessNotation;
68 | //OnProcessingInstruction := ProcessProcessingInstruction;
69 | OnTextNode := ProcessTextNode;
70 | end
71 | end;
72 |
73 | destructor THTMLParser.Destroy;
74 | begin
75 | FHTMLReader.Free;
76 | FHTMLTagList.Free;
77 |
78 | inherited Destroy;
79 | end;
80 |
81 | function THTMLParser.FindDefParent: TElement;
82 | begin
83 | if FCurrentTag.Number in [HEAD_TAG, BODY_TAG] then Result := FHTMLDocument.AppendChild(FHTMLDocument.CreateElement(htmlTagName)) as TElement
84 | else if FCurrentTag.Number in HeadTags then Result := GetMainElement(headTagName)
85 | else Result := GetMainElement(bodyTagName);
86 | end;
87 |
88 | function THTMLParser.FindParent: TElement;
89 | begin
90 | if (FCurrentTag.Number = P_TAG) or (FCurrentTag.Number in BlockTags) then Result := FindParentElement(BlockParentTags)
91 | else if FCurrentTag.Number in [LI_TAG] then Result := FindParentElement(ListItemParentTags)
92 | else if FCurrentTag.Number in [DD_TAG, DT_TAG] then Result := FindParentElement(DefItemParentTags)
93 | else if FCurrentTag.Number in [TD_TAG, TH_TAG] then Result := FindParentElement(CellParentTags)
94 | else if FCurrentTag.Number in [TR_TAG] then Result := FindParentElement(RowParentTags)
95 | else if FCurrentTag.Number in [COL_TAG] then Result := FindParentElement(ColParentTags)
96 | else if FCurrentTag.Number in [COLGROUP_TAG, THEAD_TAG, TFOOT_TAG, TBODY_TAG] then Result := FindParentElement(TableSectionParentTags)
97 | else if FCurrentTag.Number in [TABLE_TAG] then Result := FindTableParent
98 | else if FCurrentTag.Number in [OPTION_TAG] then Result := FindParentElement(OptionParentTags)
99 | else if FCurrentTag.Number in [HEAD_TAG, BODY_TAG] then Result := FHTMLDocument.DocumentElement as TElement
100 | else Result := nil;
101 |
102 | if Result = nil then Result := FindDefParent;
103 | end;
104 |
105 | function THTMLParser.FindParentElement(tagList: THTMLTagSet): TElement;
106 | var
107 | Node: TNode;
108 | HTMLTag: THTMLTag;
109 | begin
110 | Node := FCurrentNode;
111 |
112 | while Node.NodeType = ELEMENT_NODE do
113 | begin
114 | HTMLTag := FHTMLTagList.GetTagByName(Node.Name);
115 |
116 | if HTMLTag.Number in tagList then
117 | begin
118 | Result := Node as TElement;
119 | Exit;
120 | end;
121 |
122 | Node := Node.ParentNode;
123 | end;
124 |
125 | Result := nil;
126 | end;
127 |
128 | function THTMLParser.FindTableParent: TElement;
129 | var
130 | Node: TNode;
131 | HTMLTag: THTMLTag;
132 | begin
133 | Node := FCurrentNode;
134 |
135 | while Node.NodeType = ELEMENT_NODE do
136 | begin
137 | HTMLTag := FHTMLTagList.GetTagByName(Node.Name);
138 |
139 | if (HTMLTag.Number = TD_TAG) or (HTMLTag.Number in BlockTags) then
140 | begin
141 | Result := (Node as TElement);
142 | Exit;
143 | end;
144 |
145 | Node := Node.ParentNode;
146 | end;
147 |
148 | Result := GetMainElement(bodyTagName);
149 | end;
150 |
151 | function THTMLParser.FindThisElement: TElement;
152 | var
153 | Node: TNode;
154 | begin
155 | Node := FCurrentNode;
156 |
157 | while Node.NodeType = ELEMENT_NODE do
158 | begin
159 | Result := (Node as TElement);
160 | if Result.TagName = FHTMLReader.Name then Exit;
161 |
162 | Node := Node.ParentNode;
163 | end;
164 |
165 | Result := nil;
166 | end;
167 |
168 | function THTMLParser.GetMainElement(const tagName: WideString): TElement;
169 | var
170 | child: TNode;
171 | I: Integer;
172 | begin
173 | if (FHTMLDocument.DocumentElement = nil) then FHTMLDocument.AppendChild(FHTMLDocument.CreateElement(htmlTagName));
174 |
175 | for I := 0 to FHTMLDocument.DocumentElement.ChildNodes.Count - 1 do
176 | begin
177 | child := FHTMLDocument.DocumentElement.ChildNodes.Items[I];
178 |
179 | if (child.NodeType = ELEMENT_NODE) and (child.Name = tagName) then
180 | begin
181 | Result := (child as TElement);
182 |
183 | Exit
184 | end
185 | end;
186 |
187 | Result := FHTMLDocument.CreateElement(tagName);
188 | FHTMLDocument.DocumentElement.AppendChild(Result);
189 | end;
190 |
191 | procedure THTMLParser.ProcessAttributeEnd(Sender: TObject);
192 | begin
193 | FCurrentNode := (FCurrentNode as TAttr).OwnerElement;
194 | end;
195 |
196 | procedure THTMLParser.ProcessAttributeStart(Sender: TObject);
197 | var
198 | newAttrName: string;
199 | Attr: TAttr;
200 | begin
201 | newAttrName := (Sender as THTMLReader).Name;
202 | Attr := (FCurrentNode as TElement).GetAttributeNode(newAttrName);
203 |
204 | if not (Assigned(Attr)) then
205 | begin
206 | Attr := FHTMLDocument.CreateAttribute(newAttrName);
207 | (FCurrentNode as TElement).SetAttributeNode(Attr);
208 | end;
209 |
210 | FCurrentNode := Attr;
211 | end;
212 |
213 | procedure THTMLParser.ProcessCDataSection(Sender: TObject);
214 | var
215 | CDataSection: TCDataSection;
216 | begin
217 | CDataSection := FHTMLDocument.CreateCDATASection(FHTMLReader.NodeValue);
218 | FCurrentNode.AppendChild(CDataSection)
219 | end;
220 |
221 | procedure THTMLParser.ProcessComment(Sender: TObject);
222 | var
223 | Comment: TComment;
224 | begin
225 | Comment := FHTMLDocument.CreateComment(FHTMLReader.NodeValue);
226 | FCurrentNode.AppendChild(Comment);
227 | end;
228 |
229 | procedure THTMLParser.ProcessDocType(Sender: TObject);
230 | begin
231 | with FHTMLReader do FHTMLDocument.Doctype := DomImplementation.CreateDocumentType(Name, PublicID, SystemID);
232 | end;
233 |
234 | procedure THTMLParser.ProcessElementEnd(Sender: TObject);
235 | begin
236 | if FHTMLReader.isEmptyElement
237 | or (FCurrentTag.Number in EmptyTags) then FCurrentNode := FCurrentNode.ParentNode;
238 |
239 | FCurrentTag := nil;
240 | end;
241 |
242 | procedure THTMLParser.ProcessElementStart(Sender: TObject);
243 | var
244 | Element: TElement;
245 | Parent: TNode;
246 | begin
247 | FCurrentTag := FHTMLTagList.GetTagByName(FHTMLReader.Name);
248 | if FCurrentTag.Number in (NeedFindParentTags + BlockTags) then
249 | begin
250 | Parent := FindParent;
251 | if not Assigned(Parent) then raise DomException.Create(HIERARCHY_REQUEST_ERR);
252 | FCurrentNode := Parent;
253 | end;
254 |
255 | Element := FHTMLDocument.CreateElement(FHTMLReader.Name);
256 | FCurrentNode.AppendChild(Element);
257 | FCurrentNode := Element;
258 | end;
259 |
260 | procedure THTMLParser.ProcessEndElement(Sender: TObject);
261 | var
262 | Element: TElement;
263 | begin
264 | Element := FindThisElement;
265 | if Assigned(Element) then FCurrentNode := Element.ParentNode
266 | // else if IsBlockTagName(FHtmlReader.nodeName) then raise DomException.Create(HIERARCHY_REQUEST_ERR);
267 | end;
268 |
269 | procedure THTMLParser.ProcessEntityReference(Sender: TObject);
270 | var
271 | EntityReference: TEntityReference;
272 | begin
273 | EntityReference := FHTMLDocument.CreateEntityReference(FHTMLReader.Name);
274 | FCurrentNode.AppendChild(EntityReference);
275 | end;
276 |
277 | procedure THtmlParser.ProcessScript(Sender: TObject);
278 | var
279 | Script: TScript;
280 | begin
281 | Script := FHTMLDocument.CreateScript(FHTMLReader.NodeValue);
282 | FCurrentNode.AppendChild(Script);
283 | end;
284 |
285 | procedure THtmlParser.ProcessTextNode(Sender: TObject);
286 | var
287 | TextNode: TTextNode;
288 | begin
289 | TextNode := FHTMLDocument.CreateTextNode(FHTMLReader.NodeValue);
290 | FCurrentNode.AppendChild(TextNode);
291 | end;
292 |
293 | function THTMLParser.ParseString(const htmlStr: WideString): TDocument;
294 | begin
295 | FHTMLReader.HTMLStr := htmlStr;
296 | FHTMLDocument := DomImplementation.CreateEmptyDocument(nil);
297 | FCurrentNode := FHTMLDocument;
298 | try
299 | while FHTMLReader.Read do;
300 | except
301 | // TODO: Add event ?
302 | end;
303 |
304 | Result := FHTMLDocument;
305 | end;
306 |
307 | end.
308 |
--------------------------------------------------------------------------------
/HTMLp.HTMLReader.pas:
--------------------------------------------------------------------------------
1 | unit HTMLp.HTMLReader;
2 |
3 | interface
4 |
5 | uses
6 | Classes,
7 |
8 | HTMLp.DomCore;
9 |
10 | type
11 | TDelimiters = set of Byte;
12 | TReaderState = (rsInitial, rsBeforeAttr, rsBeforeValue, rsInValue, rsInQuotedValue);
13 |
14 | THTMLReader = class
15 | private
16 | FHTMLStr: string;
17 | FPosition: Integer;
18 | FNodeType: Integer;
19 | FPrefix: string;
20 | FLocalName: string;
21 | FNodeValue: string;
22 | FPublicID: string;
23 | FSystemID: string;
24 | FIsEmptyElement: Boolean;
25 | FState: TReaderState;
26 | FQuotation: Word;
27 | FOnAttributeEnd: TNotifyEvent;
28 | FOnAttributeStart: TNotifyEvent;
29 | FOnCDataSection: TNotifyEvent;
30 | FOnComment: TNotifyEvent;
31 | FOnDocType: TNotifyEvent;
32 | FOnElementEnd: TNotifyEvent;
33 | FOnElementStart: TNotifyEvent;
34 | FOnEndElement: TNotifyEvent;
35 | FOnEntityReference: TNotifyEvent;
36 | FOnNotation: TNotifyEvent;
37 | FOnProcessingInstruction: TNotifyEvent;
38 | FOnScript: TNotifyEvent;
39 | FOnTextNode: TNotifyEvent;
40 | function GetNodeName: string;
41 | function GetToken(Delimiters: TDelimiters): string;
42 | function IsAttrTextChar: Boolean;
43 | function IsDigit(HexBase: Boolean): Boolean;
44 | function IsEndEntityChar: Boolean;
45 | function IsEntityChar: Boolean;
46 | function IsEqualChar: Boolean;
47 | function IsHexEntityChar: Boolean;
48 | function IsNumericEntity: Boolean;
49 | function IsQuotation: Boolean;
50 | function IsSlashChar: Boolean;
51 | function IsSpecialTagChar: Boolean;
52 | function IsStartCharacterData: Boolean;
53 | function IsStartComment: Boolean;
54 | function IsStartDocumentType: Boolean;
55 | function IsStartEntityChar: Boolean;
56 | function IsStartMarkupChar: Boolean;
57 | function IsStartScript: Boolean;
58 | function IsStartTagChar: Boolean;
59 | function Match(const Signature: string; IgnoreCase: Boolean): Boolean;
60 | function ReadAttrNode: Boolean;
61 | function ReadAttrTextNode: Boolean;
62 | function ReadCharacterData: Boolean;
63 | function ReadComment: Boolean;
64 | function ReadDocumentType: Boolean;
65 | function ReadElementNode: Boolean;
66 | function ReadEndElementNode: Boolean;
67 | function ReadEntityNode: Boolean;
68 | function ReadNamedEntityNode: Boolean;
69 | function ReadNumericEntityNode: Boolean;
70 | function ReadQuotedValue(var Value: string): Boolean;
71 | function ReadScript: Boolean;
72 | function ReadSpecialNode: Boolean;
73 | function ReadTagNode: Boolean;
74 | function ReadValueNode: Boolean;
75 | function SkipTo(const Signature: string): Boolean;
76 | procedure FireEvent(Event: TNotifyEvent);
77 | procedure ReadElementTail;
78 | procedure ReadTextNode;
79 | procedure SetHTMLStr(const Value: string);
80 | procedure SetNodeName(Value: string);
81 | procedure SkipWhiteSpaces;
82 | public
83 | constructor Create;
84 | function Read: Boolean;
85 |
86 | property HTMLStr: string read FHTMLStr write SetHTMLStr;
87 | property isEmptyElement: Boolean read FIsEmptyElement;
88 | property LocalName: string read FLocalName;
89 | property Name: string read GetNodeName;
90 | property NodeType: Integer read FNodeType;
91 | property Position: Integer read FPosition;
92 | property Prefix: string read FPrefix;
93 | property PublicID: string read FPublicID;
94 | property State: TReaderState read FState;
95 | property SystemID: string read FSystemID;
96 | property NodeValue: string read FNodeValue;
97 | property OnAttributeEnd: TNotifyEvent read FOnAttributeEnd write FOnAttributeEnd;
98 | property OnAttributeStart: TNotifyEvent read FOnAttributeStart write FOnAttributeStart;
99 | property OnCDataSection: TNotifyEvent read FOnCDataSection write FOnCDataSection;
100 | property OnComment: TNotifyEvent read FOnComment write FOnComment;
101 | property OnDocType: TNotifyEvent read FOnDocType write FOnDocType;
102 | property OnElementEnd: TNotifyEvent read FOnElementEnd write FOnElementEnd;
103 | property OnElementStart: TNotifyEvent read FOnElementStart write FOnElementStart;
104 | property OnEndElement: TNotifyEvent read FOnEndElement write FOnEndElement;
105 | property OnEntityReference: TNotifyEvent read FOnEntityReference write FOnEntityReference;
106 | property OnNotation: TNotifyEvent read FOnNotation write FOnNotation;
107 | property OnProcessingInstruction: TNotifyEvent read FOnProcessingInstruction write FOnProcessingInstruction;
108 | property OnScript: TNotifyEvent read FOnScript write FOnScript;
109 | property OnTextNode: TNotifyEvent read FOnTextNode write FOnTextNode;
110 | end;
111 |
112 | implementation
113 |
114 | uses
115 | SysUtils;
116 |
117 | const
118 | startTagChar = Ord('<');
119 | endTagChar = Ord('>');
120 | specialTagChar = Ord('!');
121 | slashChar = Ord('/');
122 | equalChar = Ord('=');
123 | quotation = [Ord(''''), Ord('"')];
124 | tagDelimiter = [slashChar, endTagChar];
125 | tagNameDelimiter = whiteSpace + tagDelimiter;
126 | attrNameDelimiter = tagNameDelimiter + [equalChar];
127 | startEntity = Ord('&');
128 | startMarkup = [startTagChar, startEntity];
129 | endEntity = Ord(';');
130 | notEntity = [endEntity] + startMarkup + whiteSpace;
131 | notAttrText = whiteSpace + quotation + tagDelimiter;
132 | numericEntity = Ord('#');
133 | hexEntity = [Ord('x'), Ord('X')];
134 | decDigit = [Ord('0')..Ord('9')];
135 | hexDigit = [Ord('a')..Ord('f'), Ord('A')..Ord('F')];
136 |
137 | DocTypeStartStr = 'DOCTYPE';
138 | DocTypeEndStr = '>';
139 | CDataStartStr = '[CDATA[';
140 | CDataEndStr = ']]>';
141 | CommentStartStr = '--';
142 | CommentEndStr = '-->';
143 | ScriptStartStr = 'script';
144 | ScriptEndStr = '';
145 |
146 | function DecValue(const Digit: WideChar): Word;
147 | begin
148 | Result := Ord(Digit) - Ord('0')
149 | end;
150 |
151 | function HexValue(const HexChar: WideChar): Word;
152 | var
153 | C: Char;
154 | begin
155 | if Ord(HexChar) in decDigit then Result := Ord(HexChar) - Ord('0')
156 | else
157 | begin
158 | C := UpCase(Chr(Ord(HexChar)));
159 | Result := Ord(C) - Ord('A');
160 | end
161 | end;
162 |
163 | constructor THTMLReader.Create;
164 | begin
165 | inherited Create;
166 |
167 | FHTMLStr := HTMLStr;
168 | FPosition := 1
169 | end;
170 |
171 | function THTMLReader.GetNodeName: string;
172 | begin
173 | if FPrefix <> '' then Result := FPrefix + ':' + FLocalName
174 | else Result := FLocalName;
175 | end;
176 |
177 | function THTMLReader.GetToken(Delimiters: TDelimiters): string;
178 | var
179 | Start: Integer;
180 | begin
181 | Start := FPosition;
182 | while (FPosition <= Length(FHTMLStr)) and not (Ord(FHTMLStr[FPosition]) in Delimiters) do Inc(FPosition);
183 | Result := Copy(FHTMLStr, Start, FPosition - Start);
184 | end;
185 |
186 | function THTMLReader.IsAttrTextChar: Boolean;
187 | var
188 | WC: WideChar;
189 | begin
190 | WC := FHTMLStr[FPosition];
191 | if FState = rsInQuotedValue then Result := (Ord(WC) <> FQuotation) and (Ord(WC) <> startEntity)
192 | else Result := not (Ord(WC) in notAttrText);
193 | end;
194 |
195 | function THTMLReader.IsDigit(HexBase: Boolean): Boolean;
196 | var
197 | WC: WideChar;
198 | begin
199 | WC := FHTMLStr[FPosition];
200 | Result := Ord(WC) in decDigit;
201 | if not Result and HexBase then Result := (Ord(WC) in hexDigit);
202 | end;
203 |
204 | function THTMLReader.IsEndEntityChar: Boolean;
205 | var
206 | WC: WideChar;
207 | begin
208 | WC := FHTMLStr[FPosition];
209 | Result := (Ord(WC) = endEntity);
210 | end;
211 |
212 | function THTMLReader.IsEntityChar: Boolean;
213 | var
214 | WC: WideChar;
215 | begin
216 | WC := FHTMLStr[FPosition];
217 | Result := not (Ord(WC) in notEntity);
218 | end;
219 |
220 | function THTMLReader.IsEqualChar: Boolean;
221 | var
222 | WC: WideChar;
223 | begin
224 | WC := FHTMLStr[FPosition];
225 | Result := (Ord(WC) = equalChar);
226 | end;
227 |
228 | function THTMLReader.IsHexEntityChar: Boolean;
229 | var
230 | WC: WideChar;
231 | begin
232 | WC := FHTMLStr[FPosition];
233 | Result := (Ord(WC) in hexEntity);
234 | end;
235 |
236 | function THTMLReader.IsNumericEntity: Boolean;
237 | var
238 | WC: WideChar;
239 | begin
240 | WC := FHTMLStr[FPosition];
241 | Result := (Ord(WC) = numericEntity);
242 | end;
243 |
244 | function THTMLReader.IsQuotation: Boolean;
245 | var
246 | WC: WideChar;
247 | begin
248 | WC := FHTMLStr[FPosition];
249 | if FQuotation = 0 then Result := (Ord(WC) in quotation)
250 | else Result := (Ord(WC) = FQuotation);
251 | end;
252 |
253 | function THTMLReader.IsSlashChar: Boolean;
254 | var
255 | WC: WideChar;
256 | begin
257 | WC := FHTMLStr[FPosition];
258 | Result := (Ord(WC) = slashChar);
259 | end;
260 |
261 | function THTMLReader.IsSpecialTagChar: Boolean;
262 | var
263 | WC: WideChar;
264 | begin
265 | WC := FHTMLStr[FPosition];
266 | Result := (Ord(WC) = specialTagChar);
267 | end;
268 |
269 | function THTMLReader.IsStartCharacterData: Boolean;
270 | begin
271 | Result := Match(CDataStartStr, False);
272 | end;
273 |
274 | function THTMLReader.IsStartComment: Boolean;
275 | begin
276 | Result := Match(CommentStartStr, False);
277 | end;
278 |
279 | function THTMLReader.IsStartDocumentType: Boolean;
280 | begin
281 | Result := Match(DocTypeStartStr, True);
282 | end;
283 |
284 | function THTMLReader.IsStartEntityChar: Boolean;
285 | var
286 | WC: WideChar;
287 | begin
288 | WC := FHTMLStr[FPosition];
289 | Result := (Ord(WC) = startEntity);
290 | end;
291 |
292 | function THTMLReader.IsStartMarkupChar: Boolean;
293 | var
294 | WC: WideChar;
295 | begin
296 | WC := FHTMLStr[FPosition];
297 | Result := (Ord(WC) in startMarkup);
298 | end;
299 |
300 | function THTMLReader.IsStartScript: Boolean;
301 | begin
302 | Result := Match(ScriptStartStr, true);
303 | end;
304 |
305 | function THTMLReader.IsStartTagChar: Boolean;
306 | var
307 | WC: WideChar;
308 | begin
309 | WC := FHTMLStr[FPosition];
310 | Result := (Ord(WC) = startTagChar);
311 | end;
312 |
313 | function THTMLReader.Match(const Signature: string; IgnoreCase: Boolean): Boolean;
314 | var
315 | I, J: Integer;
316 | W1, W2: WideChar;
317 | begin
318 | Result := False;
319 |
320 | for I := 1 to Length(Signature) do
321 | begin
322 | J := FPosition + I - 1;
323 | if (J < 1) or (J > Length(FHTMLStr)) then Exit;
324 | W1 := Signature[I];
325 | W2 := FHTMLStr[J];
326 | if (W1 <> W2)
327 | and (not IgnoreCase or (UpperCase(W1) <> UpperCase(W2))) then Exit;
328 | end;
329 |
330 | Result := True;
331 | end;
332 |
333 | function THTMLReader.ReadAttrNode: Boolean;
334 | var
335 | AttrName: string;
336 | begin
337 | Result := False;
338 | SkipWhiteSpaces;
339 | AttrName := LowerCase(GetToken(attrNameDelimiter));
340 | if AttrName = '' then Exit;
341 |
342 | SetNodeName(AttrName);
343 | FireEvent(FOnAttributeStart);
344 |
345 | FState := rsBeforeValue;
346 | FQuotation := 0;
347 |
348 | Result := True;
349 | end;
350 |
351 | function THTMLReader.ReadAttrTextNode: Boolean;
352 | var
353 | Start: Integer;
354 | begin
355 | Result := False;
356 | Start := FPosition;
357 |
358 | while (FPosition <= Length(FHTMLStr)) and (IsAttrTextChar) do Inc(FPosition);
359 | if FPosition = Start then Exit;
360 |
361 | FNodeType := TEXT_NODE;
362 | FNodeValue:= Copy(FHTMLStr, Start, FPosition - Start);
363 | FireEvent(FOnTextNode);
364 |
365 | Result := True;
366 | end;
367 |
368 | function THTMLReader.ReadCharacterData: Boolean;
369 | var
370 | StartPos: Integer;
371 | begin
372 | Inc(FPosition, Length(CDataStartStr));
373 | StartPos := FPosition;
374 | Result := SkipTo(CDataEndStr);
375 | if Result then
376 | begin
377 | FNodeType := CDATA_SECTION_NODE;
378 | FNodeValue := Copy(FHTMLStr, StartPos, FPosition - StartPos - Length(CDataEndStr));
379 | FireEvent(FOnCDataSection)
380 | end
381 | end;
382 |
383 | function THTMLReader.ReadComment: Boolean;
384 | var
385 | StartPos: Integer;
386 | begin
387 | Inc(FPosition, Length(CommentStartStr));
388 | StartPos := FPosition;
389 | Result := SkipTo(CommentEndStr);
390 | if Result then
391 | begin
392 | FNodeType := COMMENT_NODE;
393 | FNodeValue := Copy(FHTMLStr, StartPos, FPosition - StartPos - Length(CommentEndStr));
394 | FireEvent(FOnComment);
395 | end;
396 | end;
397 |
398 | function THTMLReader.ReadDocumentType: Boolean;
399 | var
400 | Name: string;
401 | begin
402 | Result := False;
403 |
404 | Inc(FPosition, Length(DocTypeStartStr));
405 | SkipWhiteSpaces;
406 | Name := GetToken(tagNameDelimiter);
407 | if Name = '' then Exit;
408 |
409 | SetNodeName(Name);
410 | SkipWhiteSpaces;
411 | GetToken(tagNameDelimiter);
412 | SkipWhiteSpaces;
413 | if ((FHTMLStr[FPosition] = '"') or (FHTMLStr[FPosition] = '''')) and not (ReadQuotedValue(FPublicID)) then Exit;
414 |
415 | SkipWhiteSpaces;
416 | if FHTMLStr[FPosition] = '"' then
417 | begin
418 | if not ReadQuotedValue(FSystemID) then Exit
419 | end;
420 |
421 | Result := SkipTo(DocTypeEndStr);
422 | end;
423 |
424 | function THTMLReader.ReadElementNode: Boolean;
425 | var
426 | TagName: string;
427 | begin
428 | Result := False;
429 |
430 | if FPosition < Length(FHTMLStr) then
431 | begin
432 | TagName := LowerCase(GetToken(tagNameDelimiter));
433 | if TagName = '' then Exit;
434 |
435 | FNodeType := ELEMENT_NODE;
436 | SetNodeName(TagName);
437 |
438 | FState := rsBeforeAttr;
439 | FireEvent(FOnElementStart);
440 | Result := True;
441 | end;
442 | end;
443 |
444 | function THTMLReader.ReadEndElementNode: Boolean;
445 | var
446 | TagName: string;
447 | begin
448 | Result := False;
449 |
450 | Inc(FPosition);
451 | if FPosition > Length(FHTMLStr) then Exit;
452 |
453 | TagName := LowerCase(GetToken(tagNameDelimiter));
454 | if TagName = '' then Exit;
455 |
456 | Result := SkipTo(WideChar(endTagChar));
457 | if Result then
458 | begin
459 | FNodeType := END_ELEMENT_NODE;
460 | SetNodeName(TagName);
461 | FireEvent(FOnEndElement);
462 | Result := True;
463 | end;
464 | end;
465 |
466 | function THTMLReader.ReadEntityNode: Boolean;
467 | var
468 | CurrPos: Integer;
469 | begin
470 | Result := False;
471 | CurrPos := FPosition;
472 | Inc(FPosition);
473 | if FPosition > Length(FHTMLStr) then Exit;
474 |
475 | if IsNumericEntity then
476 | begin
477 | Inc(FPosition);
478 | Result := ReadNumericEntityNode;
479 | end
480 | else Result := ReadNamedEntityNode;
481 |
482 | if Result then
483 | begin
484 | FNodeType := ENTITY_REFERENCE_NODE;
485 | // FireEvent(FOnEntityReference); VVV - remove, entity node is added in ReadXXXEntityNode
486 | end
487 | else FPosition := CurrPos;
488 | end;
489 |
490 | function THTMLReader.ReadNamedEntityNode: Boolean;
491 | var
492 | Start: Integer;
493 | begin
494 | Result := False;
495 | if FPosition > Length(FHTMLStr) then Exit;
496 |
497 | Start := FPosition;
498 | while (FPosition <= Length(FHTMLStr)) and IsEntityChar do Inc(FPosition);
499 | if (FPosition > Length(FHTMLStr)) or not IsEndEntityChar then Exit;
500 |
501 | FNodeType := ENTITY_REFERENCE_NODE;
502 | SetNodeName(Copy(FHTMLStr, Start, FPosition - Start));
503 | Inc(FPosition);
504 |
505 | FireEvent(FOnEntityReference);
506 | Result := True;
507 | end;
508 |
509 | function THTMLReader.ReadNumericEntityNode: Boolean;
510 | var
511 | Value: Word;
512 | HexBase: Boolean;
513 | begin
514 | Result := False;
515 | if FPosition > Length(FHTMLStr) then Exit;
516 |
517 | HexBase := IsHexEntityChar;
518 | if HexBase then Inc(FPosition);
519 |
520 | Value := 0;
521 | while (FPosition <= Length(FHTMLStr)) and IsDigit(HexBase) do
522 | begin
523 | try
524 | if HexBase then Value := Value * 16 + HexValue(FHTMLStr[FPosition])
525 | else Value := Value * 10 + DecValue(FHTMLStr[FPosition])
526 | except
527 | Exit
528 | end;
529 |
530 | Inc(FPosition);
531 | end;
532 |
533 | if (FPosition > Length(FHTMLStr)) or not IsEndEntityChar then Exit;
534 | Inc(FPosition);
535 | FNodeType := TEXT_NODE;
536 | FNodeValue := WideChar(Value);
537 | FireEvent(FOnTextNode);
538 | Result := True;
539 | end;
540 |
541 | function THTMLReader.ReadQuotedValue(var Value: string): Boolean;
542 | var
543 | QuotedChar: WideChar;
544 | Start: Integer;
545 | begin
546 | QuotedChar := FHTMLStr[FPosition];
547 | Inc(FPosition);
548 | Start := FPosition;
549 | Result := SkipTo(QuotedChar);
550 | if Result then Value := Copy(FHTMLStr, Start, FPosition - Start);
551 | end;
552 |
553 | function THTMLReader.ReadScript: Boolean;
554 | var
555 | StartPos: Integer;
556 | begin
557 | Inc(FPosition, Length(ScriptStartStr));
558 | StartPos := FPosition;
559 | Result := SkipTo(ScriptEndStr);
560 | if Result then
561 | begin
562 | FNodeType := SCRIPT_NODE;
563 | FNodeValue := Copy(FHtmlStr, StartPos, FPosition - StartPos - Length(ScriptEndStr));
564 | FireEvent(FOnScript)
565 | end
566 | end;
567 |
568 | function THTMLReader.ReadSpecialNode: Boolean;
569 | begin
570 | Result := False;
571 |
572 | Inc(FPosition);
573 | if FPosition > Length(FHTMLStr) then Exit;
574 |
575 | if IsStartDocumentType then Result := ReadDocumentType
576 | else if IsStartCharacterData then Result := ReadCharacterData
577 | else if IsStartComment then Result := ReadComment;
578 | end;
579 |
580 | function THTMLReader.ReadTagNode: Boolean;
581 | var
582 | CurrPos: Integer;
583 | begin
584 | Result := False;
585 |
586 | CurrPos := FPosition;
587 | Inc(FPosition);
588 | if FPosition > Length(FHTMLStr) then Exit;
589 |
590 | if IsSlashChar then Result := ReadEndElementNode
591 | else if IsSpecialTagChar then Result := ReadSpecialNode
592 | else if IsStartScript then Result := ReadScript
593 | else Result := ReadElementNode;
594 |
595 | if not Result then FPosition := CurrPos;
596 | end;
597 |
598 | function THTMLReader.SkipTo(const Signature: string): Boolean;
599 | begin
600 | while FPosition <= Length(FHTMLStr) do
601 | begin
602 | if Match(Signature, False) then
603 | begin
604 | Inc(FPosition, Length(Signature));
605 | Exit(True);
606 | end;
607 |
608 | Inc(FPosition);
609 | end;
610 |
611 | Result := False;
612 | end;
613 |
614 | procedure THTMLReader.FireEvent(Event: TNotifyEvent);
615 | begin
616 | if Assigned(Event) then Event(Self);
617 | end;
618 |
619 | function THTMLReader.Read: Boolean;
620 | begin
621 | FNodeType := NONE;
622 | FPrefix := '';
623 | FLocalName := '';
624 | FNodeValue := '';
625 | FPublicID := '';
626 | FSystemID := '';
627 | FIsEmptyElement := False;
628 | Result := False;
629 |
630 | if FPosition > Length(FHTMLStr) then Exit;
631 | Result := True;
632 |
633 | if FState in [rsBeforeValue, rsInValue, rsInQuotedValue] then
634 | begin
635 | if ReadValueNode then Exit;
636 |
637 | if FState = rsInQuotedValue then Inc(FPosition);
638 | FNodeType := ATTRIBUTE_NODE;
639 | FireEvent(FOnAttributeEnd);
640 | FState := rsBeforeAttr;
641 | end
642 | else if FState = rsBeforeAttr then
643 | begin
644 | if ReadAttrNode then Exit;
645 |
646 | ReadElementTail;
647 | FState := rsInitial;
648 | end
649 | else if IsStartTagChar then
650 | begin
651 | if ReadTagNode then Exit;
652 |
653 | Inc(FPosition);
654 | FNodeType := ENTITY_REFERENCE_NODE;
655 | SetNodeName('lt');
656 | FireEvent(FOnEntityReference);
657 | end
658 | else if IsStartEntityChar then
659 | begin
660 | if ReadEntityNode then Exit;
661 |
662 | Inc(FPosition);
663 | FNodeType := ENTITY_REFERENCE_NODE;
664 | SetNodeName('amp');
665 | FireEvent(FOnEntityReference);
666 | end
667 | else ReadTextNode;
668 | end;
669 |
670 | procedure THTMLReader.ReadTextNode;
671 | var
672 | Start: Integer;
673 | begin
674 | Start := FPosition;
675 | repeat Inc(FPosition)
676 | until (FPosition > Length(FHTMLStr)) or IsStartMarkupChar;
677 | FNodeType := TEXT_NODE;
678 | FNodeValue:= Copy(FHTMLStr, Start, FPosition - Start);
679 |
680 | FireEvent(FOnTextNode);
681 | end;
682 |
683 | function THTMLReader.ReadValueNode: Boolean;
684 | begin
685 | Result := False;
686 | if FState = rsBeforeValue then
687 | begin
688 | SkipWhiteSpaces;
689 | if FPosition > Length(FHTMLStr) then Exit;
690 | if not IsEqualChar then Exit;
691 |
692 | Inc(FPosition);
693 | SkipWhiteSpaces;
694 | if FPosition > Length(FHTMLStr) then Exit;
695 |
696 | if IsQuotation then
697 | begin
698 | FQuotation := Ord(FHTMLStr[FPosition]);
699 | Inc(FPosition);
700 | FState := rsInQuotedValue;
701 | end
702 | else FState := rsInValue;
703 | end;
704 |
705 | if FPosition > Length(FHTMLStr) then Exit;
706 |
707 | if IsStartEntityChar then
708 | begin
709 | Result := True;
710 | if ReadEntityNode then Exit;
711 |
712 | Inc(FPosition);
713 | FNodeType := ENTITY_REFERENCE_NODE;
714 |
715 | SetNodeName('amp');
716 | FireEvent(FOnEntityReference);
717 | end
718 | else Result := ReadAttrTextNode;
719 | end;
720 |
721 | procedure THTMLReader.ReadElementTail;
722 | begin
723 | SkipWhiteSpaces;
724 |
725 | if (FPosition <= Length(FHTMLStr)) and (IsSlashChar) then
726 | begin
727 | FIsEmptyElement := True;
728 | Inc(FPosition)
729 | end;
730 |
731 | SkipTo(WideChar(endTagChar));
732 | FNodeType := ELEMENT_NODE;
733 | FireEvent(FOnElementEnd);
734 | end;
735 |
736 | procedure THTMLReader.SetHTMLStr(const Value: string);
737 | begin
738 | FHTMLStr := Value;
739 | FPosition := 1;
740 | end;
741 |
742 | procedure THTMLReader.SetNodeName(Value: string);
743 | var
744 | I: Integer;
745 | begin
746 | I := Pos(':', Value);
747 |
748 | if I > 0 then
749 | begin
750 | FPrefix := Copy(Value, 1, I - 1);
751 | FLocalName := Copy(Value, I + 1, Length(Value) - I);
752 | end
753 | else
754 | begin
755 | FPrefix := '';
756 | FLocalName := Value;
757 | end
758 | end;
759 |
760 | procedure THTMLReader.SkipWhiteSpaces;
761 | begin
762 | while (FPosition <= Length(FHTMLStr))
763 | and (Ord(FHTMLStr[FPosition]) in whiteSpace) do Inc(FPosition);
764 | end;
765 |
766 | end.
767 |
--------------------------------------------------------------------------------
/HTMLp.HTMLTags.pas:
--------------------------------------------------------------------------------
1 | unit HTMLp.HTMLTags;
2 |
3 | interface
4 |
5 | uses
6 | Classes,
7 |
8 | HTMLp.DomCore;
9 |
10 | const
11 | MAX_TAGS_COUNT = 128;
12 | MAX_FLAGS_COUNT = 32;
13 |
14 | type
15 | THTMLTagSet = set of 0..MAX_TAGS_COUNT - 1;
16 | THTMLTagFlags = set of 0..MAX_FLAGS_COUNT - 1;
17 |
18 | THTMLTag = class
19 | private
20 | FName: WideString;
21 | FNumber: Integer;
22 | FParserFlags: THTMLTagFlags;
23 | FFormatterFlags: THTMLTagFlags;
24 | public
25 | constructor Create(const AName: WideString; ANumber: Integer; AParserFlags, AFormatterFlags: THTMLTagFlags);
26 |
27 | property Name: WideString read FName;
28 | property Number: Integer read FNumber;
29 | property ParserFlags: THTMLTagFlags read FParserFlags;
30 | property FormatterFlags: THTMLTagFlags read FFormatterFlags;
31 | end;
32 |
33 | TCompareTag = function(Tag: THTMLTag): Integer of object;
34 |
35 | THTMLTagList = class
36 | private
37 | FList: TList;
38 | FUnknownTag: THTMLTag;
39 | FSearchName: WideString;
40 | FSearchNumber: Integer;
41 | function CompareName(Tag: THTMLTag): Integer;
42 | function CompareNumber(Tag: THTMLTag): Integer;
43 | function GetTag(Compare: TCompareTag): THTMLTag;
44 | public
45 | constructor Create;
46 | destructor Destroy; override;
47 |
48 | function GetTagByName(const Name: string): THTMLTag;
49 | function GetTagByNumber(Number: Integer): THTMLTag;
50 | end;
51 |
52 | TURLSchemes = class(TStringList)
53 | private
54 | FMaxLen: Integer;
55 | public
56 | function Add(const S: string): Integer; override;
57 | function IsURL(const S: string): Boolean;
58 | function GetScheme(const S: string): string;
59 |
60 | property MaxLen: Integer read FMaxLen;
61 | end;
62 |
63 | var
64 | URLSchemes: TURLSchemes;
65 |
66 | const
67 | UNKNOWN_TAG = 0;
68 | A_TAG = 1;
69 | ABBR_TAG = 2;
70 | ACRONYM_TAG = 3;
71 | ADDRESS_TAG = 4;
72 | APPLET_TAG = 5;
73 | AREA_TAG = 6;
74 | B_TAG = 7;
75 | BASE_TAG = 8;
76 | BASEFONT_TAG = 9;
77 | BDO_TAG = 10;
78 | BIG_TAG = 11;
79 | BLOCKQUOTE_TAG = 12;
80 | BODY_TAG = 13;
81 | BR_TAG = 14;
82 | BUTTON_TAG = 15;
83 | CAPTION_TAG = 16;
84 | CENTER_TAG = 17;
85 | CITE_TAG = 18;
86 | CODE_TAG = 19;
87 | COL_TAG = 20;
88 | COLGROUP_TAG = 21;
89 | DD_TAG = 22;
90 | DEL_TAG = 23;
91 | DFN_TAG = 24;
92 | DIR_TAG = 25;
93 | DIV_TAG = 26;
94 | DL_TAG = 27;
95 | DT_TAG = 28;
96 | EM_TAG = 29;
97 | FIELDSET_TAG = 30;
98 | FONT_TAG = 31;
99 | FORM_TAG = 32;
100 | FRAME_TAG = 33;
101 | FRAMESET_TAG = 34;
102 | H1_TAG = 35;
103 | H2_TAG = 36;
104 | H3_TAG = 37;
105 | H4_TAG = 38;
106 | H5_TAG = 39;
107 | H6_TAG = 40;
108 | HEAD_TAG = 41;
109 | HR_TAG = 42;
110 | HTML_TAG = 43;
111 | I_TAG = 44;
112 | IFRAME_TAG = 45;
113 | IMG_TAG = 46;
114 | INPUT_TAG = 47;
115 | INS_TAG = 48;
116 | ISINDEX_TAG = 49;
117 | KBD_TAG = 50;
118 | LABEL_TAG = 51;
119 | LEGEND_TAG = 52;
120 | LI_TAG = 53;
121 | LINK_TAG = 54;
122 | MAP_TAG = 55;
123 | MENU_TAG = 56;
124 | META_TAG = 57;
125 | NOFRAMES_TAG = 58;
126 | NOSCRIPT_TAG = 59;
127 | OBJECT_TAG = 60;
128 | OL_TAG = 61;
129 | OPTGROUP_TAG = 62;
130 | OPTION_TAG = 63;
131 | P_TAG = 64;
132 | PARAM_TAG = 65;
133 | PRE_TAG = 66;
134 | Q_TAG = 67;
135 | S_TAG = 68;
136 | SAMP_TAG = 69;
137 | SCRIPT_TAG = 70;
138 | SELECT_TAG = 71;
139 | SMALL_TAG = 72;
140 | SPAN_TAG = 73;
141 | STRIKE_TAG = 74;
142 | STRONG_TAG = 75;
143 | STYLE_TAG = 76;
144 | SUB_TAG = 77;
145 | SUP_TAG = 78;
146 | TABLE_TAG = 79;
147 | TBODY_TAG = 80;
148 | TD_TAG = 81;
149 | TEXTAREA_TAG = 82;
150 | TFOOT_TAG = 83;
151 | TH_TAG = 84;
152 | THEAD_TAG = 85;
153 | TITLE_TAG = 86;
154 | TR_TAG = 87;
155 | TT_TAG = 88;
156 | U_TAG = 89;
157 | UL_TAG = 90;
158 | VAR_TAG = 91;
159 | {}
160 | SOURCE_TAG = 92;
161 | TRACK_TAG = 93;
162 | WBR_TAG = 94;
163 | EMBED_TAG = 95;
164 | ASIDE_TAG = 96;
165 | ARTICLE_TAG = 97;
166 | BDI_TAG = 98;
167 | FIGURE_TAG = 99;
168 | FOOTER_TAG = 100;
169 | FIGCAPTION_TAG = 101;
170 | HEADER_TAG = 102;
171 | MAIN_TAG = 103;
172 | MARK_TAG = 104;
173 | NAV_TAG = 105;
174 | TIME_TAG = 106;
175 | SECTION_TAG = 107;
176 | PLAINTEXT_TAG = 108;
177 | XMP_TAG = 109;
178 | VIDEO_TAG = 110;
179 | AUDIO_TAG = 111;
180 |
181 | {https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements}
182 | BlockTags = [ADDRESS_TAG, TIME_TAG, BLOCKQUOTE_TAG, CENTER_TAG, DIV_TAG, DL_TAG, FIELDSET_TAG, {FORM_TAG,} A_TAG, H1_TAG,
183 | H2_TAG, H3_TAG, H4_TAG, H5_TAG, H6_TAG, HR_TAG, NOSCRIPT_TAG, OL_TAG, PRE_TAG, TABLE_TAG, UL_TAG,
184 | ARTICLE_TAG, SECTION_TAG, ASIDE_TAG, MAIN_TAG, HEADER_TAG, FOOTER_TAG];
185 | {}
186 | BlockParentTags = [ADDRESS_TAG, BLOCKQUOTE_TAG, CENTER_TAG, DIV_TAG, DL_TAG, FIELDSET_TAG, A_TAG,
187 | H1_TAG, H2_TAG, H3_TAG, H4_TAG, H5_TAG, H6_TAG, HR_TAG, LI_TAG, NOSCRIPT_TAG, OL_TAG, PRE_TAG, TD_TAG, TH_TAG, UL_TAG,
188 | ARTICLE_TAG, SECTION_TAG, ASIDE_TAG, MAIN_TAG, HEADER_TAG, FOOTER_TAG];
189 | {https://developer.mozilla.org/en-US/docs/Learn/HTML/Introduction_to_HTML/The_head_metadata_in_HTML}
190 | HeadTags = [BASE_TAG, LINK_TAG, META_TAG, SCRIPT_TAG, STYLE_TAG, TITLE_TAG];
191 | {https://developer.mozilla.org/en-US/docs/Glossary/Empty_element}
192 | EmptyTags = [AREA_TAG, BASE_TAG, BASEFONT_TAG, BR_TAG, COL_TAG, FRAME_TAG, EMBED_TAG, HR_TAG, IMG_TAG, INPUT_TAG, ISINDEX_TAG,
193 | LINK_TAG, META_TAG, PARAM_TAG, SOURCE_TAG, TRACK_TAG, WBR_TAG];
194 | {}
195 | NeedFindParentTags = [COL_TAG, COLGROUP_TAG, DD_TAG, DT_TAG, LI_TAG, OPTION_TAG, P_TAG, TABLE_TAG, TBODY_TAG, TD_TAG,
196 | TFOOT_TAG, TH_TAG, THEAD_TAG, TR_TAG];
197 | {Other}
198 | PreserveWhiteSpaceTags = [PRE_TAG];
199 | ListItemParentTags = [DIR_TAG, MENU_TAG, OL_TAG, UL_TAG];
200 | DefItemParentTags = [DL_TAG];
201 | TableSectionParentTags = [TABLE_TAG];
202 | ColParentTags = [COLGROUP_TAG];
203 | RowParentTags = [TABLE_TAG, TBODY_TAG, TFOOT_TAG, THEAD_TAG];
204 | CellParentTags = [TR_TAG];
205 | OptionParentTags = [OPTGROUP_TAG, SELECT_TAG];
206 | PlainTextTags = [CODE_TAG, SCRIPT_TAG, PLAINTEXT_TAG, XMP_TAG];
207 |
208 | implementation
209 |
210 | uses
211 | SysUtils;
212 |
213 | constructor THTMLTag.Create(const AName: WideString; ANumber: Integer; AParserFlags, AFormatterFlags: THTMLTagFlags);
214 | begin
215 | inherited Create;
216 |
217 | FName := AName;
218 | FNumber := ANumber
219 | end;
220 |
221 | constructor THTMLTagList.Create;
222 | begin
223 | inherited Create;
224 |
225 | FList := TList.Create;
226 | FList.Capacity := MAX_TAGS_COUNT;
227 | FList.Add(THTMLTag.Create('a', A_TAG, [], []));
228 | FList.Add(THTMLTag.Create('abbr', ABBR_TAG, [], []));
229 | FList.Add(THTMLTag.Create('acronym', ACRONYM_TAG, [], []));
230 | FList.Add(THTMLTag.Create('address', ADDRESS_TAG, [], []));
231 | FList.Add(THTMLTag.Create('applet', APPLET_TAG, [], []));
232 | FList.Add(THTMLTag.Create('area', AREA_TAG, [], []));
233 | FList.Add(THTMLTag.Create('aside', ASIDE_TAG, [], []));
234 | FList.Add(THTMLTag.Create('article', ARTICLE_TAG, [], []));
235 | FList.Add(THTMLTag.Create('audio', AUDIO_TAG, [], []));
236 | FList.Add(THTMLTag.Create('b', B_TAG, [], []));
237 | FList.Add(THTMLTag.Create('base', BASE_TAG, [], []));
238 | FList.Add(THTMLTag.Create('basefont', BASEFONT_TAG, [], []));
239 | FList.Add(THTMLTag.Create('bdo', BDO_TAG, [], []));
240 | FList.Add(THTMLTag.Create('big', BIG_TAG, [], []));
241 | FList.Add(THTMLTag.Create('blockquote', BLOCKQUOTE_TAG, [], []));
242 | FList.Add(THTMLTag.Create('body', BODY_TAG, [], []));
243 | FList.Add(THTMLTag.Create('br', BR_TAG, [], []));
244 | FList.Add(THTMLTag.Create('button', BUTTON_TAG, [], []));
245 | FList.Add(THTMLTag.Create('bdi', BDI_TAG, [], []));
246 | FList.Add(THTMLTag.Create('caption', CAPTION_TAG, [], []));
247 | FList.Add(THTMLTag.Create('center', CENTER_TAG, [], []));
248 | FList.Add(THTMLTag.Create('cite', CITE_TAG, [], []));
249 | FList.Add(THTMLTag.Create('code', CODE_TAG, [], []));
250 | FList.Add(THTMLTag.Create('col', COL_TAG, [], []));
251 | FList.Add(THTMLTag.Create('colgroup', COLGROUP_TAG, [], []));
252 | FList.Add(THTMLTag.Create('dd', DD_TAG, [], []));
253 | FList.Add(THTMLTag.Create('del', DEL_TAG, [], []));
254 | FList.Add(THTMLTag.Create('dfn', DFN_TAG, [], []));
255 | FList.Add(THTMLTag.Create('dir', DIR_TAG, [], []));
256 | FList.Add(THTMLTag.Create('div', DIV_TAG, [], []));
257 | FList.Add(THTMLTag.Create('dl', DL_TAG, [], []));
258 | FList.Add(THTMLTag.Create('dt', DT_TAG, [], []));
259 | FList.Add(THTMLTag.Create('em', EM_TAG, [], []));
260 | FList.Add(THTMLTag.Create('embed', EMBED_TAG, [], []));
261 | FList.Add(THTMLTag.Create('fieldset', FIELDSET_TAG, [], []));
262 | FList.Add(THTMLTag.Create('font', FONT_TAG, [], []));
263 | FList.Add(THTMLTag.Create('form', FORM_TAG, [], []));
264 | FList.Add(THTMLTag.Create('frame', FRAME_TAG, [], []));
265 | FList.Add(THTMLTag.Create('frameset', FRAMESET_TAG, [], []));
266 | FList.Add(THTMLTag.Create('figure', FIGURE_TAG, [], []));
267 | FList.Add(THTMLTag.Create('footer', FOOTER_TAG, [], []));
268 | FList.Add(THTMLTag.Create('figcaption', FIGCAPTION_TAG, [], []));
269 | FList.Add(THTMLTag.Create('h1', H1_TAG, [], []));
270 | FList.Add(THTMLTag.Create('h2', H2_TAG, [], []));
271 | FList.Add(THTMLTag.Create('h3', H3_TAG, [], []));
272 | FList.Add(THTMLTag.Create('h4', H4_TAG, [], []));
273 | FList.Add(THTMLTag.Create('h5', H5_TAG, [], []));
274 | FList.Add(THTMLTag.Create('h6', H6_TAG, [], []));
275 | FList.Add(THTMLTag.Create('head', HEAD_TAG, [], []));
276 | FList.Add(THTMLTag.Create('header', HEADER_TAG, [], []));
277 | FList.Add(THTMLTag.Create('hr', HR_TAG, [], []));
278 | FList.Add(THTMLTag.Create('html', HTML_TAG, [], []));
279 | FList.Add(THTMLTag.Create('i', I_TAG, [], []));
280 | FList.Add(THTMLTag.Create('iframe', IFRAME_TAG, [], []));
281 | FList.Add(THTMLTag.Create('img', IMG_TAG, [], []));
282 | FList.Add(THTMLTag.Create('input', INPUT_TAG, [], []));
283 | FList.Add(THTMLTag.Create('ins', INS_TAG, [], []));
284 | FList.Add(THTMLTag.Create('isindex', ISINDEX_TAG, [], []));
285 | FList.Add(THTMLTag.Create('kbd', KBD_TAG, [], []));
286 | FList.Add(THTMLTag.Create('label', LABEL_TAG, [], []));
287 | FList.Add(THTMLTag.Create('legend', LEGEND_TAG, [], []));
288 | FList.Add(THTMLTag.Create('li', LI_TAG, [], []));
289 | FList.Add(THTMLTag.Create('link', LINK_TAG, [], []));
290 | FList.Add(THTMLTag.Create('map', MAP_TAG, [], []));
291 | FList.Add(THTMLTag.Create('menu', MENU_TAG, [], []));
292 | FList.Add(THTMLTag.Create('meta', META_TAG, [], []));
293 | FList.Add(THTMLTag.Create('main', MAIN_TAG, [], []));
294 | FList.Add(THTMLTag.Create('mark', MARK_TAG, [], []));
295 | FList.Add(THTMLTag.Create('noframes', NOFRAMES_TAG, [], []));
296 | FList.Add(THTMLTag.Create('noscript', NOSCRIPT_TAG, [], []));
297 | FList.Add(THTMLTag.Create('nav', NAV_TAG, [], []));
298 | FList.Add(THTMLTag.Create('object', OBJECT_TAG, [], []));
299 | FList.Add(THTMLTag.Create('ol', OL_TAG, [], []));
300 | FList.Add(THTMLTag.Create('optgroup', OPTGROUP_TAG, [], []));
301 | FList.Add(THTMLTag.Create('option', OPTION_TAG, [], []));
302 | FList.Add(THTMLTag.Create('p', P_TAG, [], []));
303 | FList.Add(THTMLTag.Create('param', PARAM_TAG, [], []));
304 | FList.Add(THTMLTag.Create('pre', PRE_TAG, [], []));
305 | FList.Add(THTMLTag.Create('plaintext', PLAINTEXT_TAG, [], []));
306 | FList.Add(THTMLTag.Create('q', Q_TAG, [], []));
307 | FList.Add(THTMLTag.Create('s', S_TAG, [], []));
308 | FList.Add(THTMLTag.Create('samp', SAMP_TAG, [], []));
309 | FList.Add(THTMLTag.Create('script', SCRIPT_TAG, [], []));
310 | FList.Add(THTMLTag.Create('select', SELECT_TAG, [], []));
311 | FList.Add(THTMLTag.Create('small', SMALL_TAG, [], []));
312 | FList.Add(THTMLTag.Create('span', SPAN_TAG, [], []));
313 | FList.Add(THTMLTag.Create('strike', STRIKE_TAG, [], []));
314 | FList.Add(THTMLTag.Create('strong', STRONG_TAG, [], []));
315 | FList.Add(THTMLTag.Create('style', STYLE_TAG, [], []));
316 | FList.Add(THTMLTag.Create('sub', SUB_TAG, [], []));
317 | FList.Add(THTMLTag.Create('sup', SUP_TAG, [], []));
318 | FList.Add(THTMLTag.Create('source', SOURCE_TAG, [], []));
319 | FList.Add(THTMLTag.Create('section', SECTION_TAG, [], []));
320 | FList.Add(THTMLTag.Create('table', TABLE_TAG, [], []));
321 | FList.Add(THTMLTag.Create('tbody', TBODY_TAG, [], []));
322 | FList.Add(THTMLTag.Create('td', TD_TAG, [], []));
323 | FList.Add(THTMLTag.Create('textarea', TEXTAREA_TAG, [], []));
324 | FList.Add(THTMLTag.Create('tfoot', TFOOT_TAG, [], []));
325 | FList.Add(THTMLTag.Create('th', TH_TAG, [], []));
326 | FList.Add(THTMLTag.Create('thead', THEAD_TAG, [], []));
327 | FList.Add(THTMLTag.Create('title', TITLE_TAG, [], []));
328 | FList.Add(THTMLTag.Create('tr', TR_TAG, [], []));
329 | FList.Add(THTMLTag.Create('tt', TT_TAG, [], []));
330 | FList.Add(THTMLTag.Create('track', TRACK_TAG, [], []));
331 | FList.Add(THTMLTag.Create('time', TIME_TAG, [], []));
332 | FList.Add(THTMLTag.Create('u', U_TAG, [], []));
333 | FList.Add(THTMLTag.Create('ul', UL_TAG, [], []));
334 | FList.Add(THTMLTag.Create('var', VAR_TAG, [], []));
335 | FList.Add(THTMLTag.Create('video', VIDEO_TAG, [], []));
336 | FList.Add(THTMLTag.Create('wbr', WBR_TAG, [], []));
337 | FList.Add(THTMLTag.Create('xmp', XMP_TAG, [], []));
338 |
339 | FUnknownTag := THTMLTag.Create('', UNKNOWN_TAG, [], []);
340 | end;
341 |
342 | destructor THTMLTagList.Destroy;
343 | var
344 | I: Integer;
345 | begin
346 | for I := FList.Count - 1 downto 0 do THTMLTag(FList[I]).Free;
347 | FList.Free;
348 | FUnknownTag.Free;
349 |
350 | inherited Destroy;
351 | end;
352 |
353 | {function THTMLTagList.GetTag(Compare: TCompareTag): THTMLTag;
354 | var
355 | I, Low, High, Rel: Integer;
356 | begin
357 | Result := nil;
358 |
359 | for i := FList.Count - 1 downto 0 do
360 | begin
361 | if Compare(FList[i]) <> 0 then
362 | begin
363 | Result := FList[i];
364 | Exit;
365 | end;
366 | end;
367 | end;}
368 |
369 | function THTMLTagList.GetTag(Compare: TCompareTag): THTMLTag;
370 | var
371 | I, Low, High, Rel: Integer;
372 | begin
373 | Low := -1;
374 | High := FList.Count - 1;
375 |
376 | while High - Low > 1 do
377 | begin
378 | I := (High + Low) div 2;
379 | Result := FList[I];
380 |
381 | Rel := Compare(Result);
382 | if Rel < 0 then High := I
383 | else if Rel > 0 then Low := I
384 | else Exit;
385 | end;
386 |
387 | if High >= 0 then
388 | begin
389 | Result := FList[High];
390 | if Compare(Result) = 0 then Exit;
391 | end;
392 |
393 | Result := nil;
394 | end;
395 |
396 | function THTMLTagList.CompareName(Tag: THTMLTag): Integer;
397 | begin
398 | Result := AnsiCompareStr(FSearchName, Tag.Name)
399 | end;
400 |
401 | function THTMLTagList.CompareNumber(Tag: THTMLTag): Integer;
402 | begin
403 | Result := (FSearchNumber - Tag.Number);
404 | end;
405 |
406 | function THTMLTagList.GetTagByName(const Name: string): THTMLTag;
407 | begin
408 | FSearchName := Name;
409 | Result := GetTag(CompareName);
410 | if Result = nil then Result := FUnknownTag;
411 | end;
412 |
413 | function THTMLTagList.GetTagByNumber(Number: Integer): THTMLTag;
414 | begin
415 | FSearchNumber := Number;
416 | Result := GetTag(CompareNumber);
417 | end;
418 |
419 | function TURLSchemes.Add(const S: string): Integer;
420 | begin
421 | if Length(S) > FMaxLen then FMaxLen := Length(S);
422 | Result := inherited Add(S);
423 | end;
424 |
425 | function TURLSchemes.IsURL(const S: string): Boolean;
426 | begin
427 | Result := IndexOf(LowerCase(S)) >= 0;
428 | end;
429 |
430 | function TURLSchemes.GetScheme(const S: string): string;
431 | const
432 | SchemeChars = [Ord('A')..Ord('Z'), Ord('a')..Ord('z')];
433 | var
434 | I: Integer;
435 | begin
436 | Result := '';
437 | for I := 1 to MaxLen + 1 do
438 | begin
439 | if I > Length(S) then Exit;
440 |
441 | if S[I] = ':' then
442 | begin
443 | if IsURL(Copy(S, 1, I - 1)) then Result := Copy(S, 1, I - 1);
444 | Exit
445 | end
446 | end
447 | end;
448 |
449 | initialization
450 | //HTMLTagList := THTMLTagList.Create;
451 | URLSchemes := TURLSchemes.Create;
452 | URLSchemes.Add('http');
453 | URLSchemes.Add('https');
454 | URLSchemes.Add('ftp');
455 | URLSchemes.Add('mailto');
456 | URLSchemes.Add('news');
457 | URLSchemes.Add('nntp');
458 | URLSchemes.Add('gopher');
459 |
460 | finalization
461 | //HTMLTagList.Free;
462 | URLSchemes.Free;
463 |
464 | end.
465 |
--------------------------------------------------------------------------------
/HTMLp.Helper.pas:
--------------------------------------------------------------------------------
1 | unit HTMLp.Helper;
2 |
3 | interface
4 |
5 | uses System.Classes, System.SysUtils, System.StrUtils, System.Math, System.Variants, System.Types,
6 | System.Generics.Collections,
7 |
8 | HTMLp.DomCore, HTMLp.Formatter,
9 | HTMLp.HtmlParser;
10 |
11 | type
12 | TNode = HTMLp.DomCore.TNode;
13 | TNodeList = HTMLp.DomCore.TNodeList;
14 | TElement = HTMLp.DomCore.TElement;
15 |
16 | IHTMLParser = interface
17 | ['{A2496DE9-17B0-40FC-A804-D19363A14154}']
18 | function Find(const selector: string): IHTMLParser;
19 | function Map(callback: TProc): IHTMLParser;
20 | {}
21 | function IsValid: Boolean;
22 | function GetRootNode: TElement;
23 | function GetFirstNode: TElement;
24 | function GetLastNode: TElement;
25 | function GetNodeCount: Integer;
26 | function GetNodeList: TNodeList;
27 | function Clear: IHTMLParser;
28 | {}
29 | property RootNode: TElement read GetRootNode;
30 | property NodeList: TNodeList read GetNodeList;
31 | property NodeCount: Integer read GetNodeCount;
32 | end;
33 |
34 | THTMLParserHelper = class(TInterfacedObject, IHTMLParser)
35 | private
36 | FDocument: TDocument;
37 | FCurrentNodeList: TNodeList;
38 | FCurrentElement: TElement;
39 | FNodeList: TList;
40 | public
41 | constructor Create(const HTML: string); overload;
42 | destructor Destroy; override;
43 |
44 | function Find(const selector: string): IHTMLParser;
45 | function SelectNode(const nodeIndex: Integer): IHTMLParser;
46 | function Map(callback: TProc): IHTMLParser;
47 |
48 | function IsValid: Boolean;
49 | function GetRootNode: TElement;
50 | function GetFirstNode: TElement;
51 | function GetLastNode: TElement;
52 | function GetNodeCount: Integer;
53 | function GetNodeList: TNodeList;
54 | function Clear: IHTMLParser;
55 |
56 | property RootNode: TElement read GetRootNode;
57 | property NodeList: TNodeList read GetNodeList;
58 | property NodeCount: Integer read GetNodeCount;
59 | end;
60 |
61 | function ParseHTML(const HTML: string): IHTMLParser;
62 |
63 | implementation
64 |
65 | function ParseHTML(const HTML: string): IHTMLParser;
66 | begin
67 | Result := THTMLParserHelper.Create(HTML);
68 | end;
69 |
70 | constructor THTMLParserHelper.Create(const HTML: string);
71 | begin
72 | inherited Create;
73 |
74 | {}
75 | with THTMLParser.Create do
76 | begin
77 | try FDocument := parseString(HTML); except end;
78 | FCurrentElement := GetRootNode;
79 |
80 | Free;
81 | end;
82 |
83 | {}
84 | FNodeList := TList.Create;
85 | end;
86 |
87 | destructor THTMLParserHelper.Destroy;
88 | begin
89 | Clear;
90 | FCurrentElement := nil;
91 |
92 | FreeAndNil(FNodeList);
93 | FreeAndNil(FDocument);
94 |
95 | inherited;
96 | end;
97 |
98 | function THTMLParserHelper.Find(const selector: string): IHTMLParser;
99 | begin
100 | Result := Self;
101 | FCurrentNodeList := nil;
102 |
103 | try
104 | if Pos('/', selector) = 1 then FCurrentNodeList := FCurrentElement.GetElementsByXPath(selector)
105 | else FCurrentNodeList := FCurrentElement.GetElementsByCSSSelector(selector);
106 | except end;
107 |
108 | {if not (Assigned(FCurrentNodeList)) then
109 | begin
110 | FCurrentNodeList := FDocument.CreateElement('div');
111 | end; }
112 |
113 | if Assigned(FCurrentNodeList) then FNodeList.Add(FCurrentNodeList);
114 | end;
115 |
116 | function THTMLParserHelper.SelectNode(const nodeIndex: Integer): IHTMLParser;
117 | begin
118 | Result := Self;
119 | if nodeIndex >= FCurrentNodeList.Count then Exit;
120 |
121 | FCurrentElement := (FCurrentNodeList[nodeIndex] as TElement);
122 | end;
123 |
124 | function THTMLParserHelper.Map(callback: TProc): IHTMLParser;
125 | var
126 | i: Integer;
127 | begin
128 | Result := Self;
129 | if not (Assigned(FCurrentNodeList)) then Exit;
130 |
131 | for i := 0 to FCurrentNodeList.Count - 1 do callback(i, (FCurrentNodeList[i] as TElement));
132 | end;
133 |
134 |
135 | {}
136 |
137 | function THTMLParserHelper.IsValid: Boolean;
138 | begin
139 | Result := Assigned(FCurrentElement);
140 | end;
141 |
142 | function THTMLParserHelper.GetRootNode: TElement;
143 | begin
144 | Result := FDocument.DocumentElement;
145 | end;
146 |
147 | function THTMLParserHelper.GetFirstNode: TElement;
148 | var
149 | node: TNode;
150 | begin
151 | Result := nil;
152 |
153 | node := FCurrentNodeList.GetFirst;
154 | if Assigned(node) then Result := (node as TElement);
155 | end;
156 |
157 | function THTMLParserHelper.GetLastNode: TElement;
158 | var
159 | node: TNode;
160 | begin
161 | Result := nil;
162 |
163 | node := FCurrentNodeList.GetLast;
164 | if Assigned(node) then Result := (node as TElement);
165 | end;
166 |
167 | function THTMLParserHelper.GetNodeCount: Integer;
168 | begin
169 | Result := FCurrentNodeList.Count;
170 | end;
171 |
172 | function THTMLParserHelper.GetNodeList: TNodeList;
173 | begin
174 | Result := FCurrentNodeList;
175 | end;
176 |
177 | function THTMLParserHelper.Clear: IHTMLParser;
178 | var
179 | i: Integer;
180 | begin
181 | FCurrentNodeList := nil;
182 |
183 | for i := 0 to FNodeList.Count - 1 do FNodeList[i].Destroy;
184 | FNodeList.Clear;
185 |
186 | Result := Self;
187 | end;
188 |
189 | end.
190 |
191 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | GNU LESSER GENERAL PUBLIC LICENSE
2 | Version 2.1, February 1999
3 |
4 | Copyright (C) 1991, 1999 Free Software Foundation, Inc.
5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
6 | Everyone is permitted to copy and distribute verbatim copies
7 | of this license document, but changing it is not allowed.
8 |
9 | [This is the first released version of the Lesser GPL. It also counts
10 | as the successor of the GNU Library Public License, version 2, hence
11 | the version number 2.1.]
12 |
13 | Preamble
14 |
15 | The licenses for most software are designed to take away your
16 | freedom to share and change it. By contrast, the GNU General Public
17 | Licenses are intended to guarantee your freedom to share and change
18 | free software--to make sure the software is free for all its users.
19 |
20 | This license, the Lesser General Public License, applies to some
21 | specially designated software packages--typically libraries--of the
22 | Free Software Foundation and other authors who decide to use it. You
23 | can use it too, but we suggest you first think carefully about whether
24 | this license or the ordinary General Public License is the better
25 | strategy to use in any particular case, based on the explanations below.
26 |
27 | When we speak of free software, we are referring to freedom of use,
28 | not price. Our General Public Licenses are designed to make sure that
29 | you have the freedom to distribute copies of free software (and charge
30 | for this service if you wish); that you receive source code or can get
31 | it if you want it; that you can change the software and use pieces of
32 | it in new free programs; and that you are informed that you can do
33 | these things.
34 |
35 | To protect your rights, we need to make restrictions that forbid
36 | distributors to deny you these rights or to ask you to surrender these
37 | rights. These restrictions translate to certain responsibilities for
38 | you if you distribute copies of the library or if you modify it.
39 |
40 | For example, if you distribute copies of the library, whether gratis
41 | or for a fee, you must give the recipients all the rights that we gave
42 | you. You must make sure that they, too, receive or can get the source
43 | code. If you link other code with the library, you must provide
44 | complete object files to the recipients, so that they can relink them
45 | with the library after making changes to the library and recompiling
46 | it. And you must show them these terms so they know their rights.
47 |
48 | We protect your rights with a two-step method: (1) we copyright the
49 | library, and (2) we offer you this license, which gives you legal
50 | permission to copy, distribute and/or modify the library.
51 |
52 | To protect each distributor, we want to make it very clear that
53 | there is no warranty for the free library. Also, if the library is
54 | modified by someone else and passed on, the recipients should know
55 | that what they have is not the original version, so that the original
56 | author's reputation will not be affected by problems that might be
57 | introduced by others.
58 |
59 | Finally, software patents pose a constant threat to the existence of
60 | any free program. We wish to make sure that a company cannot
61 | effectively restrict the users of a free program by obtaining a
62 | restrictive license from a patent holder. Therefore, we insist that
63 | any patent license obtained for a version of the library must be
64 | consistent with the full freedom of use specified in this license.
65 |
66 | Most GNU software, including some libraries, is covered by the
67 | ordinary GNU General Public License. This license, the GNU Lesser
68 | General Public License, applies to certain designated libraries, and
69 | is quite different from the ordinary General Public License. We use
70 | this license for certain libraries in order to permit linking those
71 | libraries into non-free programs.
72 |
73 | When a program is linked with a library, whether statically or using
74 | a shared library, the combination of the two is legally speaking a
75 | combined work, a derivative of the original library. The ordinary
76 | General Public License therefore permits such linking only if the
77 | entire combination fits its criteria of freedom. The Lesser General
78 | Public License permits more lax criteria for linking other code with
79 | the library.
80 |
81 | We call this license the "Lesser" General Public License because it
82 | does Less to protect the user's freedom than the ordinary General
83 | Public License. It also provides other free software developers Less
84 | of an advantage over competing non-free programs. These disadvantages
85 | are the reason we use the ordinary General Public License for many
86 | libraries. However, the Lesser license provides advantages in certain
87 | special circumstances.
88 |
89 | For example, on rare occasions, there may be a special need to
90 | encourage the widest possible use of a certain library, so that it becomes
91 | a de-facto standard. To achieve this, non-free programs must be
92 | allowed to use the library. A more frequent case is that a free
93 | library does the same job as widely used non-free libraries. In this
94 | case, there is little to gain by limiting the free library to free
95 | software only, so we use the Lesser General Public License.
96 |
97 | In other cases, permission to use a particular library in non-free
98 | programs enables a greater number of people to use a large body of
99 | free software. For example, permission to use the GNU C Library in
100 | non-free programs enables many more people to use the whole GNU
101 | operating system, as well as its variant, the GNU/Linux operating
102 | system.
103 |
104 | Although the Lesser General Public License is Less protective of the
105 | users' freedom, it does ensure that the user of a program that is
106 | linked with the Library has the freedom and the wherewithal to run
107 | that program using a modified version of the Library.
108 |
109 | The precise terms and conditions for copying, distribution and
110 | modification follow. Pay close attention to the difference between a
111 | "work based on the library" and a "work that uses the library". The
112 | former contains code derived from the library, whereas the latter must
113 | be combined with the library in order to run.
114 |
115 | GNU LESSER GENERAL PUBLIC LICENSE
116 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
117 |
118 | 0. This License Agreement applies to any software library or other
119 | program which contains a notice placed by the copyright holder or
120 | other authorized party saying it may be distributed under the terms of
121 | this Lesser General Public License (also called "this License").
122 | Each licensee is addressed as "you".
123 |
124 | A "library" means a collection of software functions and/or data
125 | prepared so as to be conveniently linked with application programs
126 | (which use some of those functions and data) to form executables.
127 |
128 | The "Library", below, refers to any such software library or work
129 | which has been distributed under these terms. A "work based on the
130 | Library" means either the Library or any derivative work under
131 | copyright law: that is to say, a work containing the Library or a
132 | portion of it, either verbatim or with modifications and/or translated
133 | straightforwardly into another language. (Hereinafter, translation is
134 | included without limitation in the term "modification".)
135 |
136 | "Source code" for a work means the preferred form of the work for
137 | making modifications to it. For a library, complete source code means
138 | all the source code for all modules it contains, plus any associated
139 | interface definition files, plus the scripts used to control compilation
140 | and installation of the library.
141 |
142 | Activities other than copying, distribution and modification are not
143 | covered by this License; they are outside its scope. The act of
144 | running a program using the Library is not restricted, and output from
145 | such a program is covered only if its contents constitute a work based
146 | on the Library (independent of the use of the Library in a tool for
147 | writing it). Whether that is true depends on what the Library does
148 | and what the program that uses the Library does.
149 |
150 | 1. You may copy and distribute verbatim copies of the Library's
151 | complete source code as you receive it, in any medium, provided that
152 | you conspicuously and appropriately publish on each copy an
153 | appropriate copyright notice and disclaimer of warranty; keep intact
154 | all the notices that refer to this License and to the absence of any
155 | warranty; and distribute a copy of this License along with the
156 | Library.
157 |
158 | You may charge a fee for the physical act of transferring a copy,
159 | and you may at your option offer warranty protection in exchange for a
160 | fee.
161 |
162 | 2. You may modify your copy or copies of the Library or any portion
163 | of it, thus forming a work based on the Library, and copy and
164 | distribute such modifications or work under the terms of Section 1
165 | above, provided that you also meet all of these conditions:
166 |
167 | a) The modified work must itself be a software library.
168 |
169 | b) You must cause the files modified to carry prominent notices
170 | stating that you changed the files and the date of any change.
171 |
172 | c) You must cause the whole of the work to be licensed at no
173 | charge to all third parties under the terms of this License.
174 |
175 | d) If a facility in the modified Library refers to a function or a
176 | table of data to be supplied by an application program that uses
177 | the facility, other than as an argument passed when the facility
178 | is invoked, then you must make a good faith effort to ensure that,
179 | in the event an application does not supply such function or
180 | table, the facility still operates, and performs whatever part of
181 | its purpose remains meaningful.
182 |
183 | (For example, a function in a library to compute square roots has
184 | a purpose that is entirely well-defined independent of the
185 | application. Therefore, Subsection 2d requires that any
186 | application-supplied function or table used by this function must
187 | be optional: if the application does not supply it, the square
188 | root function must still compute square roots.)
189 |
190 | These requirements apply to the modified work as a whole. If
191 | identifiable sections of that work are not derived from the Library,
192 | and can be reasonably considered independent and separate works in
193 | themselves, then this License, and its terms, do not apply to those
194 | sections when you distribute them as separate works. But when you
195 | distribute the same sections as part of a whole which is a work based
196 | on the Library, the distribution of the whole must be on the terms of
197 | this License, whose permissions for other licensees extend to the
198 | entire whole, and thus to each and every part regardless of who wrote
199 | it.
200 |
201 | Thus, it is not the intent of this section to claim rights or contest
202 | your rights to work written entirely by you; rather, the intent is to
203 | exercise the right to control the distribution of derivative or
204 | collective works based on the Library.
205 |
206 | In addition, mere aggregation of another work not based on the Library
207 | with the Library (or with a work based on the Library) on a volume of
208 | a storage or distribution medium does not bring the other work under
209 | the scope of this License.
210 |
211 | 3. You may opt to apply the terms of the ordinary GNU General Public
212 | License instead of this License to a given copy of the Library. To do
213 | this, you must alter all the notices that refer to this License, so
214 | that they refer to the ordinary GNU General Public License, version 2,
215 | instead of to this License. (If a newer version than version 2 of the
216 | ordinary GNU General Public License has appeared, then you can specify
217 | that version instead if you wish.) Do not make any other change in
218 | these notices.
219 |
220 | Once this change is made in a given copy, it is irreversible for
221 | that copy, so the ordinary GNU General Public License applies to all
222 | subsequent copies and derivative works made from that copy.
223 |
224 | This option is useful when you wish to copy part of the code of
225 | the Library into a program that is not a library.
226 |
227 | 4. You may copy and distribute the Library (or a portion or
228 | derivative of it, under Section 2) in object code or executable form
229 | under the terms of Sections 1 and 2 above provided that you accompany
230 | it with the complete corresponding machine-readable source code, which
231 | must be distributed under the terms of Sections 1 and 2 above on a
232 | medium customarily used for software interchange.
233 |
234 | If distribution of object code is made by offering access to copy
235 | from a designated place, then offering equivalent access to copy the
236 | source code from the same place satisfies the requirement to
237 | distribute the source code, even though third parties are not
238 | compelled to copy the source along with the object code.
239 |
240 | 5. A program that contains no derivative of any portion of the
241 | Library, but is designed to work with the Library by being compiled or
242 | linked with it, is called a "work that uses the Library". Such a
243 | work, in isolation, is not a derivative work of the Library, and
244 | therefore falls outside the scope of this License.
245 |
246 | However, linking a "work that uses the Library" with the Library
247 | creates an executable that is a derivative of the Library (because it
248 | contains portions of the Library), rather than a "work that uses the
249 | library". The executable is therefore covered by this License.
250 | Section 6 states terms for distribution of such executables.
251 |
252 | When a "work that uses the Library" uses material from a header file
253 | that is part of the Library, the object code for the work may be a
254 | derivative work of the Library even though the source code is not.
255 | Whether this is true is especially significant if the work can be
256 | linked without the Library, or if the work is itself a library. The
257 | threshold for this to be true is not precisely defined by law.
258 |
259 | If such an object file uses only numerical parameters, data
260 | structure layouts and accessors, and small macros and small inline
261 | functions (ten lines or less in length), then the use of the object
262 | file is unrestricted, regardless of whether it is legally a derivative
263 | work. (Executables containing this object code plus portions of the
264 | Library will still fall under Section 6.)
265 |
266 | Otherwise, if the work is a derivative of the Library, you may
267 | distribute the object code for the work under the terms of Section 6.
268 | Any executables containing that work also fall under Section 6,
269 | whether or not they are linked directly with the Library itself.
270 |
271 | 6. As an exception to the Sections above, you may also combine or
272 | link a "work that uses the Library" with the Library to produce a
273 | work containing portions of the Library, and distribute that work
274 | under terms of your choice, provided that the terms permit
275 | modification of the work for the customer's own use and reverse
276 | engineering for debugging such modifications.
277 |
278 | You must give prominent notice with each copy of the work that the
279 | Library is used in it and that the Library and its use are covered by
280 | this License. You must supply a copy of this License. If the work
281 | during execution displays copyright notices, you must include the
282 | copyright notice for the Library among them, as well as a reference
283 | directing the user to the copy of this License. Also, you must do one
284 | of these things:
285 |
286 | a) Accompany the work with the complete corresponding
287 | machine-readable source code for the Library including whatever
288 | changes were used in the work (which must be distributed under
289 | Sections 1 and 2 above); and, if the work is an executable linked
290 | with the Library, with the complete machine-readable "work that
291 | uses the Library", as object code and/or source code, so that the
292 | user can modify the Library and then relink to produce a modified
293 | executable containing the modified Library. (It is understood
294 | that the user who changes the contents of definitions files in the
295 | Library will not necessarily be able to recompile the application
296 | to use the modified definitions.)
297 |
298 | b) Use a suitable shared library mechanism for linking with the
299 | Library. A suitable mechanism is one that (1) uses at run time a
300 | copy of the library already present on the user's computer system,
301 | rather than copying library functions into the executable, and (2)
302 | will operate properly with a modified version of the library, if
303 | the user installs one, as long as the modified version is
304 | interface-compatible with the version that the work was made with.
305 |
306 | c) Accompany the work with a written offer, valid for at
307 | least three years, to give the same user the materials
308 | specified in Subsection 6a, above, for a charge no more
309 | than the cost of performing this distribution.
310 |
311 | d) If distribution of the work is made by offering access to copy
312 | from a designated place, offer equivalent access to copy the above
313 | specified materials from the same place.
314 |
315 | e) Verify that the user has already received a copy of these
316 | materials or that you have already sent this user a copy.
317 |
318 | For an executable, the required form of the "work that uses the
319 | Library" must include any data and utility programs needed for
320 | reproducing the executable from it. However, as a special exception,
321 | the materials to be distributed need not include anything that is
322 | normally distributed (in either source or binary form) with the major
323 | components (compiler, kernel, and so on) of the operating system on
324 | which the executable runs, unless that component itself accompanies
325 | the executable.
326 |
327 | It may happen that this requirement contradicts the license
328 | restrictions of other proprietary libraries that do not normally
329 | accompany the operating system. Such a contradiction means you cannot
330 | use both them and the Library together in an executable that you
331 | distribute.
332 |
333 | 7. You may place library facilities that are a work based on the
334 | Library side-by-side in a single library together with other library
335 | facilities not covered by this License, and distribute such a combined
336 | library, provided that the separate distribution of the work based on
337 | the Library and of the other library facilities is otherwise
338 | permitted, and provided that you do these two things:
339 |
340 | a) Accompany the combined library with a copy of the same work
341 | based on the Library, uncombined with any other library
342 | facilities. This must be distributed under the terms of the
343 | Sections above.
344 |
345 | b) Give prominent notice with the combined library of the fact
346 | that part of it is a work based on the Library, and explaining
347 | where to find the accompanying uncombined form of the same work.
348 |
349 | 8. You may not copy, modify, sublicense, link with, or distribute
350 | the Library except as expressly provided under this License. Any
351 | attempt otherwise to copy, modify, sublicense, link with, or
352 | distribute the Library is void, and will automatically terminate your
353 | rights under this License. However, parties who have received copies,
354 | or rights, from you under this License will not have their licenses
355 | terminated so long as such parties remain in full compliance.
356 |
357 | 9. You are not required to accept this License, since you have not
358 | signed it. However, nothing else grants you permission to modify or
359 | distribute the Library or its derivative works. These actions are
360 | prohibited by law if you do not accept this License. Therefore, by
361 | modifying or distributing the Library (or any work based on the
362 | Library), you indicate your acceptance of this License to do so, and
363 | all its terms and conditions for copying, distributing or modifying
364 | the Library or works based on it.
365 |
366 | 10. Each time you redistribute the Library (or any work based on the
367 | Library), the recipient automatically receives a license from the
368 | original licensor to copy, distribute, link with or modify the Library
369 | subject to these terms and conditions. You may not impose any further
370 | restrictions on the recipients' exercise of the rights granted herein.
371 | You are not responsible for enforcing compliance by third parties with
372 | this License.
373 |
374 | 11. If, as a consequence of a court judgment or allegation of patent
375 | infringement or for any other reason (not limited to patent issues),
376 | conditions are imposed on you (whether by court order, agreement or
377 | otherwise) that contradict the conditions of this License, they do not
378 | excuse you from the conditions of this License. If you cannot
379 | distribute so as to satisfy simultaneously your obligations under this
380 | License and any other pertinent obligations, then as a consequence you
381 | may not distribute the Library at all. For example, if a patent
382 | license would not permit royalty-free redistribution of the Library by
383 | all those who receive copies directly or indirectly through you, then
384 | the only way you could satisfy both it and this License would be to
385 | refrain entirely from distribution of the Library.
386 |
387 | If any portion of this section is held invalid or unenforceable under any
388 | particular circumstance, the balance of the section is intended to apply,
389 | and the section as a whole is intended to apply in other circumstances.
390 |
391 | It is not the purpose of this section to induce you to infringe any
392 | patents or other property right claims or to contest validity of any
393 | such claims; this section has the sole purpose of protecting the
394 | integrity of the free software distribution system which is
395 | implemented by public license practices. Many people have made
396 | generous contributions to the wide range of software distributed
397 | through that system in reliance on consistent application of that
398 | system; it is up to the author/donor to decide if he or she is willing
399 | to distribute software through any other system and a licensee cannot
400 | impose that choice.
401 |
402 | This section is intended to make thoroughly clear what is believed to
403 | be a consequence of the rest of this License.
404 |
405 | 12. If the distribution and/or use of the Library is restricted in
406 | certain countries either by patents or by copyrighted interfaces, the
407 | original copyright holder who places the Library under this License may add
408 | an explicit geographical distribution limitation excluding those countries,
409 | so that distribution is permitted only in or among countries not thus
410 | excluded. In such case, this License incorporates the limitation as if
411 | written in the body of this License.
412 |
413 | 13. The Free Software Foundation may publish revised and/or new
414 | versions of the Lesser General Public License from time to time.
415 | Such new versions will be similar in spirit to the present version,
416 | but may differ in detail to address new problems or concerns.
417 |
418 | Each version is given a distinguishing version number. If the Library
419 | specifies a version number of this License which applies to it and
420 | "any later version", you have the option of following the terms and
421 | conditions either of that version or of any later version published by
422 | the Free Software Foundation. If the Library does not specify a
423 | license version number, you may choose any version ever published by
424 | the Free Software Foundation.
425 |
426 | 14. If you wish to incorporate parts of the Library into other free
427 | programs whose distribution conditions are incompatible with these,
428 | write to the author to ask for permission. For software which is
429 | copyrighted by the Free Software Foundation, write to the Free
430 | Software Foundation; we sometimes make exceptions for this. Our
431 | decision will be guided by the two goals of preserving the free status
432 | of all derivatives of our free software and of promoting the sharing
433 | and reuse of software generally.
434 |
435 | NO WARRANTY
436 |
437 | 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
438 | WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
439 | EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
440 | OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
441 | KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
442 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
443 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
444 | LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
445 | THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
446 |
447 | 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
448 | WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
449 | AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
450 | FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
451 | CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
452 | LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
453 | RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
454 | FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
455 | SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
456 | DAMAGES.
457 |
458 | END OF TERMS AND CONDITIONS
459 |
460 | How to Apply These Terms to Your New Libraries
461 |
462 | If you develop a new library, and you want it to be of the greatest
463 | possible use to the public, we recommend making it free software that
464 | everyone can redistribute and change. You can do so by permitting
465 | redistribution under these terms (or, alternatively, under the terms of the
466 | ordinary General Public License).
467 |
468 | To apply these terms, attach the following notices to the library. It is
469 | safest to attach them to the start of each source file to most effectively
470 | convey the exclusion of warranty; and each file should have at least the
471 | "copyright" line and a pointer to where the full notice is found.
472 |
473 |
474 | Copyright (C)
475 |
476 | This library is free software; you can redistribute it and/or
477 | modify it under the terms of the GNU Lesser General Public
478 | License as published by the Free Software Foundation; either
479 | version 2.1 of the License, or (at your option) any later version.
480 |
481 | This library is distributed in the hope that it will be useful,
482 | but WITHOUT ANY WARRANTY; without even the implied warranty of
483 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
484 | Lesser General Public License for more details.
485 |
486 | You should have received a copy of the GNU Lesser General Public
487 | License along with this library; if not, write to the Free Software
488 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
489 | USA
490 |
491 | Also add information on how to contact you by electronic and paper mail.
492 |
493 | You should also get your employer (if you work as a programmer) or your
494 | school, if any, to sign a "copyright disclaimer" for the library, if
495 | necessary. Here is a sample; alter the names:
496 |
497 | Yoyodyne, Inc., hereby disclaims all copyright interest in the
498 | library `Frob' (a library for tweaking knobs) written by James Random
499 | Hacker.
500 |
501 | , 1 April 1990
502 | Ty Coon, President of Vice
503 |
504 | That's all there is to it!
505 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # HTMLp
2 | Delphi Dom HTML Parser and Converter.
3 | > Fork from: https://sourceforge.net/projects/htmlp/
4 |
5 | ## Original Authors
6 | * Krzysztof Kochanski
7 | * [Sébastien Morin](https://github.com/smsisko)
8 |
9 | ## Contributors
10 | * [Roman Yankovsky](https://github.com/RomanYankovsky)
11 |
--------------------------------------------------------------------------------
/example/HTMLP.dpr:
--------------------------------------------------------------------------------
1 | program HTMLP;
2 |
3 | uses
4 | Forms,
5 | MainForm in 'MainForm.pas' {HTMLForm},
6 | HTMLp.DOMCore in '..\HTMLp.DOMCore.pas',
7 | HTMLp.Entities in '..\HTMLp.Entities.pas',
8 | HTMLp.Formatter in '..\HTMLp.Formatter.pas',
9 | HTMLp.Helper in '..\HTMLp.Helper.pas',
10 | HTMLp.HTMLParser in '..\HTMLp.HTMLParser.pas',
11 | HTMLp.HTMLReader in '..\HTMLp.HTMLReader.pas',
12 | HTMLp.HTMLTags in '..\HTMLp.HTMLTags.pas';
13 |
14 | {$R *.RES}
15 |
16 | begin
17 | Application.Initialize;
18 | Application.CreateForm(THTMLForm, HTMLForm);
19 | Application.Run;
20 | end.
21 |
--------------------------------------------------------------------------------
/example/HTMLP.dproj:
--------------------------------------------------------------------------------
1 |
2 |
3 | {711105A8-17EB-4F45-8F90-D0147537B5C7}
4 | HTMLP.dpr
5 | True
6 | Debug
7 | 1153
8 | Application
9 | VCL
10 | 18.8
11 | Win32
12 |
13 |
14 | true
15 |
16 |
17 | true
18 | Base
19 | true
20 |
21 |
22 | true
23 | Base
24 | true
25 |
26 |
27 | true
28 | Base
29 | true
30 |
31 |
32 | true
33 | Cfg_1
34 | true
35 | true
36 |
37 |
38 | true
39 | Base
40 | true
41 |
42 |
43 | true
44 | Cfg_2
45 | true
46 | true
47 |
48 |
49 | true
50 | Cfg_2
51 | true
52 | true
53 |
54 |
55 | false
56 | false
57 | false
58 | false
59 | false
60 | 00400000
61 | HTMLP
62 | Vcl;Vcl.Imaging;Vcl.Touch;Vcl.Samples;Vcl.Shell;System;Xml;Data;Datasnap;Web;Soap;Winapi;$(DCC_Namespace)
63 | 1049
64 | CompanyName=;FileDescription=;FileVersion=1.0.0.0;InternalName=;LegalCopyright=;LegalTrademarks=;OriginalFilename=;ProductName=;ProductVersion=1.0.0.0;Comments=;CFBundleName=
65 |
66 |
67 | System.Win;Data.Win;Datasnap.Win;Web.Win;Soap.Win;Xml.Win;Bde;$(DCC_Namespace)
68 | Debug
69 | true
70 | CompanyName=;FileDescription=$(MSBuildProjectName);FileVersion=1.0.0.0;InternalName=;LegalCopyright=;LegalTrademarks=;OriginalFilename=;ProductName=$(MSBuildProjectName);ProductVersion=1.0.0.0;Comments=;ProgramID=com.embarcadero.$(MSBuildProjectName)
71 | 1033
72 | $(BDS)\bin\default_app.manifest
73 | true
74 | $(BDS)\bin\Artwork\Windows\UWP\delphi_UwpDefault_44.png
75 | $(BDS)\bin\Artwork\Windows\UWP\delphi_UwpDefault_150.png
76 |
77 |
78 | $(BDS)\bin\Artwork\Windows\UWP\delphi_UwpDefault_44.png
79 | $(BDS)\bin\Artwork\Windows\UWP\delphi_UwpDefault_150.png
80 |
81 |
82 | RELEASE;$(DCC_Define)
83 | 0
84 | false
85 | 0
86 |
87 |
88 | true
89 | PerMonitorV2
90 |
91 |
92 | DEBUG;$(DCC_Define)
93 | false
94 | true
95 |
96 |
97 | Debug
98 |
99 |
100 | true
101 | PerMonitorV2
102 | CompanyName=;FileDescription=$(MSBuildProjectName);FileVersion=1.0.0.0;InternalName=;LegalCopyright=;LegalTrademarks=;OriginalFilename=;ProductName=$(MSBuildProjectName);ProductVersion=1.0.0.0;Comments=;ProgramID=com.embarcadero.$(MSBuildProjectName)
103 | 3
104 |
105 |
106 |
107 | MainSource
108 |
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 |
117 |
118 |
119 |
120 | Cfg_2
121 | Base
122 |
123 |
124 | Base
125 |
126 |
127 | Cfg_1
128 | Base
129 |
130 |
131 |
132 | Delphi.Personality.12
133 |
134 |
135 |
136 |
137 | HTMLP.dpr
138 |
139 |
140 |
141 | True
142 | True
143 | True
144 | False
145 |
146 |
147 |
148 |
149 | HTMLP.exe
150 | true
151 |
152 |
153 |
154 |
155 | 1
156 |
157 |
158 | Contents\MacOS
159 | 1
160 |
161 |
162 | 0
163 |
164 |
165 |
166 |
167 | classes
168 | 1
169 |
170 |
171 | classes
172 | 1
173 |
174 |
175 |
176 |
177 | res\xml
178 | 1
179 |
180 |
181 | res\xml
182 | 1
183 |
184 |
185 |
186 |
187 | library\lib\armeabi-v7a
188 | 1
189 |
190 |
191 |
192 |
193 | library\lib\armeabi
194 | 1
195 |
196 |
197 | library\lib\armeabi
198 | 1
199 |
200 |
201 |
202 |
203 | library\lib\armeabi-v7a
204 | 1
205 |
206 |
207 |
208 |
209 | library\lib\mips
210 | 1
211 |
212 |
213 | library\lib\mips
214 | 1
215 |
216 |
217 |
218 |
219 | library\lib\armeabi-v7a
220 | 1
221 |
222 |
223 | library\lib\arm64-v8a
224 | 1
225 |
226 |
227 |
228 |
229 | library\lib\armeabi-v7a
230 | 1
231 |
232 |
233 |
234 |
235 | res\drawable
236 | 1
237 |
238 |
239 | res\drawable
240 | 1
241 |
242 |
243 |
244 |
245 | res\values
246 | 1
247 |
248 |
249 | res\values
250 | 1
251 |
252 |
253 |
254 |
255 | res\values-v21
256 | 1
257 |
258 |
259 | res\values-v21
260 | 1
261 |
262 |
263 |
264 |
265 | res\values
266 | 1
267 |
268 |
269 | res\values
270 | 1
271 |
272 |
273 |
274 |
275 | res\drawable
276 | 1
277 |
278 |
279 | res\drawable
280 | 1
281 |
282 |
283 |
284 |
285 | res\drawable-xxhdpi
286 | 1
287 |
288 |
289 | res\drawable-xxhdpi
290 | 1
291 |
292 |
293 |
294 |
295 | res\drawable-ldpi
296 | 1
297 |
298 |
299 | res\drawable-ldpi
300 | 1
301 |
302 |
303 |
304 |
305 | res\drawable-mdpi
306 | 1
307 |
308 |
309 | res\drawable-mdpi
310 | 1
311 |
312 |
313 |
314 |
315 | res\drawable-hdpi
316 | 1
317 |
318 |
319 | res\drawable-hdpi
320 | 1
321 |
322 |
323 |
324 |
325 | res\drawable-xhdpi
326 | 1
327 |
328 |
329 | res\drawable-xhdpi
330 | 1
331 |
332 |
333 |
334 |
335 | res\drawable-mdpi
336 | 1
337 |
338 |
339 | res\drawable-mdpi
340 | 1
341 |
342 |
343 |
344 |
345 | res\drawable-hdpi
346 | 1
347 |
348 |
349 | res\drawable-hdpi
350 | 1
351 |
352 |
353 |
354 |
355 | res\drawable-xhdpi
356 | 1
357 |
358 |
359 | res\drawable-xhdpi
360 | 1
361 |
362 |
363 |
364 |
365 | res\drawable-xxhdpi
366 | 1
367 |
368 |
369 | res\drawable-xxhdpi
370 | 1
371 |
372 |
373 |
374 |
375 | res\drawable-xxxhdpi
376 | 1
377 |
378 |
379 | res\drawable-xxxhdpi
380 | 1
381 |
382 |
383 |
384 |
385 | res\drawable-small
386 | 1
387 |
388 |
389 | res\drawable-small
390 | 1
391 |
392 |
393 |
394 |
395 | res\drawable-normal
396 | 1
397 |
398 |
399 | res\drawable-normal
400 | 1
401 |
402 |
403 |
404 |
405 | res\drawable-large
406 | 1
407 |
408 |
409 | res\drawable-large
410 | 1
411 |
412 |
413 |
414 |
415 | res\drawable-xlarge
416 | 1
417 |
418 |
419 | res\drawable-xlarge
420 | 1
421 |
422 |
423 |
424 |
425 | res\values
426 | 1
427 |
428 |
429 | res\values
430 | 1
431 |
432 |
433 |
434 |
435 | 1
436 |
437 |
438 | Contents\MacOS
439 | 1
440 |
441 |
442 | 0
443 |
444 |
445 |
446 |
447 | Contents\MacOS
448 | 1
449 | .framework
450 |
451 |
452 | Contents\MacOS
453 | 1
454 | .framework
455 |
456 |
457 | 0
458 |
459 |
460 |
461 |
462 | 1
463 | .dylib
464 |
465 |
466 | 1
467 | .dylib
468 |
469 |
470 | 1
471 | .dylib
472 |
473 |
474 | Contents\MacOS
475 | 1
476 | .dylib
477 |
478 |
479 | Contents\MacOS
480 | 1
481 | .dylib
482 |
483 |
484 | 0
485 | .dll;.bpl
486 |
487 |
488 |
489 |
490 | 1
491 | .dylib
492 |
493 |
494 | 1
495 | .dylib
496 |
497 |
498 | 1
499 | .dylib
500 |
501 |
502 | Contents\MacOS
503 | 1
504 | .dylib
505 |
506 |
507 | Contents\MacOS
508 | 1
509 | .dylib
510 |
511 |
512 | 0
513 | .bpl
514 |
515 |
516 |
517 |
518 | 0
519 |
520 |
521 | 0
522 |
523 |
524 | 0
525 |
526 |
527 | 0
528 |
529 |
530 | 0
531 |
532 |
533 | Contents\Resources\StartUp\
534 | 0
535 |
536 |
537 | Contents\Resources\StartUp\
538 | 0
539 |
540 |
541 | 0
542 |
543 |
544 |
545 |
546 | 1
547 |
548 |
549 | 1
550 |
551 |
552 | 1
553 |
554 |
555 |
556 |
557 | 1
558 |
559 |
560 | 1
561 |
562 |
563 | 1
564 |
565 |
566 |
567 |
568 | 1
569 |
570 |
571 | 1
572 |
573 |
574 | 1
575 |
576 |
577 |
578 |
579 | 1
580 |
581 |
582 | 1
583 |
584 |
585 | 1
586 |
587 |
588 |
589 |
590 | 1
591 |
592 |
593 | 1
594 |
595 |
596 | 1
597 |
598 |
599 |
600 |
601 | 1
602 |
603 |
604 | 1
605 |
606 |
607 | 1
608 |
609 |
610 |
611 |
612 | 1
613 |
614 |
615 | 1
616 |
617 |
618 | 1
619 |
620 |
621 |
622 |
623 | 1
624 |
625 |
626 | 1
627 |
628 |
629 | 1
630 |
631 |
632 |
633 |
634 | 1
635 |
636 |
637 | 1
638 |
639 |
640 | 1
641 |
642 |
643 |
644 |
645 | 1
646 |
647 |
648 | 1
649 |
650 |
651 | 1
652 |
653 |
654 |
655 |
656 | 1
657 |
658 |
659 | 1
660 |
661 |
662 | 1
663 |
664 |
665 |
666 |
667 | 1
668 |
669 |
670 | 1
671 |
672 |
673 | 1
674 |
675 |
676 |
677 |
678 | 1
679 |
680 |
681 | 1
682 |
683 |
684 | 1
685 |
686 |
687 |
688 |
689 | 1
690 |
691 |
692 | 1
693 |
694 |
695 | 1
696 |
697 |
698 |
699 |
700 | 1
701 |
702 |
703 | 1
704 |
705 |
706 | 1
707 |
708 |
709 |
710 |
711 | 1
712 |
713 |
714 | 1
715 |
716 |
717 | 1
718 |
719 |
720 |
721 |
722 | 1
723 |
724 |
725 | 1
726 |
727 |
728 | 1
729 |
730 |
731 |
732 |
733 | 1
734 |
735 |
736 | 1
737 |
738 |
739 | 1
740 |
741 |
742 |
743 |
744 | 1
745 |
746 |
747 | 1
748 |
749 |
750 | 1
751 |
752 |
753 |
754 |
755 | 1
756 |
757 |
758 | 1
759 |
760 |
761 | 1
762 |
763 |
764 |
765 |
766 | 1
767 |
768 |
769 | 1
770 |
771 |
772 | 1
773 |
774 |
775 |
776 |
777 | 1
778 |
779 |
780 | 1
781 |
782 |
783 | 1
784 |
785 |
786 |
787 |
788 | 1
789 |
790 |
791 | 1
792 |
793 |
794 | 1
795 |
796 |
797 |
798 |
799 | 1
800 |
801 |
802 | 1
803 |
804 |
805 | 1
806 |
807 |
808 |
809 |
810 | 1
811 |
812 |
813 | 1
814 |
815 |
816 |
817 |
818 | ..\$(PROJECTNAME).app.dSYM\Contents\Resources\DWARF
819 | 1
820 |
821 |
822 | ..\$(PROJECTNAME).app.dSYM\Contents\Resources\DWARF
823 | 1
824 |
825 |
826 |
827 |
828 | 1
829 |
830 |
831 | 1
832 |
833 |
834 |
835 |
836 | ..\
837 | 1
838 |
839 |
840 | ..\
841 | 1
842 |
843 |
844 |
845 |
846 | 1
847 |
848 |
849 | 1
850 |
851 |
852 | 1
853 |
854 |
855 |
856 |
857 | 1
858 |
859 |
860 | 1
861 |
862 |
863 | 1
864 |
865 |
866 |
867 |
868 | ..\$(PROJECTNAME).app.dSYM\Contents\Resources\DWARF
869 | 1
870 |
871 |
872 |
873 |
874 | ..\
875 | 1
876 |
877 |
878 | ..\
879 | 1
880 |
881 |
882 |
883 |
884 | Contents
885 | 1
886 |
887 |
888 | Contents
889 | 1
890 |
891 |
892 |
893 |
894 | Contents\Resources
895 | 1
896 |
897 |
898 | Contents\Resources
899 | 1
900 |
901 |
902 |
903 |
904 | library\lib\armeabi-v7a
905 | 1
906 |
907 |
908 | library\lib\arm64-v8a
909 | 1
910 |
911 |
912 | 1
913 |
914 |
915 | 1
916 |
917 |
918 | 1
919 |
920 |
921 | 1
922 |
923 |
924 | Contents\MacOS
925 | 1
926 |
927 |
928 | Contents\MacOS
929 | 1
930 |
931 |
932 | 0
933 |
934 |
935 |
936 |
937 | library\lib\armeabi-v7a
938 | 1
939 |
940 |
941 |
942 |
943 | 1
944 |
945 |
946 | 1
947 |
948 |
949 |
950 |
951 | Assets
952 | 1
953 |
954 |
955 | Assets
956 | 1
957 |
958 |
959 |
960 |
961 | Assets
962 | 1
963 |
964 |
965 | Assets
966 | 1
967 |
968 |
969 |
970 |
971 |
972 |
973 |
974 |
975 |
976 |
977 |
978 |
979 |
980 |
981 | 12
982 |
983 |
984 |
985 |
986 |
987 |
--------------------------------------------------------------------------------
/example/MainForm.dfm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ange007/HTMLp/706333a1b28b0b62ba54bb3eb367bfbc78b33e8f/example/MainForm.dfm
--------------------------------------------------------------------------------
/example/MainForm.pas:
--------------------------------------------------------------------------------
1 | unit MainForm;
2 |
3 | interface
4 |
5 | uses
6 | Windows, Messages, SysUtils, Classes, Graphics, Controls, Forms, Dialogs,
7 | StdCtrls, ExtCtrls, IdHTTP, StrUtils,
8 |
9 | HTMLp.HTMLParser, HTMLp.Helper, Buttons;
10 |
11 | type
12 | THTMLForm = class(TForm)
13 | TopPanel: TPanel;
14 | FileNameEdit: TEdit;
15 | BrowseButton: TButton;
16 | ToTextButton: TButton;
17 | Memo: TMemo;
18 | OpenDialog: TOpenDialog;
19 | PanelXPath: TPanel;
20 | UseXPathButton: TSpeedButton;
21 | XPathEdit: TEdit;
22 | MultiThreadTest: TButton;
23 | procedure BrowseButtonClick(Sender: TObject);
24 | procedure ToTextButtonClick(Sender: TObject);
25 | procedure UseXPathButtonClick(Sender: TObject);
26 | procedure MultiThreadTestClick(Sender: TObject);
27 | public
28 | private
29 | HTMLParser: THTMLParser;
30 | end;
31 |
32 | function GetHTML(const fileName: string): string;
33 |
34 | var
35 | HTMLForm: THTMLForm;
36 |
37 | implementation
38 |
39 | {$R *.DFM}
40 |
41 | uses
42 | HTMLp.DomCore, HTMLp.Formatter;
43 |
44 | function GetHTML(const fileName: string): string;
45 | var
46 | F: TStringStream;
47 | begin
48 | if Pos('http', fileName) = 1 then
49 | begin
50 | with TIdHTTP.Create(nil) do
51 | begin
52 | HandleRedirects := True;
53 |
54 | try Result := Get(fileName);
55 | except
56 | on e: Exception do ShowMessage(e.Message);
57 | end;
58 |
59 | Free;
60 | end;
61 | end
62 | else
63 | begin
64 | F := TStringStream.Create({'', TEncoding.UTF8});
65 | try
66 | F.LoadFromFile(fileName);
67 | Result := F.DataString;
68 | finally
69 | F.Free
70 | end;
71 | end;
72 | end;
73 |
74 | procedure THTMLForm.MultiThreadTestClick(Sender: TObject);
75 | var
76 | i: Integer;
77 | thread: TThread;
78 | begin
79 | Memo.Clear;
80 |
81 | for i := 0 to 50 do
82 | begin
83 | thread := TThread.CreateAnonymousThread(procedure( )
84 | var
85 | HTML, body: string;
86 | HTMLDoc: TDocument;
87 | begin
88 | with THTMLParser.Create do
89 | begin
90 | HTML := GetHTML(FileNameEdit.Text);
91 | HTMLDoc := parseString(HTML);
92 | body := HTMLDoc.GetInnerHTML;
93 |
94 | FreeAndNil(HTMLDoc);
95 | Free;
96 | end;
97 |
98 | TThread.Synchronize(TThread.Current, procedure begin HTMLForm.Memo.Lines.Add(IfThen(body <> '', 'GOOD', 'BAD')); end);
99 | end);
100 |
101 | {}
102 | thread.Start;
103 | end;
104 | end;
105 |
106 | procedure THTMLForm.BrowseButtonClick(Sender: TObject);
107 | begin
108 | if OpenDialog.Execute then FileNameEdit.Text := OpenDialog.FileName
109 | end;
110 |
111 | procedure THTMLForm.UseXPathButtonClick(Sender: TObject);
112 | var
113 | HTML: string;
114 | begin
115 | HTML := GetHTML(FileNameEdit.Text);
116 | Memo.Clear;
117 |
118 | ParseHTML(HTML).Find(XPathEdit.Text).Map( procedure(AIndex: Integer; AEl: TElement)
119 | begin
120 | Memo.Lines.Add(AEl.Value);
121 | end );
122 |
123 | Memo.SelStart := 0;
124 | Memo.SelLength := 0;
125 | end;
126 |
127 | procedure THTMLForm.ToTextButtonClick(Sender: TObject);
128 | var
129 | HTML: string;
130 | HTMLDoc: TDocument;
131 | Formatter: TBaseFormatter;
132 | begin
133 | HTML := GetHTML(FileNameEdit.Text);
134 | Memo.Clear;
135 |
136 | HTMLParser := THTMLParser.Create;
137 | try
138 | HTMLDoc := HTMLParser.parseString(HTML);
139 | finally
140 | FreeAndNil(HTMLParser);
141 | end;
142 |
143 | Formatter := TTextFormatter.Create;
144 | try
145 | Memo.Lines.Text := Formatter.getText(HTMLDoc);
146 | finally
147 | FreeAndNil(Formatter);
148 | end;
149 |
150 | FreeAndNil(HTMLDoc);
151 |
152 | Memo.SelStart := 0;
153 | Memo.SelLength := 0;
154 | end;
155 |
156 | end.
157 |
--------------------------------------------------------------------------------
/example/index.html:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ange007/HTMLp/706333a1b28b0b62ba54bb3eb367bfbc78b33e8f/example/index.html
--------------------------------------------------------------------------------