├── .gitignore
├── HtmlParserEx.pas
├── LICENSE
├── README.md
└── Tests
├── HTMLParserEx.Tests.Main.pas
├── HTMLParserEx.Tests.dpr
└── HTMLParserEx.Tests.dproj
/.gitignore:
--------------------------------------------------------------------------------
1 | # Uncomment these types if you want even more clean repository. But be careful.
2 | # It can make harm to an existing project source. Read explanations below.
3 | #
4 | # Type library file (binary). In old Delphi versions it should be stored.
5 | # Since Delphi 2009 it is produced from .ridl file and can safely be ignored.
6 | #*.tlb
7 | #
8 | # Diagram Portfolio file. Used by the diagram editor up to Delphi 7.
9 | # Uncomment this if you are not using diagrams or use newer Delphi version.
10 | #*.ddp
11 | #
12 | # Visual LiveBindings file. Added in Delphi XE2.
13 | # Uncomment this if you are not using LiveBindings Designer.
14 | #*.vlb
15 | #
16 | # Deployment Manager configuration file for your project. Added in Delphi XE2.
17 | # Uncomment this if it is not mobile development and you do not use remote debug feature.
18 | #*.deployproj
19 |
20 | # Delphi compiler-generated binaries (safe to delete)
21 | *.a
22 | *.apk
23 | *.bpi
24 | *.bpl
25 | *.dcp
26 | *.dcu
27 | *.dll
28 | *.drc
29 | *.dres
30 | *.exe
31 | *.lib
32 | *.map
33 | *.o
34 | *.obj
35 | *.ocx
36 | *.res
37 | *.otares
38 | *.rsm
39 | *.so
40 | *.tds
41 |
42 | # Delphi autogenerated files (duplicated info)
43 | *.cfg
44 | *.hpp
45 | *Resource.rc
46 |
47 | # Delphi local files (user-specific info)
48 | *.cbk
49 | *.dsk
50 | *.identcache
51 | *.local
52 | *.projdata
53 | *.tvsconfig
54 | *.user
55 |
56 | # Delphi history and backups
57 | __history/
58 | __recovery/
59 | *.~*
60 |
61 | # Castalia statistics file (since XE7 Castalia is distributed with Delphi)
62 | *.stat
63 |
64 | # Platform output folders
65 | Win32
66 | Win64
67 | DCU
68 |
69 | # Explorer image caches
70 | [Tt]humbs.db
71 |
72 | # Other common file extensions
73 | *.bak
74 | *.temp
75 | *.tmp
76 | *.rc
77 |
--------------------------------------------------------------------------------
/HtmlParserEx.pas:
--------------------------------------------------------------------------------
1 | unit HtmlParserEx;
2 | {$DEFINE UseXPath }
3 | {$IF RTLVersion < 24.0}
4 | {$MESSAGE ERROR 'Only XE3 and later versions are supported'}
5 | {$ENDIF}
6 |
7 | interface
8 |
9 | uses
10 | System.SysUtils,
11 | System.Classes,
12 | System.Generics.Collections
13 | {$IFDEF UseXPath}
14 | ,System.RegularExpressionsCore
15 | {$ENDIF};
16 |
17 | {$IF (defined(IOS) and defined(CPUARM)) or defined(ANDROID)}
18 | {$DEFINE MOBILE_DEV}
19 | {$ENDIF}
20 | const
21 | // By design each IHtmlElement has a TagName property, even it's a 'Text' element.
22 | // so cSpecialTagName_Text represents the fake tag name of a Text element
23 | cSpecialTagName_Text = '#TEXT';
24 | LowStrIndex = low(string); // Mobile platform=0, PC platform=1
25 |
26 | type
27 | IHtmlElement = interface;
28 | IHtmlElementList = interface;
29 | TElementEachEvent = reference to procedure(AIndex:Integer; AEl:IHtmlElement);
30 |
31 |
32 | IHtmlElement = interface
33 | ['{8C75239C-8CFA-499F-B115-7CEBEDFB421B}']
34 | function GetParent:IHtmlElement; stdcall;
35 | function GetTagName:String; stdcall;
36 | procedure SetTagName(Value:String); stdcall;
37 | function GetContent:String; stdcall;
38 | function GetOrignal:String; stdcall;
39 | function GetChildrenCount:Integer; stdcall;
40 | function GetChildren(Index:Integer):IHtmlElement; stdcall;
41 | function GetCloseTag:IHtmlElement; stdcall;
42 | function GetInnerHtml():String; stdcall;
43 | function GetOuterHtml():String; stdcall;
44 | function GetInnerText():String; stdcall;
45 | procedure SetInnerText(Value:String); stdcall;
46 | function GetAttributes(Key:String):String; stdcall;
47 | procedure SetAttributes(Key:String; Value:String); stdcall;
48 | procedure RemoveAttr(AAttrName:string); stdcall;
49 | function GetSourceLineNum():Integer; stdcall;
50 | function GetSourceColNum():Integer; stdcall;
51 | // Add and remove nodes
52 | function RemoveChild(ANode:IHtmlElement):Integer; stdcall;
53 | procedure Remove; stdcall;
54 | function AppedChild(const ATag:string):IHtmlElement; stdcall;
55 | // Does the property exist
56 | function HasAttribute(AttributeName:String):Boolean; stdcall;
57 | { Find Element using CSS selector syntax, "pseudo-class" is not supported
58 | http://www.w3.org/TR/CSS2/selector.html
59 | }
60 | function SimpleCSSSelector(const selector:String):IHtmlElementList; stdcall;
61 | function Find(const selector:String):IHtmlElementList; stdcall;
62 | {$IFDEF UseXPath}
63 | function FindX(const AXPath:String):IHtmlElementList; stdcall;
64 | {$ENDIF}
65 | // enum property
66 | function EnumAttributeNames(Index:Integer):String; stdcall;
67 | property TagName:String read GetTagName write SetTagName;
68 | property ChildrenCount:Integer read GetChildrenCount;
69 | property Children[index:Integer]:IHtmlElement read GetChildren; default;
70 | property CloseTag:IHtmlElement read GetCloseTag;
71 | property Content:String read GetContent;
72 | property Orignal:String read GetOrignal;
73 | property Parent:IHtmlElement read GetParent;
74 | // Get the position of an element in the source code
75 | property SourceLineNum:Integer read GetSourceLineNum;
76 | property SourceColNum:Integer read GetSourceColNum;
77 | //
78 | property InnerHtml:String read GetInnerHtml;
79 | property OuterHtml:String read GetOuterHtml;
80 | property InnerText:String read GetInnerText write SetInnerText;
81 | property Text:String read GetInnerText write SetInnerText;
82 | property Attributes[Key:String]:String read GetAttributes write SetAttributes;
83 | // ying32 does not change the original, just simplifies the use
84 | property Attrs[Key:String]:String read GetAttributes write SetAttributes;
85 | end;
86 |
87 |
88 | THtmlListEnumerator = class
89 | private
90 | FIndex:Integer;
91 | FList:IHtmlElementList;
92 | public
93 | constructor Create(AList:IHtmlElementList);
94 | function GetCurrent:IHtmlElement; inline;
95 | function MoveNext:Boolean;
96 | property Current:IHtmlElement read GetCurrent;
97 | end;
98 |
99 |
100 | IHtmlElementList = interface
101 | ['{8E1380C6-4263-4BF6-8D10-091A86D8E7D9}']
102 | function GetCount:Integer; stdcall;
103 | function GetItems(Index:Integer):IHtmlElement; stdcall;
104 | procedure RemoveAll; stdcall;
105 | procedure Remove(ANode:IHtmlElement); stdcall;
106 | procedure Each(f:TElementEachEvent); stdcall;
107 | function GetText:String; stdcall;
108 | function GetEnumerator:THtmlListEnumerator;
109 | property Text:String read GetText;
110 | property Count:Integer read GetCount;
111 | property Items[index:Integer]:IHtmlElement read GetItems; default;
112 | end;
113 |
114 |
115 | function ParserHTML(const Source:String):IHtmlElement; stdcall;
116 | function DecodeHtmlEntities(S:string):string; forward;
117 |
118 | implementation
119 |
120 | uses
121 | StrUtils;
122 |
123 | type
124 | TStringDictionary = TDictionary;
125 | TPropDictionary = TDictionary;
126 | TStringDynArray = TArray;
127 | const
128 | WhiteSpace = [' ', #13, #10, #9];
129 | // CSS Attribute Compare Operator
130 | OperatorChar = ['=', '!', '*', '~', '|', '^', '$'];
131 | MaxListSize = Maxint div 16;
132 | // TagProperty
133 | tpBlock = $01;
134 | tpInline = $02;
135 | tpEmpty = $04;
136 | tpFormatAsInline = $08;
137 | tpPreserveWhitespace = $10;
138 | tpInlineOrEmpty = tpInline or tpEmpty;
139 |
140 | type
141 | TAttrOperator = (aoExist, aoEqual, aoNotEqual, aoIncludeWord, aoBeginWord, aoBegin, aoEnd, aoContain);
142 | PAttrSelectorItem = ^TAttrSelectorItem;
143 |
144 |
145 | TAttrSelectorItem = record
146 | Key:string;
147 | AttrOperator:TAttrOperator;
148 | Value:string;
149 | end;
150 |
151 |
152 | TSelectorItemRelation = (sirNONE, sirDescendant, sirChildren, sirYoungerBrother, sirAllYoungerBrother);
153 | PCSSSelectorItem = ^TCSSSelectorItem;
154 |
155 |
156 | TCSSSelectorItem = record
157 | Relation:TSelectorItemRelation;
158 | szTag:string;
159 | Attributes: array of TAttrSelectorItem;
160 | end;
161 |
162 |
163 | TCSSSelectorItems = array of TCSSSelectorItem;
164 | PCSSSelectorItems = ^TCSSSelectorItems;
165 | TCSSSelectorItemGroup = array of TCSSSelectorItems;
166 |
167 |
168 | //
169 | TSourceContext = record
170 | private
171 | function GetCharOfCurrent(Index:Integer):Char; inline;
172 | function PassedEndOfSourceCode:Boolean; inline;
173 | public
174 | SourceCode:string;
175 | CharIndex:Integer;
176 | LineNum:Integer;
177 | ColNum:Integer;
178 | CurrentChar:Char;
179 | {$IFDEF DEBUG}
180 | currentCode:PChar;
181 | {$ENDIF}
182 | // Jump to the next char and return true if not exceeding the end of SourceCode
183 | function JumpToNextChar:Boolean; overload; inline;
184 | // Jump multiple characters, and return the jumped steps
185 | function JumpToNextChar(Step:Integer):Integer; overload; inline;
186 | procedure setCode(const ACode:string); inline;
187 | function ReadStr(UntilChars:TSysCharSet):string; inline;
188 | function PeekStr(Index:Integer):string; overload; inline;
189 | function PeekStr():string; overload; inline;
190 | function subStr(Index, Count:Integer):string; overload; inline;
191 | function subStr(Count:Integer):string; overload; inline;
192 | procedure SkipBlank(); inline;
193 | property charOfCurrent[index:Integer]:Char read GetCharOfCurrent;
194 | end;
195 |
196 |
197 | TAttributeItem = record
198 | Key, Value:string;
199 | end;
200 |
201 |
202 | TAttributeDynArray = TArray;
203 | TIHtmlElementList = class;
204 | THtmlElement = class;
205 | THtmlElementList = TList;
206 |
207 |
208 | THtmlElement = class(TInterfacedObject, IHtmlElement)
209 | private
210 | function GetChildrens:IHtmlElementList;
211 | protected
212 | // ying32
213 | function GetParent:IHtmlElement; stdcall;
214 | function GetTagName:String; stdcall;
215 | procedure SetTagName(Value:String); stdcall;
216 | function GetContent:String; stdcall;
217 | function GetOrignal:String; stdcall;
218 | function GetChildrenCount:Integer; stdcall;
219 | function GetChildren(Index:Integer):IHtmlElement; stdcall;
220 | function GetCloseTag:IHtmlElement; stdcall;
221 | function GetInnerHtml():String; stdcall;
222 | function GetOuterHtml():String; stdcall;
223 | function GetInnerText():String; stdcall;
224 | procedure SetInnerText(Value:String); stdcall;
225 | function GetAttributes(Key:String):String; stdcall;
226 | procedure SetAttributes(Key:String; Value:String); stdcall;
227 | procedure RemoveAttr(AAttrName:string); stdcall;
228 | function GetSourceLineNum():Integer; stdcall;
229 | function GetSourceColNum():Integer; stdcall;
230 | // ying32Added
231 | function RemoveChild(ANode:IHtmlElement):Integer; stdcall;
232 | procedure Remove; stdcall;
233 | function AppedChild(const ATag:string):IHtmlElement; stdcall;
234 |
235 | // Does the property exist
236 | function HasAttribute(AttributeName:String):Boolean; stdcall;
237 | { Find Element with CSS selector syntax, does not support "pseudo-class"
238 | http://www.w3.org/TR/CSS2/selector.html
239 | }
240 | function SimpleCSSSelector(const selector:String):IHtmlElementList; stdcall;
241 | function Find(const selector:String):IHtmlElementList; stdcall;
242 | {$IFDEF UseXPath}
243 | function FindX(const AXPath:String):IHtmlElementList; stdcall;
244 | {$ENDIF}
245 | // enum property
246 | function EnumAttributeNames(Index:Integer):String; stdcall;
247 | property TagName:String read GetTagName write SetTagName;
248 | property ChildrenCount:Integer read GetChildrenCount;
249 | property Children[index:Integer]:IHtmlElement read GetChildren; default;
250 | property CloseTag:IHtmlElement read GetCloseTag;
251 | property Content:String read GetContent;
252 | property Orignal:String read GetOrignal;
253 | property Parent:IHtmlElement read GetParent;
254 | // Get the position of an element in the source code
255 | property SourceLineNum:Integer read GetSourceLineNum;
256 | property SourceColNum:Integer read GetSourceColNum;
257 | //
258 | property InnerHtml:String read GetInnerHtml;
259 | property OuterHtml:String read GetOuterHtml;
260 | property InnerText:String read GetInnerText;
261 | property Attributes[Key:String]:String read GetAttributes write SetAttributes;
262 | property Childrens:IHtmlElementList read GetChildrens;
263 | private
264 | FClosed:Boolean;
265 | //
266 | FOwner:THtmlElement;
267 | FCloseTag:IHtmlElement;
268 | FTagName:string;
269 | FIsCloseTag:Boolean;
270 | FContent:string;
271 | FOrignal:string;
272 | FSourceLine:Integer;
273 | FSourceCol:Integer;
274 | //
275 | FAttributes:TStringDictionary;
276 | FChildren:TIHtmlElementList;
277 | procedure _GetHtml(IncludeSelf:Boolean; Sb:TStringBuilder);
278 | procedure _GetText(IncludeSelf:Boolean; Sb:TStringBuilder);
279 | procedure _SimpleCSSSelector(const ItemGroup:TCSSSelectorItemGroup; r:TIHtmlElementList);
280 | procedure _Select(Item:PCSSSelectorItem; Count:Integer; r:TIHtmlElementList; OnlyTopLevel:Boolean = false);
281 | public
282 | constructor Create(AOwner:THtmlElement; AText:string; ALine, ACol:Integer);
283 | destructor Destroy; override;
284 | end;
285 |
286 |
287 | TIHtmlElementList = class(TInterfacedObject, IHtmlElementList)
288 | private
289 | // IHtmlElementList
290 | function GetItems(Index:Integer):IHtmlElement; stdcall;
291 | function GetCount:Integer; stdcall;
292 | protected
293 | FList:TList;
294 | procedure SetItems(Index:Integer; const Value:IHtmlElement); inline;
295 | function Add(Value:IHtmlElement):Integer; inline;
296 | procedure Delete(Index:Integer); inline;
297 | procedure Clear; inline;
298 | // ying32Added
299 | procedure RemoveAll; stdcall;
300 | procedure Remove(ANode:IHtmlElement); stdcall;
301 | procedure Each(f:TElementEachEvent); stdcall;
302 | function GetText:String; stdcall;
303 | public
304 | constructor Create;
305 | destructor Destroy; override;
306 | function GetEnumerator:THtmlListEnumerator;
307 | function IndexOf(Item:IHtmlElement):Integer;
308 | // IHtmlElementList
309 | property Items[index:Integer]:IHtmlElement read GetItems write SetItems; default;
310 | property Count:Integer read GetCount;
311 | end;
312 |
313 |
314 | function SplitStr(ACharSet:TSysCharSet; AStr:string):TStringDynArray;
315 | var
316 | L, I:Integer;
317 | S:string;
318 | StrChar:Char;
319 | begin
320 | Result := nil;
321 | if Length(AStr) <= 0 then
322 | Exit;
323 | I := low(AStr);
324 | L := low(AStr);
325 | StrChar := #0;
326 | while I <= high(AStr) do
327 | begin
328 | if CharInSet(AStr[I], ['''', '"']) then
329 | if StrChar = #0 then
330 | StrChar := AStr[I]
331 | else if StrChar = AStr[I] then
332 | StrChar := #0;
333 | // Not in the string, the delimiter takes effect
334 | if StrChar = #0 then
335 | if CharInSet(AStr[I], ACharSet) then
336 | begin
337 | if I > L then
338 | begin
339 | S := Copy(AStr, L{$IF (LowStrIndex = 0)} + 1{$ENDIF}, I - L);
340 | SetLength(Result, Length(Result) + 1);
341 | Result[Length(Result) - 1] := S;
342 | end;
343 | L := I + 1;
344 | end;
345 | Inc(I);
346 | end;
347 | if (I > L) then
348 | begin
349 | S := Copy(AStr, L{$IF (LowStrIndex = 0)} + 1{$ENDIF}, I - L);
350 | SetLength(Result, Length(Result) + 1);
351 | Result[Length(Result) - 1] := S;
352 | end;
353 | end;
354 |
355 |
356 | function StrRight(const Value:string; Count:Integer):string;
357 | var
358 | start:Integer;
359 | begin
360 | start := Length(Value) - Count + 1;
361 | if start <= 0 then
362 | Result := Value
363 | else
364 | Result := Copy(Value, start, Count);
365 | end;
366 |
367 |
368 | function StrLeft(const Value:string; Count:Integer):string;
369 | begin
370 | Result := Copy(Value, LowStrIndex, Count);
371 | end;
372 |
373 |
374 | // ComapreAttr
375 | function _aoExist(const Item:TAttrSelectorItem; E:THtmlElement):Boolean;
376 | begin
377 | Result := E.FAttributes.ContainsKey(Item.Key);
378 | end;
379 |
380 |
381 | function _aoEqual(const Item:TAttrSelectorItem; E:THtmlElement):Boolean;
382 | begin
383 | Result := E.FAttributes.ContainsKey(Item.Key) and (E.FAttributes[Item.Key] = Item.Value);
384 | end;
385 |
386 |
387 | function _aoNotEqual(const Item:TAttrSelectorItem; E:THtmlElement):Boolean;
388 | begin
389 | Result := E.FAttributes.ContainsKey(Item.Key) and (E.FAttributes[Item.Key] <> Item.Value);
390 | end;
391 |
392 |
393 | function _aoIncludeWord(const Item:TAttrSelectorItem; E:THtmlElement):Boolean;
394 | var
395 | S:TStringDynArray;
396 | I:Integer;
397 | begin
398 | Result := false;
399 | if not E.FAttributes.ContainsKey(Item.Key) then
400 | Exit;
401 | Result := True;
402 | S := SplitStr(WhiteSpace, E.FAttributes[Item.Key]);
403 | for I := low(S) to high(S) do
404 | if S[I] = Item.Value then
405 | Exit;
406 | Result := false;
407 | end;
408 |
409 |
410 | function _aoBeginWord(const Item:TAttrSelectorItem; E:THtmlElement):Boolean;
411 | var
412 | S:TStringDynArray;
413 | begin
414 | Result := false;
415 | if not E.FAttributes.ContainsKey(Item.Key) then
416 | Exit;
417 | S := SplitStr((WhiteSpace + ['_', '-']), E.FAttributes[Item.Key]);
418 | Result := (Length(S) > 0) and (S[0] = Item.Value);
419 | end;
420 |
421 |
422 | function _aoBegin(const Item:TAttrSelectorItem; E:THtmlElement):Boolean;
423 | var
424 | attr, Value:string;
425 | begin
426 | Result := false;
427 | if not E.FAttributes.ContainsKey(Item.Key) then
428 | Exit;
429 | attr := E.FAttributes[Item.Key];
430 | Value := Item.Value;
431 | Result := (Length(attr) > Length(Value)) and (StrLeft(attr, Length(Value)) = Value);
432 | end;
433 |
434 |
435 | function _aoEnd(const Item:TAttrSelectorItem; E:THtmlElement):Boolean;
436 | var
437 | attr, Value:string;
438 | begin
439 | Result := false;
440 | if not E.FAttributes.ContainsKey(Item.Key) then
441 | Exit;
442 | attr := E.FAttributes[Item.Key];
443 | Value := Item.Value;
444 | Result := (Length(attr) > Length(Value)) and (StrRight(attr, Length(Value)) = Value);
445 | end;
446 |
447 |
448 | function _aoContain(const Item:TAttrSelectorItem; E:THtmlElement):Boolean;
449 | begin
450 | Result := false;
451 | if not E.FAttributes.ContainsKey(Item.Key) then
452 | Exit;
453 | Result := Pos(Item.Value, E.FAttributes[Item.Key]) > 0;
454 | end;
455 |
456 | type
457 | TFNCompareAttr = function(const Item:TAttrSelectorItem; E:THtmlElement):Boolean;
458 | const
459 | AttrCompareFuns: array [TAttrOperator] of TFNCompareAttr = (_aoExist, _aoEqual, _aoNotEqual, _aoIncludeWord, _aoBeginWord, _aoBegin, _aoEnd, _aoContain);
460 | function GetTagProperty(const TagName:string):WORD; forward;
461 |
462 |
463 | procedure DoError(const Msg:string);
464 | begin
465 | raise Exception.Create(Msg);
466 | end;
467 |
468 |
469 | procedure _ParserAttrs(var sc:TSourceContext; var Attrs:TAttributeDynArray);
470 | var
471 | Item:TAttributeItem;
472 | begin
473 | SetLength(Attrs, 0);
474 | while True do
475 | begin
476 | sc.SkipBlank();
477 | if sc.CurrentChar = #0 then
478 | Break;
479 | Item.Key := sc.ReadStr((WhiteSpace + [#0, '=']));
480 | Item.Value := '';
481 | sc.SkipBlank;
482 | if sc.CurrentChar = '=' then
483 | begin
484 | sc.JumpToNextChar;
485 | sc.SkipBlank;
486 | Item.Value := sc.ReadStr((WhiteSpace + [#0]));
487 | end;
488 | SetLength(Attrs, Length(Attrs) + 1);
489 | Attrs[Length(Attrs) - 1] := Item;
490 | end;
491 | end;
492 |
493 |
494 | procedure _ParserNodeItem(S:string; var ATagName:string; var Attrs:TAttributeDynArray);
495 | var
496 | sc:TSourceContext;
497 | begin
498 | sc.setCode(S);
499 | sc.SkipBlank;
500 | ATagName := UpperCase(sc.ReadStr((WhiteSpace + [#0, '/', '>'])));
501 | _ParserAttrs(sc, Attrs);
502 | end;
503 |
504 |
505 | function CreateTextElement(AOwner:THtmlElement; AText:string; ALine, ACol:Integer):THtmlElement;
506 | begin
507 | Result := THtmlElement.Create(AOwner, AText, ALine, ACol);
508 | with Result do
509 | begin
510 | // Edwin: 2019-07-08: html entities in the text element shouldn't be decoded?
511 | // FContent := DecodeHtmlEntities(AText);
512 | FContent := AText;
513 | FTagName := cSpecialTagName_Text;
514 | FClosed := True;
515 | end;
516 | end;
517 |
518 |
519 | function CreateScriptElement(AOwner:THtmlElement; AText:string; ALine, ACol:Integer):THtmlElement;
520 | begin
521 | Result := THtmlElement.Create(AOwner, AText, ALine, ACol);
522 | with Result do
523 | begin
524 | // Edwin: 2019-07-09: html entities in scripts shouldn't be decoded?
525 | // FContent := DecodeHtmlEntities(AText);
526 | FContent := AText;
527 | FTagName := '#SCRIPT';
528 | FClosed := True;
529 | end;
530 | end;
531 |
532 |
533 | function CreateStyleElement(AOwner:THtmlElement; AText:string; ALine, ACol:Integer):THtmlElement;
534 | begin
535 | Result := THtmlElement.Create(AOwner, AText, ALine, ACol);
536 | with Result do
537 | begin
538 | // Edwin: 2019-07-09: html entities in