├── .gitignore ├── HtmlParserEx.pas ├── LICENSE ├── README.md └── Tests ├── HTMLParserEx.Tests.Main.pas ├── HTMLParserEx.Tests.dpr └── HTMLParserEx.Tests.dproj /.gitignore: -------------------------------------------------------------------------------- 1 | # Uncomment these types if you want even more clean repository. But be careful. 2 | # It can make harm to an existing project source. Read explanations below. 3 | # 4 | # Type library file (binary). In old Delphi versions it should be stored. 5 | # Since Delphi 2009 it is produced from .ridl file and can safely be ignored. 6 | #*.tlb 7 | # 8 | # Diagram Portfolio file. Used by the diagram editor up to Delphi 7. 9 | # Uncomment this if you are not using diagrams or use newer Delphi version. 10 | #*.ddp 11 | # 12 | # Visual LiveBindings file. Added in Delphi XE2. 13 | # Uncomment this if you are not using LiveBindings Designer. 14 | #*.vlb 15 | # 16 | # Deployment Manager configuration file for your project. Added in Delphi XE2. 17 | # Uncomment this if it is not mobile development and you do not use remote debug feature. 18 | #*.deployproj 19 | 20 | # Delphi compiler-generated binaries (safe to delete) 21 | *.a 22 | *.apk 23 | *.bpi 24 | *.bpl 25 | *.dcp 26 | *.dcu 27 | *.dll 28 | *.drc 29 | *.dres 30 | *.exe 31 | *.lib 32 | *.map 33 | *.o 34 | *.obj 35 | *.ocx 36 | *.res 37 | *.otares 38 | *.rsm 39 | *.so 40 | *.tds 41 | 42 | # Delphi autogenerated files (duplicated info) 43 | *.cfg 44 | *.hpp 45 | *Resource.rc 46 | 47 | # Delphi local files (user-specific info) 48 | *.cbk 49 | *.dsk 50 | *.identcache 51 | *.local 52 | *.projdata 53 | *.tvsconfig 54 | *.user 55 | 56 | # Delphi history and backups 57 | __history/ 58 | __recovery/ 59 | *.~* 60 | 61 | # Castalia statistics file (since XE7 Castalia is distributed with Delphi) 62 | *.stat 63 | 64 | # Platform output folders 65 | Win32 66 | Win64 67 | DCU 68 | 69 | # Explorer image caches 70 | [Tt]humbs.db 71 | 72 | # Other common file extensions 73 | *.bak 74 | *.temp 75 | *.tmp 76 | *.rc 77 | -------------------------------------------------------------------------------- /HtmlParserEx.pas: -------------------------------------------------------------------------------- 1 | unit HtmlParserEx; 2 | {$DEFINE UseXPath } 3 | {$IF RTLVersion < 24.0} 4 | {$MESSAGE ERROR 'Only XE3 and later versions are supported'} 5 | {$ENDIF} 6 | 7 | interface 8 | 9 | uses 10 | System.SysUtils, 11 | System.Classes, 12 | System.Generics.Collections 13 | {$IFDEF UseXPath} 14 | ,System.RegularExpressionsCore 15 | {$ENDIF}; 16 | 17 | {$IF (defined(IOS) and defined(CPUARM)) or defined(ANDROID)} 18 | {$DEFINE MOBILE_DEV} 19 | {$ENDIF} 20 | const 21 | // By design each IHtmlElement has a TagName property, even it's a 'Text' element. 22 | // so cSpecialTagName_Text represents the fake tag name of a Text element 23 | cSpecialTagName_Text = '#TEXT'; 24 | LowStrIndex = low(string); // Mobile platform=0, PC platform=1 25 | 26 | type 27 | IHtmlElement = interface; 28 | IHtmlElementList = interface; 29 | TElementEachEvent = reference to procedure(AIndex:Integer; AEl:IHtmlElement); 30 | 31 | 32 | IHtmlElement = interface 33 | ['{8C75239C-8CFA-499F-B115-7CEBEDFB421B}'] 34 | function GetParent:IHtmlElement; stdcall; 35 | function GetTagName:String; stdcall; 36 | procedure SetTagName(Value:String); stdcall; 37 | function GetContent:String; stdcall; 38 | function GetOrignal:String; stdcall; 39 | function GetChildrenCount:Integer; stdcall; 40 | function GetChildren(Index:Integer):IHtmlElement; stdcall; 41 | function GetCloseTag:IHtmlElement; stdcall; 42 | function GetInnerHtml():String; stdcall; 43 | function GetOuterHtml():String; stdcall; 44 | function GetInnerText():String; stdcall; 45 | procedure SetInnerText(Value:String); stdcall; 46 | function GetAttributes(Key:String):String; stdcall; 47 | procedure SetAttributes(Key:String; Value:String); stdcall; 48 | procedure RemoveAttr(AAttrName:string); stdcall; 49 | function GetSourceLineNum():Integer; stdcall; 50 | function GetSourceColNum():Integer; stdcall; 51 | // Add and remove nodes 52 | function RemoveChild(ANode:IHtmlElement):Integer; stdcall; 53 | procedure Remove; stdcall; 54 | function AppedChild(const ATag:string):IHtmlElement; stdcall; 55 | // Does the property exist 56 | function HasAttribute(AttributeName:String):Boolean; stdcall; 57 | { Find Element using CSS selector syntax, "pseudo-class" is not supported 58 | http://www.w3.org/TR/CSS2/selector.html 59 | } 60 | function SimpleCSSSelector(const selector:String):IHtmlElementList; stdcall; 61 | function Find(const selector:String):IHtmlElementList; stdcall; 62 | {$IFDEF UseXPath} 63 | function FindX(const AXPath:String):IHtmlElementList; stdcall; 64 | {$ENDIF} 65 | // enum property 66 | function EnumAttributeNames(Index:Integer):String; stdcall; 67 | property TagName:String read GetTagName write SetTagName; 68 | property ChildrenCount:Integer read GetChildrenCount; 69 | property Children[index:Integer]:IHtmlElement read GetChildren; default; 70 | property CloseTag:IHtmlElement read GetCloseTag; 71 | property Content:String read GetContent; 72 | property Orignal:String read GetOrignal; 73 | property Parent:IHtmlElement read GetParent; 74 | // Get the position of an element in the source code 75 | property SourceLineNum:Integer read GetSourceLineNum; 76 | property SourceColNum:Integer read GetSourceColNum; 77 | // 78 | property InnerHtml:String read GetInnerHtml; 79 | property OuterHtml:String read GetOuterHtml; 80 | property InnerText:String read GetInnerText write SetInnerText; 81 | property Text:String read GetInnerText write SetInnerText; 82 | property Attributes[Key:String]:String read GetAttributes write SetAttributes; 83 | // ying32 does not change the original, just simplifies the use 84 | property Attrs[Key:String]:String read GetAttributes write SetAttributes; 85 | end; 86 | 87 | 88 | THtmlListEnumerator = class 89 | private 90 | FIndex:Integer; 91 | FList:IHtmlElementList; 92 | public 93 | constructor Create(AList:IHtmlElementList); 94 | function GetCurrent:IHtmlElement; inline; 95 | function MoveNext:Boolean; 96 | property Current:IHtmlElement read GetCurrent; 97 | end; 98 | 99 | 100 | IHtmlElementList = interface 101 | ['{8E1380C6-4263-4BF6-8D10-091A86D8E7D9}'] 102 | function GetCount:Integer; stdcall; 103 | function GetItems(Index:Integer):IHtmlElement; stdcall; 104 | procedure RemoveAll; stdcall; 105 | procedure Remove(ANode:IHtmlElement); stdcall; 106 | procedure Each(f:TElementEachEvent); stdcall; 107 | function GetText:String; stdcall; 108 | function GetEnumerator:THtmlListEnumerator; 109 | property Text:String read GetText; 110 | property Count:Integer read GetCount; 111 | property Items[index:Integer]:IHtmlElement read GetItems; default; 112 | end; 113 | 114 | 115 | function ParserHTML(const Source:String):IHtmlElement; stdcall; 116 | function DecodeHtmlEntities(S:string):string; forward; 117 | 118 | implementation 119 | 120 | uses 121 | StrUtils; 122 | 123 | type 124 | TStringDictionary = TDictionary; 125 | TPropDictionary = TDictionary; 126 | TStringDynArray = TArray; 127 | const 128 | WhiteSpace = [' ', #13, #10, #9]; 129 | // CSS Attribute Compare Operator 130 | OperatorChar = ['=', '!', '*', '~', '|', '^', '$']; 131 | MaxListSize = Maxint div 16; 132 | // TagProperty 133 | tpBlock = $01; 134 | tpInline = $02; 135 | tpEmpty = $04; 136 | tpFormatAsInline = $08; 137 | tpPreserveWhitespace = $10; 138 | tpInlineOrEmpty = tpInline or tpEmpty; 139 | 140 | type 141 | TAttrOperator = (aoExist, aoEqual, aoNotEqual, aoIncludeWord, aoBeginWord, aoBegin, aoEnd, aoContain); 142 | PAttrSelectorItem = ^TAttrSelectorItem; 143 | 144 | 145 | TAttrSelectorItem = record 146 | Key:string; 147 | AttrOperator:TAttrOperator; 148 | Value:string; 149 | end; 150 | 151 | 152 | TSelectorItemRelation = (sirNONE, sirDescendant, sirChildren, sirYoungerBrother, sirAllYoungerBrother); 153 | PCSSSelectorItem = ^TCSSSelectorItem; 154 | 155 | 156 | TCSSSelectorItem = record 157 | Relation:TSelectorItemRelation; 158 | szTag:string; 159 | Attributes: array of TAttrSelectorItem; 160 | end; 161 | 162 | 163 | TCSSSelectorItems = array of TCSSSelectorItem; 164 | PCSSSelectorItems = ^TCSSSelectorItems; 165 | TCSSSelectorItemGroup = array of TCSSSelectorItems; 166 | 167 | 168 | // 169 | TSourceContext = record 170 | private 171 | function GetCharOfCurrent(Index:Integer):Char; inline; 172 | function PassedEndOfSourceCode:Boolean; inline; 173 | public 174 | SourceCode:string; 175 | CharIndex:Integer; 176 | LineNum:Integer; 177 | ColNum:Integer; 178 | CurrentChar:Char; 179 | {$IFDEF DEBUG} 180 | currentCode:PChar; 181 | {$ENDIF} 182 | // Jump to the next char and return true if not exceeding the end of SourceCode 183 | function JumpToNextChar:Boolean; overload; inline; 184 | // Jump multiple characters, and return the jumped steps 185 | function JumpToNextChar(Step:Integer):Integer; overload; inline; 186 | procedure setCode(const ACode:string); inline; 187 | function ReadStr(UntilChars:TSysCharSet):string; inline; 188 | function PeekStr(Index:Integer):string; overload; inline; 189 | function PeekStr():string; overload; inline; 190 | function subStr(Index, Count:Integer):string; overload; inline; 191 | function subStr(Count:Integer):string; overload; inline; 192 | procedure SkipBlank(); inline; 193 | property charOfCurrent[index:Integer]:Char read GetCharOfCurrent; 194 | end; 195 | 196 | 197 | TAttributeItem = record 198 | Key, Value:string; 199 | end; 200 | 201 | 202 | TAttributeDynArray = TArray; 203 | TIHtmlElementList = class; 204 | THtmlElement = class; 205 | THtmlElementList = TList; 206 | 207 | 208 | THtmlElement = class(TInterfacedObject, IHtmlElement) 209 | private 210 | function GetChildrens:IHtmlElementList; 211 | protected 212 | // ying32 213 | function GetParent:IHtmlElement; stdcall; 214 | function GetTagName:String; stdcall; 215 | procedure SetTagName(Value:String); stdcall; 216 | function GetContent:String; stdcall; 217 | function GetOrignal:String; stdcall; 218 | function GetChildrenCount:Integer; stdcall; 219 | function GetChildren(Index:Integer):IHtmlElement; stdcall; 220 | function GetCloseTag:IHtmlElement; stdcall; 221 | function GetInnerHtml():String; stdcall; 222 | function GetOuterHtml():String; stdcall; 223 | function GetInnerText():String; stdcall; 224 | procedure SetInnerText(Value:String); stdcall; 225 | function GetAttributes(Key:String):String; stdcall; 226 | procedure SetAttributes(Key:String; Value:String); stdcall; 227 | procedure RemoveAttr(AAttrName:string); stdcall; 228 | function GetSourceLineNum():Integer; stdcall; 229 | function GetSourceColNum():Integer; stdcall; 230 | // ying32Added 231 | function RemoveChild(ANode:IHtmlElement):Integer; stdcall; 232 | procedure Remove; stdcall; 233 | function AppedChild(const ATag:string):IHtmlElement; stdcall; 234 | 235 | // Does the property exist 236 | function HasAttribute(AttributeName:String):Boolean; stdcall; 237 | { Find Element with CSS selector syntax, does not support "pseudo-class" 238 | http://www.w3.org/TR/CSS2/selector.html 239 | } 240 | function SimpleCSSSelector(const selector:String):IHtmlElementList; stdcall; 241 | function Find(const selector:String):IHtmlElementList; stdcall; 242 | {$IFDEF UseXPath} 243 | function FindX(const AXPath:String):IHtmlElementList; stdcall; 244 | {$ENDIF} 245 | // enum property 246 | function EnumAttributeNames(Index:Integer):String; stdcall; 247 | property TagName:String read GetTagName write SetTagName; 248 | property ChildrenCount:Integer read GetChildrenCount; 249 | property Children[index:Integer]:IHtmlElement read GetChildren; default; 250 | property CloseTag:IHtmlElement read GetCloseTag; 251 | property Content:String read GetContent; 252 | property Orignal:String read GetOrignal; 253 | property Parent:IHtmlElement read GetParent; 254 | // Get the position of an element in the source code 255 | property SourceLineNum:Integer read GetSourceLineNum; 256 | property SourceColNum:Integer read GetSourceColNum; 257 | // 258 | property InnerHtml:String read GetInnerHtml; 259 | property OuterHtml:String read GetOuterHtml; 260 | property InnerText:String read GetInnerText; 261 | property Attributes[Key:String]:String read GetAttributes write SetAttributes; 262 | property Childrens:IHtmlElementList read GetChildrens; 263 | private 264 | FClosed:Boolean; 265 | // 266 | FOwner:THtmlElement; 267 | FCloseTag:IHtmlElement; 268 | FTagName:string; 269 | FIsCloseTag:Boolean; 270 | FContent:string; 271 | FOrignal:string; 272 | FSourceLine:Integer; 273 | FSourceCol:Integer; 274 | // 275 | FAttributes:TStringDictionary; 276 | FChildren:TIHtmlElementList; 277 | procedure _GetHtml(IncludeSelf:Boolean; Sb:TStringBuilder); 278 | procedure _GetText(IncludeSelf:Boolean; Sb:TStringBuilder); 279 | procedure _SimpleCSSSelector(const ItemGroup:TCSSSelectorItemGroup; r:TIHtmlElementList); 280 | procedure _Select(Item:PCSSSelectorItem; Count:Integer; r:TIHtmlElementList; OnlyTopLevel:Boolean = false); 281 | public 282 | constructor Create(AOwner:THtmlElement; AText:string; ALine, ACol:Integer); 283 | destructor Destroy; override; 284 | end; 285 | 286 | 287 | TIHtmlElementList = class(TInterfacedObject, IHtmlElementList) 288 | private 289 | // IHtmlElementList 290 | function GetItems(Index:Integer):IHtmlElement; stdcall; 291 | function GetCount:Integer; stdcall; 292 | protected 293 | FList:TList; 294 | procedure SetItems(Index:Integer; const Value:IHtmlElement); inline; 295 | function Add(Value:IHtmlElement):Integer; inline; 296 | procedure Delete(Index:Integer); inline; 297 | procedure Clear; inline; 298 | // ying32Added 299 | procedure RemoveAll; stdcall; 300 | procedure Remove(ANode:IHtmlElement); stdcall; 301 | procedure Each(f:TElementEachEvent); stdcall; 302 | function GetText:String; stdcall; 303 | public 304 | constructor Create; 305 | destructor Destroy; override; 306 | function GetEnumerator:THtmlListEnumerator; 307 | function IndexOf(Item:IHtmlElement):Integer; 308 | // IHtmlElementList 309 | property Items[index:Integer]:IHtmlElement read GetItems write SetItems; default; 310 | property Count:Integer read GetCount; 311 | end; 312 | 313 | 314 | function SplitStr(ACharSet:TSysCharSet; AStr:string):TStringDynArray; 315 | var 316 | L, I:Integer; 317 | S:string; 318 | StrChar:Char; 319 | begin 320 | Result := nil; 321 | if Length(AStr) <= 0 then 322 | Exit; 323 | I := low(AStr); 324 | L := low(AStr); 325 | StrChar := #0; 326 | while I <= high(AStr) do 327 | begin 328 | if CharInSet(AStr[I], ['''', '"']) then 329 | if StrChar = #0 then 330 | StrChar := AStr[I] 331 | else if StrChar = AStr[I] then 332 | StrChar := #0; 333 | // Not in the string, the delimiter takes effect 334 | if StrChar = #0 then 335 | if CharInSet(AStr[I], ACharSet) then 336 | begin 337 | if I > L then 338 | begin 339 | S := Copy(AStr, L{$IF (LowStrIndex = 0)} + 1{$ENDIF}, I - L); 340 | SetLength(Result, Length(Result) + 1); 341 | Result[Length(Result) - 1] := S; 342 | end; 343 | L := I + 1; 344 | end; 345 | Inc(I); 346 | end; 347 | if (I > L) then 348 | begin 349 | S := Copy(AStr, L{$IF (LowStrIndex = 0)} + 1{$ENDIF}, I - L); 350 | SetLength(Result, Length(Result) + 1); 351 | Result[Length(Result) - 1] := S; 352 | end; 353 | end; 354 | 355 | 356 | function StrRight(const Value:string; Count:Integer):string; 357 | var 358 | start:Integer; 359 | begin 360 | start := Length(Value) - Count + 1; 361 | if start <= 0 then 362 | Result := Value 363 | else 364 | Result := Copy(Value, start, Count); 365 | end; 366 | 367 | 368 | function StrLeft(const Value:string; Count:Integer):string; 369 | begin 370 | Result := Copy(Value, LowStrIndex, Count); 371 | end; 372 | 373 | 374 | // ComapreAttr 375 | function _aoExist(const Item:TAttrSelectorItem; E:THtmlElement):Boolean; 376 | begin 377 | Result := E.FAttributes.ContainsKey(Item.Key); 378 | end; 379 | 380 | 381 | function _aoEqual(const Item:TAttrSelectorItem; E:THtmlElement):Boolean; 382 | begin 383 | Result := E.FAttributes.ContainsKey(Item.Key) and (E.FAttributes[Item.Key] = Item.Value); 384 | end; 385 | 386 | 387 | function _aoNotEqual(const Item:TAttrSelectorItem; E:THtmlElement):Boolean; 388 | begin 389 | Result := E.FAttributes.ContainsKey(Item.Key) and (E.FAttributes[Item.Key] <> Item.Value); 390 | end; 391 | 392 | 393 | function _aoIncludeWord(const Item:TAttrSelectorItem; E:THtmlElement):Boolean; 394 | var 395 | S:TStringDynArray; 396 | I:Integer; 397 | begin 398 | Result := false; 399 | if not E.FAttributes.ContainsKey(Item.Key) then 400 | Exit; 401 | Result := True; 402 | S := SplitStr(WhiteSpace, E.FAttributes[Item.Key]); 403 | for I := low(S) to high(S) do 404 | if S[I] = Item.Value then 405 | Exit; 406 | Result := false; 407 | end; 408 | 409 | 410 | function _aoBeginWord(const Item:TAttrSelectorItem; E:THtmlElement):Boolean; 411 | var 412 | S:TStringDynArray; 413 | begin 414 | Result := false; 415 | if not E.FAttributes.ContainsKey(Item.Key) then 416 | Exit; 417 | S := SplitStr((WhiteSpace + ['_', '-']), E.FAttributes[Item.Key]); 418 | Result := (Length(S) > 0) and (S[0] = Item.Value); 419 | end; 420 | 421 | 422 | function _aoBegin(const Item:TAttrSelectorItem; E:THtmlElement):Boolean; 423 | var 424 | attr, Value:string; 425 | begin 426 | Result := false; 427 | if not E.FAttributes.ContainsKey(Item.Key) then 428 | Exit; 429 | attr := E.FAttributes[Item.Key]; 430 | Value := Item.Value; 431 | Result := (Length(attr) > Length(Value)) and (StrLeft(attr, Length(Value)) = Value); 432 | end; 433 | 434 | 435 | function _aoEnd(const Item:TAttrSelectorItem; E:THtmlElement):Boolean; 436 | var 437 | attr, Value:string; 438 | begin 439 | Result := false; 440 | if not E.FAttributes.ContainsKey(Item.Key) then 441 | Exit; 442 | attr := E.FAttributes[Item.Key]; 443 | Value := Item.Value; 444 | Result := (Length(attr) > Length(Value)) and (StrRight(attr, Length(Value)) = Value); 445 | end; 446 | 447 | 448 | function _aoContain(const Item:TAttrSelectorItem; E:THtmlElement):Boolean; 449 | begin 450 | Result := false; 451 | if not E.FAttributes.ContainsKey(Item.Key) then 452 | Exit; 453 | Result := Pos(Item.Value, E.FAttributes[Item.Key]) > 0; 454 | end; 455 | 456 | type 457 | TFNCompareAttr = function(const Item:TAttrSelectorItem; E:THtmlElement):Boolean; 458 | const 459 | AttrCompareFuns: array [TAttrOperator] of TFNCompareAttr = (_aoExist, _aoEqual, _aoNotEqual, _aoIncludeWord, _aoBeginWord, _aoBegin, _aoEnd, _aoContain); 460 | function GetTagProperty(const TagName:string):WORD; forward; 461 | 462 | 463 | procedure DoError(const Msg:string); 464 | begin 465 | raise Exception.Create(Msg); 466 | end; 467 | 468 | 469 | procedure _ParserAttrs(var sc:TSourceContext; var Attrs:TAttributeDynArray); 470 | var 471 | Item:TAttributeItem; 472 | begin 473 | SetLength(Attrs, 0); 474 | while True do 475 | begin 476 | sc.SkipBlank(); 477 | if sc.CurrentChar = #0 then 478 | Break; 479 | Item.Key := sc.ReadStr((WhiteSpace + [#0, '='])); 480 | Item.Value := ''; 481 | sc.SkipBlank; 482 | if sc.CurrentChar = '=' then 483 | begin 484 | sc.JumpToNextChar; 485 | sc.SkipBlank; 486 | Item.Value := sc.ReadStr((WhiteSpace + [#0])); 487 | end; 488 | SetLength(Attrs, Length(Attrs) + 1); 489 | Attrs[Length(Attrs) - 1] := Item; 490 | end; 491 | end; 492 | 493 | 494 | procedure _ParserNodeItem(S:string; var ATagName:string; var Attrs:TAttributeDynArray); 495 | var 496 | sc:TSourceContext; 497 | begin 498 | sc.setCode(S); 499 | sc.SkipBlank; 500 | ATagName := UpperCase(sc.ReadStr((WhiteSpace + [#0, '/', '>']))); 501 | _ParserAttrs(sc, Attrs); 502 | end; 503 | 504 | 505 | function CreateTextElement(AOwner:THtmlElement; AText:string; ALine, ACol:Integer):THtmlElement; 506 | begin 507 | Result := THtmlElement.Create(AOwner, AText, ALine, ACol); 508 | with Result do 509 | begin 510 | // Edwin: 2019-07-08: html entities in the text element shouldn't be decoded? 511 | // FContent := DecodeHtmlEntities(AText); 512 | FContent := AText; 513 | FTagName := cSpecialTagName_Text; 514 | FClosed := True; 515 | end; 516 | end; 517 | 518 | 519 | function CreateScriptElement(AOwner:THtmlElement; AText:string; ALine, ACol:Integer):THtmlElement; 520 | begin 521 | Result := THtmlElement.Create(AOwner, AText, ALine, ACol); 522 | with Result do 523 | begin 524 | // Edwin: 2019-07-09: html entities in scripts shouldn't be decoded? 525 | // FContent := DecodeHtmlEntities(AText); 526 | FContent := AText; 527 | FTagName := '#SCRIPT'; 528 | FClosed := True; 529 | end; 530 | end; 531 | 532 | 533 | function CreateStyleElement(AOwner:THtmlElement; AText:string; ALine, ACol:Integer):THtmlElement; 534 | begin 535 | Result := THtmlElement.Create(AOwner, AText, ALine, ACol); 536 | with Result do 537 | begin 538 | // Edwin: 2019-07-09: html entities in