├── .gitignore ├── HTMLp.DOMCore.pas ├── HTMLp.Entities.pas ├── HTMLp.Formatter.pas ├── HTMLp.HTMLParser.pas ├── HTMLp.HTMLReader.pas ├── HTMLp.HTMLTags.pas ├── HTMLp.Helper.pas ├── LICENSE ├── README.md └── example ├── HTMLP.dpr ├── HTMLP.dproj ├── MainForm.dfm ├── MainForm.pas └── index.html /.gitignore: -------------------------------------------------------------------------------- 1 | # Uncomment these types if you want even more clean repository. But be careful. 2 | # It can make harm to an existing project source. Read explanations below. 3 | # 4 | # Resource files are binaries containing manifest, project icon and version info. 5 | # They can not be viewed as text or compared by diff-tools. Consider replacing them with .rc files. 6 | #*.res 7 | # 8 | # Type library file (binary). In old Delphi versions it should be stored. 9 | # Since Delphi 2009 it is produced from .ridl file and can safely be ignored. 10 | #*.tlb 11 | # 12 | # Diagram Portfolio file. Used by the diagram editor up to Delphi 7. 13 | # Uncomment this if you are not using diagrams or use newer Delphi version. 14 | #*.ddp 15 | # 16 | # Visual LiveBindings file. Added in Delphi XE2. 17 | # Uncomment this if you are not using LiveBindings Designer. 18 | #*.vlb 19 | # 20 | # Deployment Manager configuration file for your project. Added in Delphi XE2. 21 | # Uncomment this if it is not mobile development and you do not use remote debug feature. 22 | #*.deployproj 23 | # 24 | # C++ object files produced when C/C++ Output file generation is configured. 25 | # Uncomment this if you are not using external objects (zlib library for example). 26 | #*.obj 27 | # 28 | 29 | # Delphi compiler-generated binaries (safe to delete) 30 | *.exe 31 | *.dll 32 | *.bpl 33 | *.bpi 34 | *.dcp 35 | *.so 36 | *.apk 37 | *.drc 38 | *.map 39 | *.dres 40 | *.rsm 41 | *.tds 42 | *.dcu 43 | *.lib 44 | *.a 45 | *.o 46 | *.ocx 47 | 48 | # Delphi autogenerated files (duplicated info) 49 | *.cfg 50 | *.hpp 51 | *Resource.rc 52 | 53 | # Delphi local files (user-specific info) 54 | *.local 55 | *.identcache 56 | *.projdata 57 | *.tvsconfig 58 | *.dsk 59 | 60 | # Delphi history and backups 61 | __history/ 62 | __recovery/ 63 | *.~* 64 | 65 | # Castalia statistics file (since XE7 Castalia is distributed with Delphi) 66 | *.stat 67 | -------------------------------------------------------------------------------- /HTMLp.Entities.pas: -------------------------------------------------------------------------------- 1 | unit HTMLp.Entities; 2 | 3 | interface 4 | 5 | const 6 | MaxEntNameLen = 8; 7 | 8 | function GetEntValue(const Name: string): WideChar; 9 | function GetEntName(Code: Word): string; 10 | 11 | implementation 12 | 13 | uses 14 | SysUtils, Classes; 15 | 16 | const 17 | EntCount = 252; 18 | 19 | type 20 | PEntity = ^TEntity; 21 | TEntity = record 22 | Name: String; 23 | Code: Word 24 | end; 25 | 26 | TEntities = array[0..EntCount - 1] of TEntity; 27 | 28 | const 29 | EntTab: TEntities = ( 30 | (Name: 'nbsp'; Code: 160), 31 | (Name: 'iexcl'; Code: 161), 32 | (Name: 'cent'; Code: 162), 33 | (Name: 'pound'; Code: 163), 34 | (Name: 'curren'; Code: 164), 35 | (Name: 'yen'; Code: 165), 36 | (Name: 'brvbar'; Code: 166), 37 | (Name: 'sect'; Code: 167), 38 | (Name: 'uml'; Code: 168), 39 | (Name: 'copy'; Code: 169), 40 | (Name: 'ordf'; Code: 170), 41 | (Name: 'laquo'; Code: 171), 42 | (Name: 'not'; Code: 172), 43 | (Name: 'shy'; Code: 173), 44 | (Name: 'reg'; Code: 174), 45 | (Name: 'macr'; Code: 175), 46 | (Name: 'deg'; Code: 176), 47 | (Name: 'plusmn'; Code: 177), 48 | (Name: 'sup2'; Code: 178), 49 | (Name: 'sup3'; Code: 179), 50 | (Name: 'acute'; Code: 180), 51 | (Name: 'micro'; Code: 181), 52 | (Name: 'para'; Code: 182), 53 | (Name: 'middot'; Code: 183), 54 | (Name: 'cedil'; Code: 184), 55 | (Name: 'sup1'; Code: 185), 56 | (Name: 'ordm'; Code: 186), 57 | (Name: 'raquo'; Code: 187), 58 | (Name: 'frac14'; Code: 188), 59 | (Name: 'frac12'; Code: 189), 60 | (Name: 'frac34'; Code: 190), 61 | (Name: 'iquest'; Code: 191), 62 | (Name: 'Agrave'; Code: 192), 63 | (Name: 'Aacute'; Code: 193), 64 | (Name: 'Acirc'; Code: 194), 65 | (Name: 'Atilde'; Code: 195), 66 | (Name: 'Auml'; Code: 196), 67 | (Name: 'Aring'; Code: 197), 68 | (Name: 'AElig'; Code: 198), 69 | (Name: 'Ccedil'; Code: 199), 70 | (Name: 'Egrave'; Code: 200), 71 | (Name: 'Eacute'; Code: 201), 72 | (Name: 'Ecirc'; Code: 202), 73 | (Name: 'Euml'; Code: 203), 74 | (Name: 'Igrave'; Code: 204), 75 | (Name: 'Iacute'; Code: 205), 76 | (Name: 'Icirc'; Code: 206), 77 | (Name: 'Iuml'; Code: 207), 78 | (Name: 'ETH'; Code: 208), 79 | (Name: 'Ntilde'; Code: 209), 80 | (Name: 'Ograve'; Code: 210), 81 | (Name: 'Oacute'; Code: 211), 82 | (Name: 'Ocirc'; Code: 212), 83 | (Name: 'Otilde'; Code: 213), 84 | (Name: 'Ouml'; Code: 214), 85 | (Name: 'times'; Code: 215), 86 | (Name: 'Oslash'; Code: 216), 87 | (Name: 'Ugrave'; Code: 217), 88 | (Name: 'Uacute'; Code: 218), 89 | (Name: 'Ucirc'; Code: 219), 90 | (Name: 'Uuml'; Code: 220), 91 | (Name: 'Yacute'; Code: 221), 92 | (Name: 'THORN'; Code: 222), 93 | (Name: 'szlig'; Code: 223), 94 | (Name: 'agrave'; Code: 224), 95 | (Name: 'aacute'; Code: 225), 96 | (Name: 'acirc'; Code: 226), 97 | (Name: 'atilde'; Code: 227), 98 | (Name: 'auml'; Code: 228), 99 | (Name: 'aring'; Code: 229), 100 | (Name: 'aelig'; Code: 230), 101 | (Name: 'ccedil'; Code: 231), 102 | (Name: 'egrave'; Code: 232), 103 | (Name: 'eacute'; Code: 233), 104 | (Name: 'ecirc'; Code: 234), 105 | (Name: 'euml'; Code: 235), 106 | (Name: 'igrave'; Code: 236), 107 | (Name: 'iacute'; Code: 237), 108 | (Name: 'icirc'; Code: 238), 109 | (Name: 'iuml'; Code: 239), 110 | (Name: 'eth'; Code: 240), 111 | (Name: 'ntilde'; Code: 241), 112 | (Name: 'ograve'; Code: 242), 113 | (Name: 'oacute'; Code: 243), 114 | (Name: 'ocirc'; Code: 244), 115 | (Name: 'otilde'; Code: 245), 116 | (Name: 'ouml'; Code: 246), 117 | (Name: 'divide'; Code: 247), 118 | (Name: 'oslash'; Code: 248), 119 | (Name: 'ugrave'; Code: 249), 120 | (Name: 'uacute'; Code: 250), 121 | (Name: 'ucirc'; Code: 251), 122 | (Name: 'uuml'; Code: 252), 123 | (Name: 'yacute'; Code: 253), 124 | (Name: 'thorn'; Code: 254), 125 | (Name: 'yuml'; Code: 255), 126 | (Name: 'fnof'; Code: 402), 127 | (Name: 'Alpha'; Code: 913), 128 | (Name: 'Beta'; Code: 914), 129 | (Name: 'Gamma'; Code: 915), 130 | (Name: 'Delta'; Code: 916), 131 | (Name: 'Epsilon'; Code: 917), 132 | (Name: 'Zeta'; Code: 918), 133 | (Name: 'Eta'; Code: 919), 134 | (Name: 'Theta'; Code: 920), 135 | (Name: 'Iota'; Code: 921), 136 | (Name: 'Kappa'; Code: 922), 137 | (Name: 'Lambda'; Code: 923), 138 | (Name: 'Mu'; Code: 924), 139 | (Name: 'Nu'; Code: 925), 140 | (Name: 'Xi'; Code: 926), 141 | (Name: 'Omicron'; Code: 927), 142 | (Name: 'Pi'; Code: 928), 143 | (Name: 'Rho'; Code: 929), 144 | (Name: 'Sigma'; Code: 931), 145 | (Name: 'Tau'; Code: 932), 146 | (Name: 'Upsilon'; Code: 933), 147 | (Name: 'Phi'; Code: 934), 148 | (Name: 'Chi'; Code: 935), 149 | (Name: 'Psi'; Code: 936), 150 | (Name: 'Omega'; Code: 937), 151 | (Name: 'alpha'; Code: 945), 152 | (Name: 'beta'; Code: 946), 153 | (Name: 'gamma'; Code: 947), 154 | (Name: 'delta'; Code: 948), 155 | (Name: 'epsilon'; Code: 949), 156 | (Name: 'zeta'; Code: 950), 157 | (Name: 'eta'; Code: 951), 158 | (Name: 'theta'; Code: 952), 159 | (Name: 'iota'; Code: 953), 160 | (Name: 'kappa'; Code: 954), 161 | (Name: 'lambda'; Code: 955), 162 | (Name: 'mu'; Code: 956), 163 | (Name: 'nu'; Code: 957), 164 | (Name: 'xi'; Code: 958), 165 | (Name: 'omicron'; Code: 959), 166 | (Name: 'pi'; Code: 960), 167 | (Name: 'rho'; Code: 961), 168 | (Name: 'sigmaf'; Code: 962), 169 | (Name: 'sigma'; Code: 963), 170 | (Name: 'tau'; Code: 964), 171 | (Name: 'upsilon'; Code: 965), 172 | (Name: 'phi'; Code: 966), 173 | (Name: 'chi'; Code: 967), 174 | (Name: 'psi'; Code: 968), 175 | (Name: 'omega'; Code: 969), 176 | (Name: 'thetasym'; Code: 977), 177 | (Name: 'upsih'; Code: 978), 178 | (Name: 'piv'; Code: 982), 179 | (Name: 'bull'; Code: 8226), 180 | (Name: 'hellip'; Code: 8230), 181 | (Name: 'prime'; Code: 8242), 182 | (Name: 'Prime'; Code: 8243), 183 | (Name: 'oline'; Code: 8254), 184 | (Name: 'frasl'; Code: 8260), 185 | (Name: 'weierp'; Code: 8472), 186 | (Name: 'image'; Code: 8465), 187 | (Name: 'real'; Code: 8476), 188 | (Name: 'trade'; Code: 8482), 189 | (Name: 'alefsym'; Code: 8501), 190 | (Name: 'larr'; Code: 8592), 191 | (Name: 'uarr'; Code: 8593), 192 | (Name: 'rarr'; Code: 8594), 193 | (Name: 'darr'; Code: 8595), 194 | (Name: 'harr'; Code: 8596), 195 | (Name: 'crarr'; Code: 8629), 196 | (Name: 'lArr'; Code: 8656), 197 | (Name: 'uArr'; Code: 8657), 198 | (Name: 'rArr'; Code: 8658), 199 | (Name: 'dArr'; Code: 8659), 200 | (Name: 'hArr'; Code: 8660), 201 | (Name: 'forall'; Code: 8704), 202 | (Name: 'part'; Code: 8706), 203 | (Name: 'exist'; Code: 8707), 204 | (Name: 'empty'; Code: 8709), 205 | (Name: 'nabla'; Code: 8711), 206 | (Name: 'isin'; Code: 8712), 207 | (Name: 'notin'; Code: 8713), 208 | (Name: 'ni'; Code: 8715), 209 | (Name: 'prod'; Code: 8719), 210 | (Name: 'sum'; Code: 8721), 211 | (Name: 'minus'; Code: 8722), 212 | (Name: 'lowast'; Code: 8727), 213 | (Name: 'radic'; Code: 8730), 214 | (Name: 'prop'; Code: 8733), 215 | (Name: 'infin'; Code: 8734), 216 | (Name: 'ang'; Code: 8736), 217 | (Name: 'and'; Code: 8743), 218 | (Name: 'or'; Code: 8744), 219 | (Name: 'cap'; Code: 8745), 220 | (Name: 'cup'; Code: 8746), 221 | (Name: 'int'; Code: 8747), 222 | (Name: 'there4'; Code: 8756), 223 | (Name: 'sim'; Code: 8764), 224 | (Name: 'cong'; Code: 8773), 225 | (Name: 'asymp'; Code: 8776), 226 | (Name: 'ne'; Code: 8800), 227 | (Name: 'equiv'; Code: 8801), 228 | (Name: 'le'; Code: 8804), 229 | (Name: 'ge'; Code: 8805), 230 | (Name: 'sub'; Code: 8834), 231 | (Name: 'sup'; Code: 8835), 232 | (Name: 'nsub'; Code: 8836), 233 | (Name: 'sube'; Code: 8838), 234 | (Name: 'supe'; Code: 8839), 235 | (Name: 'oplus'; Code: 8853), 236 | (Name: 'otimes'; Code: 8855), 237 | (Name: 'perp'; Code: 8869), 238 | (Name: 'sdot'; Code: 8901), 239 | (Name: 'lceil'; Code: 8968), 240 | (Name: 'rceil'; Code: 8969), 241 | (Name: 'lfloor'; Code: 8970), 242 | (Name: 'rfloor'; Code: 8971), 243 | (Name: 'lang'; Code: 9001), 244 | (Name: 'rang'; Code: 9002), 245 | (Name: 'loz'; Code: 9674), 246 | (Name: 'spades'; Code: 9824), 247 | (Name: 'clubs'; Code: 9827), 248 | (Name: 'hearts'; Code: 9829), 249 | (Name: 'diams'; Code: 9830), 250 | (Name: 'quot'; Code: 34), 251 | (Name: 'amp'; Code: 38), 252 | (Name: 'lt'; Code: 60), 253 | (Name: 'gt'; Code: 62), 254 | (Name: 'OElig'; Code: 338), 255 | (Name: 'oelig'; Code: 339), 256 | (Name: 'Scaron'; Code: 352), 257 | (Name: 'scaron'; Code: 353), 258 | (Name: 'Yuml'; Code: 376), 259 | (Name: 'circ'; Code: 710), 260 | (Name: 'tilde'; Code: 732), 261 | (Name: 'ensp'; Code: 8194), 262 | (Name: 'emsp'; Code: 8195), 263 | (Name: 'thinsp'; Code: 8201), 264 | (Name: 'zwnj'; Code: 8204), 265 | (Name: 'zwj'; Code: 8205), 266 | (Name: 'lrm'; Code: 8206), 267 | (Name: 'rlm'; Code: 8207), 268 | (Name: 'ndash'; Code: 8211), 269 | (Name: 'mdash'; Code: 8212), 270 | (Name: 'lsquo'; Code: 8216), 271 | (Name: 'rsquo'; Code: 8217), 272 | (Name: 'sbquo'; Code: 8218), 273 | (Name: 'ldquo'; Code: 8220), 274 | (Name: 'rdquo'; Code: 8221), 275 | (Name: 'bdquo'; Code: 8222), 276 | (Name: 'dagger'; Code: 8224), 277 | (Name: 'Dagger'; Code: 8225), 278 | (Name: 'permil'; Code: 8240), 279 | (Name: 'lsaquo'; Code: 8249), 280 | (Name: 'rsaquo'; Code: 8250), 281 | (Name: 'euro'; Code: 8364) 282 | ); 283 | 284 | type 285 | TEntList = class(TList) 286 | private 287 | function GetCode(const Name: string): Integer; 288 | public 289 | constructor Create; 290 | property Code[const Name: string]: Integer read GetCode; 291 | end; 292 | 293 | var 294 | EntityList: TEntList; 295 | 296 | function EntCompare(Ent1, Ent2: Pointer): Integer; 297 | begin 298 | Result := AnsiCompareStr(PEntity(Ent1)^.Name, PEntity(Ent2)^.Name); 299 | end; 300 | 301 | constructor TEntList.Create; 302 | var 303 | I: Integer; 304 | begin 305 | inherited Create; 306 | 307 | Capacity := EntCount; 308 | for I := 0 to EntCount - 1 do Add(@EntTab[I]); 309 | Sort(EntCompare); 310 | end; 311 | 312 | function TEntList.GetCode(const Name: string): Integer; 313 | var 314 | I, L, U, Cmp: Integer; 315 | begin 316 | L := 0; 317 | U := Count - 1; 318 | while L <= U do 319 | begin 320 | I := (L + U) div 2; 321 | 322 | Cmp := AnsiCompareStr(Name, PEntity(Items[I])^.Name); 323 | if Cmp = 0 then 324 | begin 325 | Result := PEntity(Items[I])^.Code; 326 | Exit 327 | end; 328 | 329 | if Cmp < 0 then U := I - 1 330 | else L := I + 1; 331 | end; 332 | 333 | Result := 32; 334 | end; 335 | 336 | function GetEntValue(const Name: string): WideChar; 337 | begin 338 | Result := WideChar(EntityList.Code[Name]); 339 | end; 340 | 341 | function GetEntName(Code: Word): string; 342 | var 343 | I: Integer; 344 | begin 345 | for I := 0 to EntCount - 1 do 346 | begin 347 | if EntTab[I].Code = Code then 348 | begin 349 | Result := EntTab[I].Name; 350 | Exit 351 | end; 352 | end; 353 | 354 | Result := ''; 355 | end; 356 | 357 | initialization 358 | EntityList := TEntList.Create; 359 | finalization 360 | EntityList.Free; 361 | end. 362 | -------------------------------------------------------------------------------- /HTMLp.Formatter.pas: -------------------------------------------------------------------------------- 1 | unit HTMLp.Formatter; 2 | 3 | interface 4 | 5 | uses 6 | HTMLp.DomCore, HTMLp.HtmlTags; 7 | 8 | const 9 | SHOW_ALL = $FFFFFFFF; 10 | SHOW_ELEMENT = $00000001; 11 | SHOW_ATTRIBUTE = $00000002; 12 | SHOW_TEXT = $00000004; 13 | SHOW_CDATA_SECTION = $00000008; 14 | SHOW_ENTITY_REFERENCE = $00000010; 15 | SHOW_ENTITY = $00000020; 16 | SHOW_PROCESSING_INSTRUCTION = $00000040; 17 | SHOW_COMMENT = $00000080; 18 | SHOW_DOCUMENT = $00000100; 19 | SHOW_DOCUMENT_TYPE = $00000200; 20 | SHOW_DOCUMENT_FRAGMENT = $00000400; 21 | SHOW_NOTATION = $00000800; 22 | 23 | type 24 | TStringBuilder = class 25 | private 26 | FCapacity: Integer; 27 | FLength: Integer; 28 | FValue: WideString; 29 | public 30 | constructor Create(ACapacity: Integer); 31 | function EndWithWhiteSpace: Boolean; 32 | function TailMatch(const Tail: WideString): Boolean; 33 | function ToString: WideString; 34 | procedure AppendText(const TextStr: WideString); 35 | property Length: Integer read FLength; 36 | end; 37 | 38 | TBaseFormatter = class 39 | private 40 | procedure ProcessNode(Node: TNode); 41 | protected 42 | FDocument: TDocument; 43 | FStringBuilder: TStringBuilder; 44 | FHTMLTagList: THTMLTagList; 45 | FDepth: Integer; 46 | FWhatToShow: Integer; 47 | FExpandEntities: Boolean; 48 | FPreserveWhiteSpace: Boolean; 49 | FInAttributes: Boolean; 50 | procedure AppendNewLine; 51 | procedure AppendParagraph; 52 | procedure AppendText(const TextStr: WideString); virtual; 53 | procedure ProcessAttribute(Attr: TAttr); virtual; 54 | procedure ProcessAttributes(Element: TElement); virtual; 55 | procedure ProcessCDataSection(CDataSection: TCDataSection); virtual; 56 | procedure ProcessComment(Comment: TComment); virtual; 57 | procedure ProcessDocumentElement; virtual; 58 | procedure ProcessElement(Element: TElement); virtual; 59 | procedure ProcessEntityReference(EntityReference: TEntityReference); virtual; 60 | // procedure ProcessNotation(Notation: TNotation); virtual; 61 | procedure ProcessProcessingInstruction(ProcessingInstruction: TProcessingInstruction); virtual; 62 | procedure ProcessTextNode(TextNode: TTextNode); virtual; 63 | public 64 | constructor Create; 65 | destructor Destroy; override; 66 | 67 | function getText(document: TDocument): WideString; 68 | end; 69 | 70 | THtmlFormatter = class(TBaseFormatter) 71 | private 72 | FIndent: Integer; 73 | function OnlyTextContent(Element: TElement): Boolean; 74 | protected 75 | procedure ProcessAttribute(Attr: TAttr); override; 76 | procedure ProcessComment(Comment: TComment); override; 77 | procedure ProcessElement(Element: TElement); override; 78 | procedure ProcessTextNode(TextNode: TTextNode); override; 79 | public 80 | constructor Create; 81 | property Indent: Integer read FIndent write FIndent; 82 | end; 83 | 84 | TTextFormatter = class(TBaseFormatter) 85 | protected 86 | FInsideAnchor: Boolean; 87 | function GetAnchorText(Node: TElement): WideString; virtual; 88 | function GetImageText(Node: TElement): WideString; virtual; 89 | procedure AppendText(const TextStr: WideString); override; 90 | procedure ProcessElement(Element: TElement); override; 91 | procedure ProcessEntityReference(EntityReference: TEntityReference); override; 92 | procedure ProcessTextNode(TextNode: TTextNode); override; 93 | public 94 | constructor Create; 95 | end; 96 | 97 | implementation 98 | 99 | uses 100 | SysUtils, 101 | 102 | HTMLp.Entities; 103 | 104 | const 105 | CRLF: WideString = #13#10; 106 | PARAGRAPH_SEPARATOR: WideString = #13#10#13#10; 107 | 108 | ViewAsBlockTags: THTMLTagSet = [ 109 | ADDRESS_TAG, BLOCKQUOTE_TAG, CAPTION_TAG, CENTER_TAG, DD_TAG, DIV_TAG, 110 | DL_TAG, DT_TAG, FIELDSET_TAG, FORM_TAG, FRAME_TAG, H1_TAG, H2_TAG, H3_TAG, 111 | H4_TAG, H5_TAG, H6_TAG, HR_TAG, IFRAME_TAG, LI_TAG, NOFRAMES_TAG, NOSCRIPT_TAG, 112 | OL_TAG, P_TAG, PRE_TAG, TABLE_TAG, TD_TAG, TH_TAG, TITLE_TAG, UL_TAG 113 | ]; 114 | 115 | function IsWhiteSpace(W: WideChar): Boolean; 116 | begin 117 | Result := (Ord(W) in WhiteSpace); 118 | end; 119 | 120 | function normalizeWhiteSpace(const TextStr: WideString): WideString; 121 | var 122 | I, J, Count: Integer; 123 | begin 124 | SetLength(Result, Length(TextStr)); 125 | 126 | J := 0; 127 | Count := 0; 128 | for I := 1 to Length(TextStr) do 129 | begin 130 | if IsWhiteSpace(TextStr[I]) then 131 | begin 132 | Inc(Count); 133 | Continue 134 | end; 135 | if Count <> 0 then 136 | begin 137 | Count := 0; 138 | Inc(J); 139 | Result[J] := ' '; 140 | end; 141 | 142 | Inc(J); 143 | Result[J] := TextStr[I]; 144 | end; 145 | 146 | if Count <> 0 then 147 | begin 148 | Inc(J); 149 | Result[J] := ' ' 150 | end; 151 | 152 | SetLength(Result, J); 153 | end; 154 | 155 | function Spaces(Count: Integer): WideString; 156 | var 157 | I: Integer; 158 | begin 159 | SetLength(Result, Count); 160 | for I := 1 to Count do Result[I] := ' '; 161 | end; 162 | 163 | function TrimLeftSpaces(const S: WideString): WideString; 164 | var 165 | I: Integer; 166 | begin 167 | I := 1; 168 | while (I <= Length(S)) and (Ord(S[I]) = SP) do Inc(I); 169 | Result := Copy(S, I, Length(S) - I + 1); 170 | end; 171 | 172 | constructor TStringBuilder.Create(ACapacity: Integer); 173 | begin 174 | inherited Create; 175 | FCapacity := ACapacity; 176 | SetLength(FValue, FCapacity); 177 | end; 178 | 179 | function TStringBuilder.EndWithWhiteSpace: Boolean; 180 | begin 181 | Result := IsWhiteSpace(FValue[FLength]); 182 | end; 183 | 184 | function TStringBuilder.TailMatch(const Tail: WideString): Boolean; 185 | var 186 | TailLen, I: Integer; 187 | begin 188 | Result := false; 189 | TailLen := System.Length(Tail); 190 | if TailLen > FLength then Exit; 191 | 192 | for I := 1 to TailLen do 193 | begin 194 | if FValue[FLength - TailLen + I] <> Tail[I] then Exit; 195 | end; 196 | 197 | Result := True; 198 | end; 199 | 200 | function TStringBuilder.ToString: WideString; 201 | begin 202 | SetLength(FValue, FLength); 203 | Result := FValue; 204 | end; 205 | 206 | procedure TStringBuilder.AppendText(const TextStr: WideString); 207 | var 208 | TextLen, I: Integer; 209 | begin 210 | if (FLength + System.Length(TextStr)) > FCapacity then 211 | begin 212 | FCapacity := 2 * FCapacity; 213 | SetLength(FValue, FCapacity) 214 | end; 215 | TextLen := System.Length(TextStr); 216 | for I := 1 to TextLen do FValue[FLength + I] := TextStr[I]; 217 | Inc(FLength, TextLen); 218 | end; 219 | 220 | constructor TBaseFormatter.Create; 221 | begin 222 | inherited Create; 223 | FWhatToShow := Integer(SHOW_ALL); 224 | 225 | FHTMLTagList := THTMLTagList.Create; 226 | end; 227 | 228 | destructor TBaseFormatter.Destroy; 229 | begin 230 | FreeAndNil(FHTMLTagList); 231 | 232 | inherited Destroy; 233 | end; 234 | 235 | procedure TBaseFormatter.ProcessNode(Node: TNode); 236 | begin 237 | case Node.NodeType of 238 | ELEMENT_NODE: ProcessElement(Node as TElement); 239 | TEXT_NODE: if (FWhatToShow and SHOW_TEXT) <> 0 then ProcessTextNode(Node as TTextNode); 240 | CDATA_SECTION_NODE: if (FWhatToShow and SHOW_CDATA_SECTION) <> 0 then ProcessCDataSection(Node as TCDataSection); 241 | ENTITY_REFERENCE_NODE: if (FWhatToShow and SHOW_ENTITY_REFERENCE) <> 0 then ProcessEntityReference(Node as TEntityReference); 242 | PROCESSING_INSTRUCTION_NODE: if (FWhatToShow and SHOW_PROCESSING_INSTRUCTION) <> 0 then ProcessProcessingInstruction(Node as TProcessingInstruction); 243 | COMMENT_NODE: if (FWhatToShow and SHOW_COMMENT) <> 0 then ProcessComment(Node as TComment); 244 | // NOTATION_NODE: if (FWhatToShow and SHOW_NOTATION) <> 0 then ProcessNotation(Node as Notation) 245 | end 246 | end; 247 | 248 | procedure TBaseFormatter.AppendNewLine; 249 | begin 250 | if FStringBuilder.Length > 0 then 251 | begin 252 | if not FStringBuilder.TailMatch(CRLF) then FStringBuilder.AppendText(CRLF) 253 | end; 254 | end; 255 | 256 | procedure TBaseFormatter.AppendParagraph; 257 | begin 258 | if FStringBuilder.Length > 0 then 259 | begin 260 | if not FStringBuilder.TailMatch(CRLF) then FStringBuilder.AppendText(PARAGRAPH_SEPARATOR) 261 | else if not FStringBuilder.TailMatch(PARAGRAPH_SEPARATOR) then FStringBuilder.AppendText(CRLF); 262 | end 263 | end; 264 | 265 | procedure TBaseFormatter.AppendText(const TextStr: WideString); 266 | begin 267 | FStringBuilder.AppendText(TextStr); 268 | end; 269 | 270 | procedure TBaseFormatter.ProcessAttribute(Attr: TAttr); 271 | var 272 | I: Integer; 273 | begin 274 | for I := 0 to Attr.ChildNodes.Count - 1 do ProcessNode(Attr.ChildNodes.Items[I]); 275 | end; 276 | 277 | procedure TBaseFormatter.ProcessAttributes(Element: TElement); 278 | var 279 | I: Integer; 280 | begin 281 | if (FWhatToShow and SHOW_ATTRIBUTE) <> 0 then 282 | begin 283 | FInAttributes := true; 284 | for I := 0 to Element.Attributes.Count - 1 do ProcessAttribute(Element.Attributes.Items[I] as TAttr); 285 | FInAttributes := False; 286 | end 287 | end; 288 | 289 | procedure TBaseFormatter.ProcessCDataSection(CDataSection: TCDataSection); 290 | begin 291 | // TODO 292 | end; 293 | 294 | procedure TBaseFormatter.ProcessComment(Comment: TComment); 295 | begin 296 | AppendText(''); 299 | end; 300 | 301 | procedure TBaseFormatter.ProcessDocumentElement; 302 | begin 303 | if Assigned(FDocument.DocumentElement) then 304 | begin 305 | FDepth := 0; 306 | ProcessElement(FDocument.DocumentElement) 307 | end; 308 | end; 309 | 310 | procedure TBaseFormatter.ProcessElement(Element: TElement); 311 | var 312 | I: Integer; 313 | begin 314 | Inc(FDepth); 315 | for I := 0 to Element.ChildNodes.Count - 1 do ProcessNode(Element.ChildNodes.Items[I]); 316 | Dec(FDepth) 317 | end; 318 | 319 | procedure TBaseFormatter.ProcessEntityReference(EntityReference: TEntityReference); 320 | begin 321 | if FExpandEntities then AppendText(GetEntValue(EntityReference.Name)) 322 | else AppendText('&' + EntityReference.Name + ';'); 323 | end; 324 | { 325 | procedure TBaseFormatter.ProcessNotation(Notation: TNotation); 326 | begin 327 | // TODO 328 | end; 329 | } 330 | procedure TBaseFormatter.ProcessProcessingInstruction(ProcessingInstruction: TProcessingInstruction); 331 | begin 332 | // TODO 333 | end; 334 | 335 | procedure TBaseFormatter.ProcessTextNode(TextNode: TTextNode); 336 | begin 337 | AppendText(TextNode.Data); 338 | end; 339 | 340 | function TBaseFormatter.getText(document: TDocument): WideString; 341 | begin 342 | FDocument := document; 343 | FStringBuilder := TStringBuilder.Create(65530); 344 | try 345 | ProcessDocumentElement; 346 | Result := FStringBuilder.ToString; 347 | finally 348 | FStringBuilder.Free; 349 | end 350 | end; 351 | 352 | constructor THtmlFormatter.Create; 353 | begin 354 | inherited Create; 355 | 356 | FIndent := 2; 357 | end; 358 | 359 | function THtmlFormatter.OnlyTextContent(Element: TElement): Boolean; 360 | var 361 | I: Integer; 362 | Node: TNode; 363 | begin 364 | Result := False; 365 | 366 | for I := 0 to Element.ChildNodes.Count - 1 do 367 | begin 368 | Node := Element.ChildNodes.Items[I]; 369 | if not (Node.NodeType in [TEXT_NODE, ENTITY_REFERENCE_NODE]) then Exit 370 | end; 371 | 372 | Result := True; 373 | end; 374 | 375 | procedure THtmlFormatter.ProcessAttribute(Attr: TAttr); 376 | begin 377 | if Attr.HasChildNodes then 378 | begin 379 | AppendText(' ' + Attr.Name + '="'); 380 | inherited ProcessAttribute(Attr); 381 | AppendText('"'); 382 | end 383 | else AppendText(' ' + Attr.Name + '="' + Attr.Name + '"'); 384 | end; 385 | 386 | procedure THtmlFormatter.ProcessComment(Comment: TComment); 387 | begin 388 | AppendNewLine; 389 | AppendText(Spaces(FIndent * FDepth)); 390 | 391 | inherited ProcessComment(Comment) 392 | end; 393 | 394 | procedure THtmlFormatter.ProcessElement(Element: TElement); 395 | var 396 | HTMLTag: THTMLTag; 397 | begin 398 | HTMLTag := FHTMLTagList.GetTagByName(Element.TagName); 399 | AppendNewLine; 400 | AppendText(Spaces(FIndent * FDepth)); 401 | AppendText('<' + Element.TagName); 402 | ProcessAttributes(Element); 403 | 404 | if Element.HasChildNodes then 405 | begin 406 | AppendText('>'); 407 | 408 | if HTMLTag.Number in PreserveWhiteSpaceTags then FPreserveWhiteSpace := True; 409 | inherited ProcessElement(Element); 410 | FPreserveWhiteSpace := False; 411 | 412 | if not OnlyTextContent(Element) then 413 | begin 414 | AppendNewLine; 415 | AppendText(Spaces(FIndent * FDepth)) 416 | end; 417 | 418 | AppendText('') 419 | end 420 | else AppendText(' />'); 421 | end; 422 | 423 | procedure THtmlFormatter.ProcessTextNode(TextNode: TTextNode); 424 | var 425 | TextStr: WideString; 426 | begin 427 | if FPreserveWhiteSpace then AppendText(TextNode.Data) 428 | else 429 | begin 430 | TextStr := normalizeWhiteSpace(TextNode.Data); 431 | if TextStr <> ' ' then AppendText(TextStr) 432 | end; 433 | end; 434 | 435 | constructor TTextFormatter.Create; 436 | begin 437 | inherited Create; 438 | FWhatToShow := SHOW_ELEMENT or SHOW_TEXT or SHOW_ENTITY_REFERENCE; 439 | FExpandEntities := True; 440 | end; 441 | 442 | function TTextFormatter.GetAnchorText(Node: TElement): WideString; 443 | var 444 | Attr: TAttr; 445 | begin 446 | Result := ''; 447 | 448 | if Node.HasAttribute('href') then 449 | begin 450 | Attr := Node.GetAttributeNode('href'); 451 | Result := ' '; 452 | if UrlSchemes.GetScheme(Attr.Value) = '' then Result := Result + 'http://'; 453 | Result := Result + Attr.Value; 454 | end 455 | end; 456 | 457 | function TTextFormatter.GetImageText(Node: TElement): WideString; 458 | begin 459 | if Node.HasAttribute('alt') then Result := Node.GetAttributeNode('alt').Value 460 | else Result := ''; 461 | end; 462 | 463 | procedure TTextFormatter.AppendText(const TextStr: WideString); 464 | begin 465 | if (FStringBuilder.Length = 0) 466 | or FStringBuilder.EndWithWhiteSpace then 467 | begin 468 | inherited AppendText(TrimLeftSpaces(TextStr)) 469 | end 470 | else 471 | begin 472 | inherited AppendText(TextStr); 473 | end; 474 | end; 475 | 476 | procedure TTextFormatter.ProcessElement(Element: TElement); 477 | var 478 | HTMLTag: THTMLTag; 479 | begin 480 | HTMLTag := FHTMLTagList.GetTagByName(Element.TagName); 481 | if HTMLTag.Number in ViewAsBlockTags then AppendParagraph; 482 | 483 | case HTMLTag.Number of 484 | A_TAG: FInsideAnchor := True; 485 | LI_TAG: AppendText('* '); 486 | end; 487 | 488 | if HTMLTag.Number in PreserveWhiteSpaceTags then FPreserveWhiteSpace := True; 489 | inherited ProcessElement(Element); 490 | FPreserveWhiteSpace := False; 491 | 492 | case HTMLTag.Number of 493 | BR_TAG: 494 | AppendNewLine; 495 | 496 | A_TAG: 497 | begin 498 | AppendText(GetAnchorText(Element)); 499 | FInsideAnchor := false 500 | end; 501 | 502 | IMG_TAG: 503 | begin 504 | if FInsideAnchor then AppendText(GetImageText(Element)); 505 | end 506 | end; 507 | 508 | if HTMLTag.Number in ViewAsBlockTags then AppendParagraph; 509 | end; 510 | 511 | procedure TTextFormatter.ProcessEntityReference(EntityReference: TEntityReference); 512 | begin 513 | if EntityReference.Name = 'nbsp' then AppendText(' ') 514 | else inherited ProcessEntityReference(EntityReference); 515 | end; 516 | 517 | procedure TTextFormatter.ProcessTextNode(TextNode: TTextNode); 518 | begin 519 | if FPreserveWhiteSpace then AppendText(TextNode.Data) 520 | else AppendText(normalizeWhiteSpace(TextNode.Data)); 521 | end; 522 | 523 | end. 524 | -------------------------------------------------------------------------------- /HTMLp.HTMLParser.pas: -------------------------------------------------------------------------------- 1 | unit HTMLp.HTMLParser; 2 | 3 | interface 4 | 5 | uses 6 | HTMLp.DomCore, HTMLp.HTMLReader, HTMLp.HTMLTags; 7 | 8 | type 9 | THTMLParser = class 10 | private 11 | FHTMLDocument: TDocument; 12 | FHTMLReader: THTMLReader; 13 | FHTMLTagList: THTMLTagList; 14 | FCurrentNode: TNode; 15 | FCurrentTag: THTMLTag; 16 | function FindDefParent: TElement; 17 | function FindParent: TElement; 18 | function FindParentElement(tagList: THTMLTagSet): TElement; 19 | function FindTableParent: TElement; 20 | function FindThisElement: TElement; 21 | function GetMainElement(const tagName: WideString): TElement; 22 | procedure ProcessAttributeEnd(Sender: TObject); 23 | procedure ProcessAttributeStart(Sender: TObject); 24 | procedure ProcessCDataSection(Sender: TObject); 25 | procedure ProcessComment(Sender: TObject); 26 | procedure ProcessDocType(Sender: TObject); 27 | procedure ProcessElementEnd(Sender: TObject); 28 | procedure ProcessElementStart(Sender: TObject); 29 | procedure ProcessEndElement(Sender: TObject); 30 | procedure ProcessEntityReference(Sender: TObject); 31 | procedure ProcessScript(Sender: TObject); 32 | procedure ProcessTextNode(Sender: TObject); 33 | public 34 | constructor Create; 35 | destructor Destroy; override; 36 | 37 | function ParseString(const htmlStr: WideString): TDocument; 38 | property HTMLDocument: TDocument read FHTMLDocument; 39 | end; 40 | 41 | implementation 42 | 43 | const 44 | htmlTagName = 'html'; 45 | headTagName = 'head'; 46 | bodyTagName = 'body'; 47 | 48 | constructor THTMLParser.Create; 49 | begin 50 | inherited Create; 51 | 52 | FHTMLTagList := THTMLTagList.Create; 53 | FHTMLReader := THTMLReader.Create; 54 | 55 | with FHTMLReader do 56 | begin 57 | OnAttributeEnd := ProcessAttributeEnd; 58 | OnAttributeStart := ProcessAttributeStart; 59 | OnCDataSection := ProcessCDataSection; 60 | OnComment := ProcessComment; 61 | OnDocType := ProcessDocType; 62 | OnElementEnd := ProcessElementEnd; 63 | OnElementStart := ProcessElementStart; 64 | OnEndElement := ProcessEndElement; 65 | OnEntityReference := ProcessEntityReference; 66 | OnScript := ProcessScript; 67 | //OnNotation := ProcessNotation; 68 | //OnProcessingInstruction := ProcessProcessingInstruction; 69 | OnTextNode := ProcessTextNode; 70 | end 71 | end; 72 | 73 | destructor THTMLParser.Destroy; 74 | begin 75 | FHTMLReader.Free; 76 | FHTMLTagList.Free; 77 | 78 | inherited Destroy; 79 | end; 80 | 81 | function THTMLParser.FindDefParent: TElement; 82 | begin 83 | if FCurrentTag.Number in [HEAD_TAG, BODY_TAG] then Result := FHTMLDocument.AppendChild(FHTMLDocument.CreateElement(htmlTagName)) as TElement 84 | else if FCurrentTag.Number in HeadTags then Result := GetMainElement(headTagName) 85 | else Result := GetMainElement(bodyTagName); 86 | end; 87 | 88 | function THTMLParser.FindParent: TElement; 89 | begin 90 | if (FCurrentTag.Number = P_TAG) or (FCurrentTag.Number in BlockTags) then Result := FindParentElement(BlockParentTags) 91 | else if FCurrentTag.Number in [LI_TAG] then Result := FindParentElement(ListItemParentTags) 92 | else if FCurrentTag.Number in [DD_TAG, DT_TAG] then Result := FindParentElement(DefItemParentTags) 93 | else if FCurrentTag.Number in [TD_TAG, TH_TAG] then Result := FindParentElement(CellParentTags) 94 | else if FCurrentTag.Number in [TR_TAG] then Result := FindParentElement(RowParentTags) 95 | else if FCurrentTag.Number in [COL_TAG] then Result := FindParentElement(ColParentTags) 96 | else if FCurrentTag.Number in [COLGROUP_TAG, THEAD_TAG, TFOOT_TAG, TBODY_TAG] then Result := FindParentElement(TableSectionParentTags) 97 | else if FCurrentTag.Number in [TABLE_TAG] then Result := FindTableParent 98 | else if FCurrentTag.Number in [OPTION_TAG] then Result := FindParentElement(OptionParentTags) 99 | else if FCurrentTag.Number in [HEAD_TAG, BODY_TAG] then Result := FHTMLDocument.DocumentElement as TElement 100 | else Result := nil; 101 | 102 | if Result = nil then Result := FindDefParent; 103 | end; 104 | 105 | function THTMLParser.FindParentElement(tagList: THTMLTagSet): TElement; 106 | var 107 | Node: TNode; 108 | HTMLTag: THTMLTag; 109 | begin 110 | Node := FCurrentNode; 111 | 112 | while Node.NodeType = ELEMENT_NODE do 113 | begin 114 | HTMLTag := FHTMLTagList.GetTagByName(Node.Name); 115 | 116 | if HTMLTag.Number in tagList then 117 | begin 118 | Result := Node as TElement; 119 | Exit; 120 | end; 121 | 122 | Node := Node.ParentNode; 123 | end; 124 | 125 | Result := nil; 126 | end; 127 | 128 | function THTMLParser.FindTableParent: TElement; 129 | var 130 | Node: TNode; 131 | HTMLTag: THTMLTag; 132 | begin 133 | Node := FCurrentNode; 134 | 135 | while Node.NodeType = ELEMENT_NODE do 136 | begin 137 | HTMLTag := FHTMLTagList.GetTagByName(Node.Name); 138 | 139 | if (HTMLTag.Number = TD_TAG) or (HTMLTag.Number in BlockTags) then 140 | begin 141 | Result := (Node as TElement); 142 | Exit; 143 | end; 144 | 145 | Node := Node.ParentNode; 146 | end; 147 | 148 | Result := GetMainElement(bodyTagName); 149 | end; 150 | 151 | function THTMLParser.FindThisElement: TElement; 152 | var 153 | Node: TNode; 154 | begin 155 | Node := FCurrentNode; 156 | 157 | while Node.NodeType = ELEMENT_NODE do 158 | begin 159 | Result := (Node as TElement); 160 | if Result.TagName = FHTMLReader.Name then Exit; 161 | 162 | Node := Node.ParentNode; 163 | end; 164 | 165 | Result := nil; 166 | end; 167 | 168 | function THTMLParser.GetMainElement(const tagName: WideString): TElement; 169 | var 170 | child: TNode; 171 | I: Integer; 172 | begin 173 | if (FHTMLDocument.DocumentElement = nil) then FHTMLDocument.AppendChild(FHTMLDocument.CreateElement(htmlTagName)); 174 | 175 | for I := 0 to FHTMLDocument.DocumentElement.ChildNodes.Count - 1 do 176 | begin 177 | child := FHTMLDocument.DocumentElement.ChildNodes.Items[I]; 178 | 179 | if (child.NodeType = ELEMENT_NODE) and (child.Name = tagName) then 180 | begin 181 | Result := (child as TElement); 182 | 183 | Exit 184 | end 185 | end; 186 | 187 | Result := FHTMLDocument.CreateElement(tagName); 188 | FHTMLDocument.DocumentElement.AppendChild(Result); 189 | end; 190 | 191 | procedure THTMLParser.ProcessAttributeEnd(Sender: TObject); 192 | begin 193 | FCurrentNode := (FCurrentNode as TAttr).OwnerElement; 194 | end; 195 | 196 | procedure THTMLParser.ProcessAttributeStart(Sender: TObject); 197 | var 198 | newAttrName: string; 199 | Attr: TAttr; 200 | begin 201 | newAttrName := (Sender as THTMLReader).Name; 202 | Attr := (FCurrentNode as TElement).GetAttributeNode(newAttrName); 203 | 204 | if not (Assigned(Attr)) then 205 | begin 206 | Attr := FHTMLDocument.CreateAttribute(newAttrName); 207 | (FCurrentNode as TElement).SetAttributeNode(Attr); 208 | end; 209 | 210 | FCurrentNode := Attr; 211 | end; 212 | 213 | procedure THTMLParser.ProcessCDataSection(Sender: TObject); 214 | var 215 | CDataSection: TCDataSection; 216 | begin 217 | CDataSection := FHTMLDocument.CreateCDATASection(FHTMLReader.NodeValue); 218 | FCurrentNode.AppendChild(CDataSection) 219 | end; 220 | 221 | procedure THTMLParser.ProcessComment(Sender: TObject); 222 | var 223 | Comment: TComment; 224 | begin 225 | Comment := FHTMLDocument.CreateComment(FHTMLReader.NodeValue); 226 | FCurrentNode.AppendChild(Comment); 227 | end; 228 | 229 | procedure THTMLParser.ProcessDocType(Sender: TObject); 230 | begin 231 | with FHTMLReader do FHTMLDocument.Doctype := DomImplementation.CreateDocumentType(Name, PublicID, SystemID); 232 | end; 233 | 234 | procedure THTMLParser.ProcessElementEnd(Sender: TObject); 235 | begin 236 | if FHTMLReader.isEmptyElement 237 | or (FCurrentTag.Number in EmptyTags) then FCurrentNode := FCurrentNode.ParentNode; 238 | 239 | FCurrentTag := nil; 240 | end; 241 | 242 | procedure THTMLParser.ProcessElementStart(Sender: TObject); 243 | var 244 | Element: TElement; 245 | Parent: TNode; 246 | begin 247 | FCurrentTag := FHTMLTagList.GetTagByName(FHTMLReader.Name); 248 | if FCurrentTag.Number in (NeedFindParentTags + BlockTags) then 249 | begin 250 | Parent := FindParent; 251 | if not Assigned(Parent) then raise DomException.Create(HIERARCHY_REQUEST_ERR); 252 | FCurrentNode := Parent; 253 | end; 254 | 255 | Element := FHTMLDocument.CreateElement(FHTMLReader.Name); 256 | FCurrentNode.AppendChild(Element); 257 | FCurrentNode := Element; 258 | end; 259 | 260 | procedure THTMLParser.ProcessEndElement(Sender: TObject); 261 | var 262 | Element: TElement; 263 | begin 264 | Element := FindThisElement; 265 | if Assigned(Element) then FCurrentNode := Element.ParentNode 266 | // else if IsBlockTagName(FHtmlReader.nodeName) then raise DomException.Create(HIERARCHY_REQUEST_ERR); 267 | end; 268 | 269 | procedure THTMLParser.ProcessEntityReference(Sender: TObject); 270 | var 271 | EntityReference: TEntityReference; 272 | begin 273 | EntityReference := FHTMLDocument.CreateEntityReference(FHTMLReader.Name); 274 | FCurrentNode.AppendChild(EntityReference); 275 | end; 276 | 277 | procedure THtmlParser.ProcessScript(Sender: TObject); 278 | var 279 | Script: TScript; 280 | begin 281 | Script := FHTMLDocument.CreateScript(FHTMLReader.NodeValue); 282 | FCurrentNode.AppendChild(Script); 283 | end; 284 | 285 | procedure THtmlParser.ProcessTextNode(Sender: TObject); 286 | var 287 | TextNode: TTextNode; 288 | begin 289 | TextNode := FHTMLDocument.CreateTextNode(FHTMLReader.NodeValue); 290 | FCurrentNode.AppendChild(TextNode); 291 | end; 292 | 293 | function THTMLParser.ParseString(const htmlStr: WideString): TDocument; 294 | begin 295 | FHTMLReader.HTMLStr := htmlStr; 296 | FHTMLDocument := DomImplementation.CreateEmptyDocument(nil); 297 | FCurrentNode := FHTMLDocument; 298 | try 299 | while FHTMLReader.Read do; 300 | except 301 | // TODO: Add event ? 302 | end; 303 | 304 | Result := FHTMLDocument; 305 | end; 306 | 307 | end. 308 | -------------------------------------------------------------------------------- /HTMLp.HTMLReader.pas: -------------------------------------------------------------------------------- 1 | unit HTMLp.HTMLReader; 2 | 3 | interface 4 | 5 | uses 6 | Classes, 7 | 8 | HTMLp.DomCore; 9 | 10 | type 11 | TDelimiters = set of Byte; 12 | TReaderState = (rsInitial, rsBeforeAttr, rsBeforeValue, rsInValue, rsInQuotedValue); 13 | 14 | THTMLReader = class 15 | private 16 | FHTMLStr: string; 17 | FPosition: Integer; 18 | FNodeType: Integer; 19 | FPrefix: string; 20 | FLocalName: string; 21 | FNodeValue: string; 22 | FPublicID: string; 23 | FSystemID: string; 24 | FIsEmptyElement: Boolean; 25 | FState: TReaderState; 26 | FQuotation: Word; 27 | FOnAttributeEnd: TNotifyEvent; 28 | FOnAttributeStart: TNotifyEvent; 29 | FOnCDataSection: TNotifyEvent; 30 | FOnComment: TNotifyEvent; 31 | FOnDocType: TNotifyEvent; 32 | FOnElementEnd: TNotifyEvent; 33 | FOnElementStart: TNotifyEvent; 34 | FOnEndElement: TNotifyEvent; 35 | FOnEntityReference: TNotifyEvent; 36 | FOnNotation: TNotifyEvent; 37 | FOnProcessingInstruction: TNotifyEvent; 38 | FOnScript: TNotifyEvent; 39 | FOnTextNode: TNotifyEvent; 40 | function GetNodeName: string; 41 | function GetToken(Delimiters: TDelimiters): string; 42 | function IsAttrTextChar: Boolean; 43 | function IsDigit(HexBase: Boolean): Boolean; 44 | function IsEndEntityChar: Boolean; 45 | function IsEntityChar: Boolean; 46 | function IsEqualChar: Boolean; 47 | function IsHexEntityChar: Boolean; 48 | function IsNumericEntity: Boolean; 49 | function IsQuotation: Boolean; 50 | function IsSlashChar: Boolean; 51 | function IsSpecialTagChar: Boolean; 52 | function IsStartCharacterData: Boolean; 53 | function IsStartComment: Boolean; 54 | function IsStartDocumentType: Boolean; 55 | function IsStartEntityChar: Boolean; 56 | function IsStartMarkupChar: Boolean; 57 | function IsStartScript: Boolean; 58 | function IsStartTagChar: Boolean; 59 | function Match(const Signature: string; IgnoreCase: Boolean): Boolean; 60 | function ReadAttrNode: Boolean; 61 | function ReadAttrTextNode: Boolean; 62 | function ReadCharacterData: Boolean; 63 | function ReadComment: Boolean; 64 | function ReadDocumentType: Boolean; 65 | function ReadElementNode: Boolean; 66 | function ReadEndElementNode: Boolean; 67 | function ReadEntityNode: Boolean; 68 | function ReadNamedEntityNode: Boolean; 69 | function ReadNumericEntityNode: Boolean; 70 | function ReadQuotedValue(var Value: string): Boolean; 71 | function ReadScript: Boolean; 72 | function ReadSpecialNode: Boolean; 73 | function ReadTagNode: Boolean; 74 | function ReadValueNode: Boolean; 75 | function SkipTo(const Signature: string): Boolean; 76 | procedure FireEvent(Event: TNotifyEvent); 77 | procedure ReadElementTail; 78 | procedure ReadTextNode; 79 | procedure SetHTMLStr(const Value: string); 80 | procedure SetNodeName(Value: string); 81 | procedure SkipWhiteSpaces; 82 | public 83 | constructor Create; 84 | function Read: Boolean; 85 | 86 | property HTMLStr: string read FHTMLStr write SetHTMLStr; 87 | property isEmptyElement: Boolean read FIsEmptyElement; 88 | property LocalName: string read FLocalName; 89 | property Name: string read GetNodeName; 90 | property NodeType: Integer read FNodeType; 91 | property Position: Integer read FPosition; 92 | property Prefix: string read FPrefix; 93 | property PublicID: string read FPublicID; 94 | property State: TReaderState read FState; 95 | property SystemID: string read FSystemID; 96 | property NodeValue: string read FNodeValue; 97 | property OnAttributeEnd: TNotifyEvent read FOnAttributeEnd write FOnAttributeEnd; 98 | property OnAttributeStart: TNotifyEvent read FOnAttributeStart write FOnAttributeStart; 99 | property OnCDataSection: TNotifyEvent read FOnCDataSection write FOnCDataSection; 100 | property OnComment: TNotifyEvent read FOnComment write FOnComment; 101 | property OnDocType: TNotifyEvent read FOnDocType write FOnDocType; 102 | property OnElementEnd: TNotifyEvent read FOnElementEnd write FOnElementEnd; 103 | property OnElementStart: TNotifyEvent read FOnElementStart write FOnElementStart; 104 | property OnEndElement: TNotifyEvent read FOnEndElement write FOnEndElement; 105 | property OnEntityReference: TNotifyEvent read FOnEntityReference write FOnEntityReference; 106 | property OnNotation: TNotifyEvent read FOnNotation write FOnNotation; 107 | property OnProcessingInstruction: TNotifyEvent read FOnProcessingInstruction write FOnProcessingInstruction; 108 | property OnScript: TNotifyEvent read FOnScript write FOnScript; 109 | property OnTextNode: TNotifyEvent read FOnTextNode write FOnTextNode; 110 | end; 111 | 112 | implementation 113 | 114 | uses 115 | SysUtils; 116 | 117 | const 118 | startTagChar = Ord('<'); 119 | endTagChar = Ord('>'); 120 | specialTagChar = Ord('!'); 121 | slashChar = Ord('/'); 122 | equalChar = Ord('='); 123 | quotation = [Ord(''''), Ord('"')]; 124 | tagDelimiter = [slashChar, endTagChar]; 125 | tagNameDelimiter = whiteSpace + tagDelimiter; 126 | attrNameDelimiter = tagNameDelimiter + [equalChar]; 127 | startEntity = Ord('&'); 128 | startMarkup = [startTagChar, startEntity]; 129 | endEntity = Ord(';'); 130 | notEntity = [endEntity] + startMarkup + whiteSpace; 131 | notAttrText = whiteSpace + quotation + tagDelimiter; 132 | numericEntity = Ord('#'); 133 | hexEntity = [Ord('x'), Ord('X')]; 134 | decDigit = [Ord('0')..Ord('9')]; 135 | hexDigit = [Ord('a')..Ord('f'), Ord('A')..Ord('F')]; 136 | 137 | DocTypeStartStr = 'DOCTYPE'; 138 | DocTypeEndStr = '>'; 139 | CDataStartStr = '[CDATA['; 140 | CDataEndStr = ']]>'; 141 | CommentStartStr = '--'; 142 | CommentEndStr = '-->'; 143 | ScriptStartStr = 'script'; 144 | ScriptEndStr = ''; 145 | 146 | function DecValue(const Digit: WideChar): Word; 147 | begin 148 | Result := Ord(Digit) - Ord('0') 149 | end; 150 | 151 | function HexValue(const HexChar: WideChar): Word; 152 | var 153 | C: Char; 154 | begin 155 | if Ord(HexChar) in decDigit then Result := Ord(HexChar) - Ord('0') 156 | else 157 | begin 158 | C := UpCase(Chr(Ord(HexChar))); 159 | Result := Ord(C) - Ord('A'); 160 | end 161 | end; 162 | 163 | constructor THTMLReader.Create; 164 | begin 165 | inherited Create; 166 | 167 | FHTMLStr := HTMLStr; 168 | FPosition := 1 169 | end; 170 | 171 | function THTMLReader.GetNodeName: string; 172 | begin 173 | if FPrefix <> '' then Result := FPrefix + ':' + FLocalName 174 | else Result := FLocalName; 175 | end; 176 | 177 | function THTMLReader.GetToken(Delimiters: TDelimiters): string; 178 | var 179 | Start: Integer; 180 | begin 181 | Start := FPosition; 182 | while (FPosition <= Length(FHTMLStr)) and not (Ord(FHTMLStr[FPosition]) in Delimiters) do Inc(FPosition); 183 | Result := Copy(FHTMLStr, Start, FPosition - Start); 184 | end; 185 | 186 | function THTMLReader.IsAttrTextChar: Boolean; 187 | var 188 | WC: WideChar; 189 | begin 190 | WC := FHTMLStr[FPosition]; 191 | if FState = rsInQuotedValue then Result := (Ord(WC) <> FQuotation) and (Ord(WC) <> startEntity) 192 | else Result := not (Ord(WC) in notAttrText); 193 | end; 194 | 195 | function THTMLReader.IsDigit(HexBase: Boolean): Boolean; 196 | var 197 | WC: WideChar; 198 | begin 199 | WC := FHTMLStr[FPosition]; 200 | Result := Ord(WC) in decDigit; 201 | if not Result and HexBase then Result := (Ord(WC) in hexDigit); 202 | end; 203 | 204 | function THTMLReader.IsEndEntityChar: Boolean; 205 | var 206 | WC: WideChar; 207 | begin 208 | WC := FHTMLStr[FPosition]; 209 | Result := (Ord(WC) = endEntity); 210 | end; 211 | 212 | function THTMLReader.IsEntityChar: Boolean; 213 | var 214 | WC: WideChar; 215 | begin 216 | WC := FHTMLStr[FPosition]; 217 | Result := not (Ord(WC) in notEntity); 218 | end; 219 | 220 | function THTMLReader.IsEqualChar: Boolean; 221 | var 222 | WC: WideChar; 223 | begin 224 | WC := FHTMLStr[FPosition]; 225 | Result := (Ord(WC) = equalChar); 226 | end; 227 | 228 | function THTMLReader.IsHexEntityChar: Boolean; 229 | var 230 | WC: WideChar; 231 | begin 232 | WC := FHTMLStr[FPosition]; 233 | Result := (Ord(WC) in hexEntity); 234 | end; 235 | 236 | function THTMLReader.IsNumericEntity: Boolean; 237 | var 238 | WC: WideChar; 239 | begin 240 | WC := FHTMLStr[FPosition]; 241 | Result := (Ord(WC) = numericEntity); 242 | end; 243 | 244 | function THTMLReader.IsQuotation: Boolean; 245 | var 246 | WC: WideChar; 247 | begin 248 | WC := FHTMLStr[FPosition]; 249 | if FQuotation = 0 then Result := (Ord(WC) in quotation) 250 | else Result := (Ord(WC) = FQuotation); 251 | end; 252 | 253 | function THTMLReader.IsSlashChar: Boolean; 254 | var 255 | WC: WideChar; 256 | begin 257 | WC := FHTMLStr[FPosition]; 258 | Result := (Ord(WC) = slashChar); 259 | end; 260 | 261 | function THTMLReader.IsSpecialTagChar: Boolean; 262 | var 263 | WC: WideChar; 264 | begin 265 | WC := FHTMLStr[FPosition]; 266 | Result := (Ord(WC) = specialTagChar); 267 | end; 268 | 269 | function THTMLReader.IsStartCharacterData: Boolean; 270 | begin 271 | Result := Match(CDataStartStr, False); 272 | end; 273 | 274 | function THTMLReader.IsStartComment: Boolean; 275 | begin 276 | Result := Match(CommentStartStr, False); 277 | end; 278 | 279 | function THTMLReader.IsStartDocumentType: Boolean; 280 | begin 281 | Result := Match(DocTypeStartStr, True); 282 | end; 283 | 284 | function THTMLReader.IsStartEntityChar: Boolean; 285 | var 286 | WC: WideChar; 287 | begin 288 | WC := FHTMLStr[FPosition]; 289 | Result := (Ord(WC) = startEntity); 290 | end; 291 | 292 | function THTMLReader.IsStartMarkupChar: Boolean; 293 | var 294 | WC: WideChar; 295 | begin 296 | WC := FHTMLStr[FPosition]; 297 | Result := (Ord(WC) in startMarkup); 298 | end; 299 | 300 | function THTMLReader.IsStartScript: Boolean; 301 | begin 302 | Result := Match(ScriptStartStr, true); 303 | end; 304 | 305 | function THTMLReader.IsStartTagChar: Boolean; 306 | var 307 | WC: WideChar; 308 | begin 309 | WC := FHTMLStr[FPosition]; 310 | Result := (Ord(WC) = startTagChar); 311 | end; 312 | 313 | function THTMLReader.Match(const Signature: string; IgnoreCase: Boolean): Boolean; 314 | var 315 | I, J: Integer; 316 | W1, W2: WideChar; 317 | begin 318 | Result := False; 319 | 320 | for I := 1 to Length(Signature) do 321 | begin 322 | J := FPosition + I - 1; 323 | if (J < 1) or (J > Length(FHTMLStr)) then Exit; 324 | W1 := Signature[I]; 325 | W2 := FHTMLStr[J]; 326 | if (W1 <> W2) 327 | and (not IgnoreCase or (UpperCase(W1) <> UpperCase(W2))) then Exit; 328 | end; 329 | 330 | Result := True; 331 | end; 332 | 333 | function THTMLReader.ReadAttrNode: Boolean; 334 | var 335 | AttrName: string; 336 | begin 337 | Result := False; 338 | SkipWhiteSpaces; 339 | AttrName := LowerCase(GetToken(attrNameDelimiter)); 340 | if AttrName = '' then Exit; 341 | 342 | SetNodeName(AttrName); 343 | FireEvent(FOnAttributeStart); 344 | 345 | FState := rsBeforeValue; 346 | FQuotation := 0; 347 | 348 | Result := True; 349 | end; 350 | 351 | function THTMLReader.ReadAttrTextNode: Boolean; 352 | var 353 | Start: Integer; 354 | begin 355 | Result := False; 356 | Start := FPosition; 357 | 358 | while (FPosition <= Length(FHTMLStr)) and (IsAttrTextChar) do Inc(FPosition); 359 | if FPosition = Start then Exit; 360 | 361 | FNodeType := TEXT_NODE; 362 | FNodeValue:= Copy(FHTMLStr, Start, FPosition - Start); 363 | FireEvent(FOnTextNode); 364 | 365 | Result := True; 366 | end; 367 | 368 | function THTMLReader.ReadCharacterData: Boolean; 369 | var 370 | StartPos: Integer; 371 | begin 372 | Inc(FPosition, Length(CDataStartStr)); 373 | StartPos := FPosition; 374 | Result := SkipTo(CDataEndStr); 375 | if Result then 376 | begin 377 | FNodeType := CDATA_SECTION_NODE; 378 | FNodeValue := Copy(FHTMLStr, StartPos, FPosition - StartPos - Length(CDataEndStr)); 379 | FireEvent(FOnCDataSection) 380 | end 381 | end; 382 | 383 | function THTMLReader.ReadComment: Boolean; 384 | var 385 | StartPos: Integer; 386 | begin 387 | Inc(FPosition, Length(CommentStartStr)); 388 | StartPos := FPosition; 389 | Result := SkipTo(CommentEndStr); 390 | if Result then 391 | begin 392 | FNodeType := COMMENT_NODE; 393 | FNodeValue := Copy(FHTMLStr, StartPos, FPosition - StartPos - Length(CommentEndStr)); 394 | FireEvent(FOnComment); 395 | end; 396 | end; 397 | 398 | function THTMLReader.ReadDocumentType: Boolean; 399 | var 400 | Name: string; 401 | begin 402 | Result := False; 403 | 404 | Inc(FPosition, Length(DocTypeStartStr)); 405 | SkipWhiteSpaces; 406 | Name := GetToken(tagNameDelimiter); 407 | if Name = '' then Exit; 408 | 409 | SetNodeName(Name); 410 | SkipWhiteSpaces; 411 | GetToken(tagNameDelimiter); 412 | SkipWhiteSpaces; 413 | if ((FHTMLStr[FPosition] = '"') or (FHTMLStr[FPosition] = '''')) and not (ReadQuotedValue(FPublicID)) then Exit; 414 | 415 | SkipWhiteSpaces; 416 | if FHTMLStr[FPosition] = '"' then 417 | begin 418 | if not ReadQuotedValue(FSystemID) then Exit 419 | end; 420 | 421 | Result := SkipTo(DocTypeEndStr); 422 | end; 423 | 424 | function THTMLReader.ReadElementNode: Boolean; 425 | var 426 | TagName: string; 427 | begin 428 | Result := False; 429 | 430 | if FPosition < Length(FHTMLStr) then 431 | begin 432 | TagName := LowerCase(GetToken(tagNameDelimiter)); 433 | if TagName = '' then Exit; 434 | 435 | FNodeType := ELEMENT_NODE; 436 | SetNodeName(TagName); 437 | 438 | FState := rsBeforeAttr; 439 | FireEvent(FOnElementStart); 440 | Result := True; 441 | end; 442 | end; 443 | 444 | function THTMLReader.ReadEndElementNode: Boolean; 445 | var 446 | TagName: string; 447 | begin 448 | Result := False; 449 | 450 | Inc(FPosition); 451 | if FPosition > Length(FHTMLStr) then Exit; 452 | 453 | TagName := LowerCase(GetToken(tagNameDelimiter)); 454 | if TagName = '' then Exit; 455 | 456 | Result := SkipTo(WideChar(endTagChar)); 457 | if Result then 458 | begin 459 | FNodeType := END_ELEMENT_NODE; 460 | SetNodeName(TagName); 461 | FireEvent(FOnEndElement); 462 | Result := True; 463 | end; 464 | end; 465 | 466 | function THTMLReader.ReadEntityNode: Boolean; 467 | var 468 | CurrPos: Integer; 469 | begin 470 | Result := False; 471 | CurrPos := FPosition; 472 | Inc(FPosition); 473 | if FPosition > Length(FHTMLStr) then Exit; 474 | 475 | if IsNumericEntity then 476 | begin 477 | Inc(FPosition); 478 | Result := ReadNumericEntityNode; 479 | end 480 | else Result := ReadNamedEntityNode; 481 | 482 | if Result then 483 | begin 484 | FNodeType := ENTITY_REFERENCE_NODE; 485 | // FireEvent(FOnEntityReference); VVV - remove, entity node is added in ReadXXXEntityNode 486 | end 487 | else FPosition := CurrPos; 488 | end; 489 | 490 | function THTMLReader.ReadNamedEntityNode: Boolean; 491 | var 492 | Start: Integer; 493 | begin 494 | Result := False; 495 | if FPosition > Length(FHTMLStr) then Exit; 496 | 497 | Start := FPosition; 498 | while (FPosition <= Length(FHTMLStr)) and IsEntityChar do Inc(FPosition); 499 | if (FPosition > Length(FHTMLStr)) or not IsEndEntityChar then Exit; 500 | 501 | FNodeType := ENTITY_REFERENCE_NODE; 502 | SetNodeName(Copy(FHTMLStr, Start, FPosition - Start)); 503 | Inc(FPosition); 504 | 505 | FireEvent(FOnEntityReference); 506 | Result := True; 507 | end; 508 | 509 | function THTMLReader.ReadNumericEntityNode: Boolean; 510 | var 511 | Value: Word; 512 | HexBase: Boolean; 513 | begin 514 | Result := False; 515 | if FPosition > Length(FHTMLStr) then Exit; 516 | 517 | HexBase := IsHexEntityChar; 518 | if HexBase then Inc(FPosition); 519 | 520 | Value := 0; 521 | while (FPosition <= Length(FHTMLStr)) and IsDigit(HexBase) do 522 | begin 523 | try 524 | if HexBase then Value := Value * 16 + HexValue(FHTMLStr[FPosition]) 525 | else Value := Value * 10 + DecValue(FHTMLStr[FPosition]) 526 | except 527 | Exit 528 | end; 529 | 530 | Inc(FPosition); 531 | end; 532 | 533 | if (FPosition > Length(FHTMLStr)) or not IsEndEntityChar then Exit; 534 | Inc(FPosition); 535 | FNodeType := TEXT_NODE; 536 | FNodeValue := WideChar(Value); 537 | FireEvent(FOnTextNode); 538 | Result := True; 539 | end; 540 | 541 | function THTMLReader.ReadQuotedValue(var Value: string): Boolean; 542 | var 543 | QuotedChar: WideChar; 544 | Start: Integer; 545 | begin 546 | QuotedChar := FHTMLStr[FPosition]; 547 | Inc(FPosition); 548 | Start := FPosition; 549 | Result := SkipTo(QuotedChar); 550 | if Result then Value := Copy(FHTMLStr, Start, FPosition - Start); 551 | end; 552 | 553 | function THTMLReader.ReadScript: Boolean; 554 | var 555 | StartPos: Integer; 556 | begin 557 | Inc(FPosition, Length(ScriptStartStr)); 558 | StartPos := FPosition; 559 | Result := SkipTo(ScriptEndStr); 560 | if Result then 561 | begin 562 | FNodeType := SCRIPT_NODE; 563 | FNodeValue := Copy(FHtmlStr, StartPos, FPosition - StartPos - Length(ScriptEndStr)); 564 | FireEvent(FOnScript) 565 | end 566 | end; 567 | 568 | function THTMLReader.ReadSpecialNode: Boolean; 569 | begin 570 | Result := False; 571 | 572 | Inc(FPosition); 573 | if FPosition > Length(FHTMLStr) then Exit; 574 | 575 | if IsStartDocumentType then Result := ReadDocumentType 576 | else if IsStartCharacterData then Result := ReadCharacterData 577 | else if IsStartComment then Result := ReadComment; 578 | end; 579 | 580 | function THTMLReader.ReadTagNode: Boolean; 581 | var 582 | CurrPos: Integer; 583 | begin 584 | Result := False; 585 | 586 | CurrPos := FPosition; 587 | Inc(FPosition); 588 | if FPosition > Length(FHTMLStr) then Exit; 589 | 590 | if IsSlashChar then Result := ReadEndElementNode 591 | else if IsSpecialTagChar then Result := ReadSpecialNode 592 | else if IsStartScript then Result := ReadScript 593 | else Result := ReadElementNode; 594 | 595 | if not Result then FPosition := CurrPos; 596 | end; 597 | 598 | function THTMLReader.SkipTo(const Signature: string): Boolean; 599 | begin 600 | while FPosition <= Length(FHTMLStr) do 601 | begin 602 | if Match(Signature, False) then 603 | begin 604 | Inc(FPosition, Length(Signature)); 605 | Exit(True); 606 | end; 607 | 608 | Inc(FPosition); 609 | end; 610 | 611 | Result := False; 612 | end; 613 | 614 | procedure THTMLReader.FireEvent(Event: TNotifyEvent); 615 | begin 616 | if Assigned(Event) then Event(Self); 617 | end; 618 | 619 | function THTMLReader.Read: Boolean; 620 | begin 621 | FNodeType := NONE; 622 | FPrefix := ''; 623 | FLocalName := ''; 624 | FNodeValue := ''; 625 | FPublicID := ''; 626 | FSystemID := ''; 627 | FIsEmptyElement := False; 628 | Result := False; 629 | 630 | if FPosition > Length(FHTMLStr) then Exit; 631 | Result := True; 632 | 633 | if FState in [rsBeforeValue, rsInValue, rsInQuotedValue] then 634 | begin 635 | if ReadValueNode then Exit; 636 | 637 | if FState = rsInQuotedValue then Inc(FPosition); 638 | FNodeType := ATTRIBUTE_NODE; 639 | FireEvent(FOnAttributeEnd); 640 | FState := rsBeforeAttr; 641 | end 642 | else if FState = rsBeforeAttr then 643 | begin 644 | if ReadAttrNode then Exit; 645 | 646 | ReadElementTail; 647 | FState := rsInitial; 648 | end 649 | else if IsStartTagChar then 650 | begin 651 | if ReadTagNode then Exit; 652 | 653 | Inc(FPosition); 654 | FNodeType := ENTITY_REFERENCE_NODE; 655 | SetNodeName('lt'); 656 | FireEvent(FOnEntityReference); 657 | end 658 | else if IsStartEntityChar then 659 | begin 660 | if ReadEntityNode then Exit; 661 | 662 | Inc(FPosition); 663 | FNodeType := ENTITY_REFERENCE_NODE; 664 | SetNodeName('amp'); 665 | FireEvent(FOnEntityReference); 666 | end 667 | else ReadTextNode; 668 | end; 669 | 670 | procedure THTMLReader.ReadTextNode; 671 | var 672 | Start: Integer; 673 | begin 674 | Start := FPosition; 675 | repeat Inc(FPosition) 676 | until (FPosition > Length(FHTMLStr)) or IsStartMarkupChar; 677 | FNodeType := TEXT_NODE; 678 | FNodeValue:= Copy(FHTMLStr, Start, FPosition - Start); 679 | 680 | FireEvent(FOnTextNode); 681 | end; 682 | 683 | function THTMLReader.ReadValueNode: Boolean; 684 | begin 685 | Result := False; 686 | if FState = rsBeforeValue then 687 | begin 688 | SkipWhiteSpaces; 689 | if FPosition > Length(FHTMLStr) then Exit; 690 | if not IsEqualChar then Exit; 691 | 692 | Inc(FPosition); 693 | SkipWhiteSpaces; 694 | if FPosition > Length(FHTMLStr) then Exit; 695 | 696 | if IsQuotation then 697 | begin 698 | FQuotation := Ord(FHTMLStr[FPosition]); 699 | Inc(FPosition); 700 | FState := rsInQuotedValue; 701 | end 702 | else FState := rsInValue; 703 | end; 704 | 705 | if FPosition > Length(FHTMLStr) then Exit; 706 | 707 | if IsStartEntityChar then 708 | begin 709 | Result := True; 710 | if ReadEntityNode then Exit; 711 | 712 | Inc(FPosition); 713 | FNodeType := ENTITY_REFERENCE_NODE; 714 | 715 | SetNodeName('amp'); 716 | FireEvent(FOnEntityReference); 717 | end 718 | else Result := ReadAttrTextNode; 719 | end; 720 | 721 | procedure THTMLReader.ReadElementTail; 722 | begin 723 | SkipWhiteSpaces; 724 | 725 | if (FPosition <= Length(FHTMLStr)) and (IsSlashChar) then 726 | begin 727 | FIsEmptyElement := True; 728 | Inc(FPosition) 729 | end; 730 | 731 | SkipTo(WideChar(endTagChar)); 732 | FNodeType := ELEMENT_NODE; 733 | FireEvent(FOnElementEnd); 734 | end; 735 | 736 | procedure THTMLReader.SetHTMLStr(const Value: string); 737 | begin 738 | FHTMLStr := Value; 739 | FPosition := 1; 740 | end; 741 | 742 | procedure THTMLReader.SetNodeName(Value: string); 743 | var 744 | I: Integer; 745 | begin 746 | I := Pos(':', Value); 747 | 748 | if I > 0 then 749 | begin 750 | FPrefix := Copy(Value, 1, I - 1); 751 | FLocalName := Copy(Value, I + 1, Length(Value) - I); 752 | end 753 | else 754 | begin 755 | FPrefix := ''; 756 | FLocalName := Value; 757 | end 758 | end; 759 | 760 | procedure THTMLReader.SkipWhiteSpaces; 761 | begin 762 | while (FPosition <= Length(FHTMLStr)) 763 | and (Ord(FHTMLStr[FPosition]) in whiteSpace) do Inc(FPosition); 764 | end; 765 | 766 | end. 767 | -------------------------------------------------------------------------------- /HTMLp.HTMLTags.pas: -------------------------------------------------------------------------------- 1 | unit HTMLp.HTMLTags; 2 | 3 | interface 4 | 5 | uses 6 | Classes, 7 | 8 | HTMLp.DomCore; 9 | 10 | const 11 | MAX_TAGS_COUNT = 128; 12 | MAX_FLAGS_COUNT = 32; 13 | 14 | type 15 | THTMLTagSet = set of 0..MAX_TAGS_COUNT - 1; 16 | THTMLTagFlags = set of 0..MAX_FLAGS_COUNT - 1; 17 | 18 | THTMLTag = class 19 | private 20 | FName: WideString; 21 | FNumber: Integer; 22 | FParserFlags: THTMLTagFlags; 23 | FFormatterFlags: THTMLTagFlags; 24 | public 25 | constructor Create(const AName: WideString; ANumber: Integer; AParserFlags, AFormatterFlags: THTMLTagFlags); 26 | 27 | property Name: WideString read FName; 28 | property Number: Integer read FNumber; 29 | property ParserFlags: THTMLTagFlags read FParserFlags; 30 | property FormatterFlags: THTMLTagFlags read FFormatterFlags; 31 | end; 32 | 33 | TCompareTag = function(Tag: THTMLTag): Integer of object; 34 | 35 | THTMLTagList = class 36 | private 37 | FList: TList; 38 | FUnknownTag: THTMLTag; 39 | FSearchName: WideString; 40 | FSearchNumber: Integer; 41 | function CompareName(Tag: THTMLTag): Integer; 42 | function CompareNumber(Tag: THTMLTag): Integer; 43 | function GetTag(Compare: TCompareTag): THTMLTag; 44 | public 45 | constructor Create; 46 | destructor Destroy; override; 47 | 48 | function GetTagByName(const Name: string): THTMLTag; 49 | function GetTagByNumber(Number: Integer): THTMLTag; 50 | end; 51 | 52 | TURLSchemes = class(TStringList) 53 | private 54 | FMaxLen: Integer; 55 | public 56 | function Add(const S: string): Integer; override; 57 | function IsURL(const S: string): Boolean; 58 | function GetScheme(const S: string): string; 59 | 60 | property MaxLen: Integer read FMaxLen; 61 | end; 62 | 63 | var 64 | URLSchemes: TURLSchemes; 65 | 66 | const 67 | UNKNOWN_TAG = 0; 68 | A_TAG = 1; 69 | ABBR_TAG = 2; 70 | ACRONYM_TAG = 3; 71 | ADDRESS_TAG = 4; 72 | APPLET_TAG = 5; 73 | AREA_TAG = 6; 74 | B_TAG = 7; 75 | BASE_TAG = 8; 76 | BASEFONT_TAG = 9; 77 | BDO_TAG = 10; 78 | BIG_TAG = 11; 79 | BLOCKQUOTE_TAG = 12; 80 | BODY_TAG = 13; 81 | BR_TAG = 14; 82 | BUTTON_TAG = 15; 83 | CAPTION_TAG = 16; 84 | CENTER_TAG = 17; 85 | CITE_TAG = 18; 86 | CODE_TAG = 19; 87 | COL_TAG = 20; 88 | COLGROUP_TAG = 21; 89 | DD_TAG = 22; 90 | DEL_TAG = 23; 91 | DFN_TAG = 24; 92 | DIR_TAG = 25; 93 | DIV_TAG = 26; 94 | DL_TAG = 27; 95 | DT_TAG = 28; 96 | EM_TAG = 29; 97 | FIELDSET_TAG = 30; 98 | FONT_TAG = 31; 99 | FORM_TAG = 32; 100 | FRAME_TAG = 33; 101 | FRAMESET_TAG = 34; 102 | H1_TAG = 35; 103 | H2_TAG = 36; 104 | H3_TAG = 37; 105 | H4_TAG = 38; 106 | H5_TAG = 39; 107 | H6_TAG = 40; 108 | HEAD_TAG = 41; 109 | HR_TAG = 42; 110 | HTML_TAG = 43; 111 | I_TAG = 44; 112 | IFRAME_TAG = 45; 113 | IMG_TAG = 46; 114 | INPUT_TAG = 47; 115 | INS_TAG = 48; 116 | ISINDEX_TAG = 49; 117 | KBD_TAG = 50; 118 | LABEL_TAG = 51; 119 | LEGEND_TAG = 52; 120 | LI_TAG = 53; 121 | LINK_TAG = 54; 122 | MAP_TAG = 55; 123 | MENU_TAG = 56; 124 | META_TAG = 57; 125 | NOFRAMES_TAG = 58; 126 | NOSCRIPT_TAG = 59; 127 | OBJECT_TAG = 60; 128 | OL_TAG = 61; 129 | OPTGROUP_TAG = 62; 130 | OPTION_TAG = 63; 131 | P_TAG = 64; 132 | PARAM_TAG = 65; 133 | PRE_TAG = 66; 134 | Q_TAG = 67; 135 | S_TAG = 68; 136 | SAMP_TAG = 69; 137 | SCRIPT_TAG = 70; 138 | SELECT_TAG = 71; 139 | SMALL_TAG = 72; 140 | SPAN_TAG = 73; 141 | STRIKE_TAG = 74; 142 | STRONG_TAG = 75; 143 | STYLE_TAG = 76; 144 | SUB_TAG = 77; 145 | SUP_TAG = 78; 146 | TABLE_TAG = 79; 147 | TBODY_TAG = 80; 148 | TD_TAG = 81; 149 | TEXTAREA_TAG = 82; 150 | TFOOT_TAG = 83; 151 | TH_TAG = 84; 152 | THEAD_TAG = 85; 153 | TITLE_TAG = 86; 154 | TR_TAG = 87; 155 | TT_TAG = 88; 156 | U_TAG = 89; 157 | UL_TAG = 90; 158 | VAR_TAG = 91; 159 | {} 160 | SOURCE_TAG = 92; 161 | TRACK_TAG = 93; 162 | WBR_TAG = 94; 163 | EMBED_TAG = 95; 164 | ASIDE_TAG = 96; 165 | ARTICLE_TAG = 97; 166 | BDI_TAG = 98; 167 | FIGURE_TAG = 99; 168 | FOOTER_TAG = 100; 169 | FIGCAPTION_TAG = 101; 170 | HEADER_TAG = 102; 171 | MAIN_TAG = 103; 172 | MARK_TAG = 104; 173 | NAV_TAG = 105; 174 | TIME_TAG = 106; 175 | SECTION_TAG = 107; 176 | PLAINTEXT_TAG = 108; 177 | XMP_TAG = 109; 178 | VIDEO_TAG = 110; 179 | AUDIO_TAG = 111; 180 | 181 | {https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements} 182 | BlockTags = [ADDRESS_TAG, TIME_TAG, BLOCKQUOTE_TAG, CENTER_TAG, DIV_TAG, DL_TAG, FIELDSET_TAG, {FORM_TAG,} A_TAG, H1_TAG, 183 | H2_TAG, H3_TAG, H4_TAG, H5_TAG, H6_TAG, HR_TAG, NOSCRIPT_TAG, OL_TAG, PRE_TAG, TABLE_TAG, UL_TAG, 184 | ARTICLE_TAG, SECTION_TAG, ASIDE_TAG, MAIN_TAG, HEADER_TAG, FOOTER_TAG]; 185 | {} 186 | BlockParentTags = [ADDRESS_TAG, BLOCKQUOTE_TAG, CENTER_TAG, DIV_TAG, DL_TAG, FIELDSET_TAG, A_TAG, 187 | H1_TAG, H2_TAG, H3_TAG, H4_TAG, H5_TAG, H6_TAG, HR_TAG, LI_TAG, NOSCRIPT_TAG, OL_TAG, PRE_TAG, TD_TAG, TH_TAG, UL_TAG, 188 | ARTICLE_TAG, SECTION_TAG, ASIDE_TAG, MAIN_TAG, HEADER_TAG, FOOTER_TAG]; 189 | {https://developer.mozilla.org/en-US/docs/Learn/HTML/Introduction_to_HTML/The_head_metadata_in_HTML} 190 | HeadTags = [BASE_TAG, LINK_TAG, META_TAG, SCRIPT_TAG, STYLE_TAG, TITLE_TAG]; 191 | {https://developer.mozilla.org/en-US/docs/Glossary/Empty_element} 192 | EmptyTags = [AREA_TAG, BASE_TAG, BASEFONT_TAG, BR_TAG, COL_TAG, FRAME_TAG, EMBED_TAG, HR_TAG, IMG_TAG, INPUT_TAG, ISINDEX_TAG, 193 | LINK_TAG, META_TAG, PARAM_TAG, SOURCE_TAG, TRACK_TAG, WBR_TAG]; 194 | {} 195 | NeedFindParentTags = [COL_TAG, COLGROUP_TAG, DD_TAG, DT_TAG, LI_TAG, OPTION_TAG, P_TAG, TABLE_TAG, TBODY_TAG, TD_TAG, 196 | TFOOT_TAG, TH_TAG, THEAD_TAG, TR_TAG]; 197 | {Other} 198 | PreserveWhiteSpaceTags = [PRE_TAG]; 199 | ListItemParentTags = [DIR_TAG, MENU_TAG, OL_TAG, UL_TAG]; 200 | DefItemParentTags = [DL_TAG]; 201 | TableSectionParentTags = [TABLE_TAG]; 202 | ColParentTags = [COLGROUP_TAG]; 203 | RowParentTags = [TABLE_TAG, TBODY_TAG, TFOOT_TAG, THEAD_TAG]; 204 | CellParentTags = [TR_TAG]; 205 | OptionParentTags = [OPTGROUP_TAG, SELECT_TAG]; 206 | PlainTextTags = [CODE_TAG, SCRIPT_TAG, PLAINTEXT_TAG, XMP_TAG]; 207 | 208 | implementation 209 | 210 | uses 211 | SysUtils; 212 | 213 | constructor THTMLTag.Create(const AName: WideString; ANumber: Integer; AParserFlags, AFormatterFlags: THTMLTagFlags); 214 | begin 215 | inherited Create; 216 | 217 | FName := AName; 218 | FNumber := ANumber 219 | end; 220 | 221 | constructor THTMLTagList.Create; 222 | begin 223 | inherited Create; 224 | 225 | FList := TList.Create; 226 | FList.Capacity := MAX_TAGS_COUNT; 227 | FList.Add(THTMLTag.Create('a', A_TAG, [], [])); 228 | FList.Add(THTMLTag.Create('abbr', ABBR_TAG, [], [])); 229 | FList.Add(THTMLTag.Create('acronym', ACRONYM_TAG, [], [])); 230 | FList.Add(THTMLTag.Create('address', ADDRESS_TAG, [], [])); 231 | FList.Add(THTMLTag.Create('applet', APPLET_TAG, [], [])); 232 | FList.Add(THTMLTag.Create('area', AREA_TAG, [], [])); 233 | FList.Add(THTMLTag.Create('aside', ASIDE_TAG, [], [])); 234 | FList.Add(THTMLTag.Create('article', ARTICLE_TAG, [], [])); 235 | FList.Add(THTMLTag.Create('audio', AUDIO_TAG, [], [])); 236 | FList.Add(THTMLTag.Create('b', B_TAG, [], [])); 237 | FList.Add(THTMLTag.Create('base', BASE_TAG, [], [])); 238 | FList.Add(THTMLTag.Create('basefont', BASEFONT_TAG, [], [])); 239 | FList.Add(THTMLTag.Create('bdo', BDO_TAG, [], [])); 240 | FList.Add(THTMLTag.Create('big', BIG_TAG, [], [])); 241 | FList.Add(THTMLTag.Create('blockquote', BLOCKQUOTE_TAG, [], [])); 242 | FList.Add(THTMLTag.Create('body', BODY_TAG, [], [])); 243 | FList.Add(THTMLTag.Create('br', BR_TAG, [], [])); 244 | FList.Add(THTMLTag.Create('button', BUTTON_TAG, [], [])); 245 | FList.Add(THTMLTag.Create('bdi', BDI_TAG, [], [])); 246 | FList.Add(THTMLTag.Create('caption', CAPTION_TAG, [], [])); 247 | FList.Add(THTMLTag.Create('center', CENTER_TAG, [], [])); 248 | FList.Add(THTMLTag.Create('cite', CITE_TAG, [], [])); 249 | FList.Add(THTMLTag.Create('code', CODE_TAG, [], [])); 250 | FList.Add(THTMLTag.Create('col', COL_TAG, [], [])); 251 | FList.Add(THTMLTag.Create('colgroup', COLGROUP_TAG, [], [])); 252 | FList.Add(THTMLTag.Create('dd', DD_TAG, [], [])); 253 | FList.Add(THTMLTag.Create('del', DEL_TAG, [], [])); 254 | FList.Add(THTMLTag.Create('dfn', DFN_TAG, [], [])); 255 | FList.Add(THTMLTag.Create('dir', DIR_TAG, [], [])); 256 | FList.Add(THTMLTag.Create('div', DIV_TAG, [], [])); 257 | FList.Add(THTMLTag.Create('dl', DL_TAG, [], [])); 258 | FList.Add(THTMLTag.Create('dt', DT_TAG, [], [])); 259 | FList.Add(THTMLTag.Create('em', EM_TAG, [], [])); 260 | FList.Add(THTMLTag.Create('embed', EMBED_TAG, [], [])); 261 | FList.Add(THTMLTag.Create('fieldset', FIELDSET_TAG, [], [])); 262 | FList.Add(THTMLTag.Create('font', FONT_TAG, [], [])); 263 | FList.Add(THTMLTag.Create('form', FORM_TAG, [], [])); 264 | FList.Add(THTMLTag.Create('frame', FRAME_TAG, [], [])); 265 | FList.Add(THTMLTag.Create('frameset', FRAMESET_TAG, [], [])); 266 | FList.Add(THTMLTag.Create('figure', FIGURE_TAG, [], [])); 267 | FList.Add(THTMLTag.Create('footer', FOOTER_TAG, [], [])); 268 | FList.Add(THTMLTag.Create('figcaption', FIGCAPTION_TAG, [], [])); 269 | FList.Add(THTMLTag.Create('h1', H1_TAG, [], [])); 270 | FList.Add(THTMLTag.Create('h2', H2_TAG, [], [])); 271 | FList.Add(THTMLTag.Create('h3', H3_TAG, [], [])); 272 | FList.Add(THTMLTag.Create('h4', H4_TAG, [], [])); 273 | FList.Add(THTMLTag.Create('h5', H5_TAG, [], [])); 274 | FList.Add(THTMLTag.Create('h6', H6_TAG, [], [])); 275 | FList.Add(THTMLTag.Create('head', HEAD_TAG, [], [])); 276 | FList.Add(THTMLTag.Create('header', HEADER_TAG, [], [])); 277 | FList.Add(THTMLTag.Create('hr', HR_TAG, [], [])); 278 | FList.Add(THTMLTag.Create('html', HTML_TAG, [], [])); 279 | FList.Add(THTMLTag.Create('i', I_TAG, [], [])); 280 | FList.Add(THTMLTag.Create('iframe', IFRAME_TAG, [], [])); 281 | FList.Add(THTMLTag.Create('img', IMG_TAG, [], [])); 282 | FList.Add(THTMLTag.Create('input', INPUT_TAG, [], [])); 283 | FList.Add(THTMLTag.Create('ins', INS_TAG, [], [])); 284 | FList.Add(THTMLTag.Create('isindex', ISINDEX_TAG, [], [])); 285 | FList.Add(THTMLTag.Create('kbd', KBD_TAG, [], [])); 286 | FList.Add(THTMLTag.Create('label', LABEL_TAG, [], [])); 287 | FList.Add(THTMLTag.Create('legend', LEGEND_TAG, [], [])); 288 | FList.Add(THTMLTag.Create('li', LI_TAG, [], [])); 289 | FList.Add(THTMLTag.Create('link', LINK_TAG, [], [])); 290 | FList.Add(THTMLTag.Create('map', MAP_TAG, [], [])); 291 | FList.Add(THTMLTag.Create('menu', MENU_TAG, [], [])); 292 | FList.Add(THTMLTag.Create('meta', META_TAG, [], [])); 293 | FList.Add(THTMLTag.Create('main', MAIN_TAG, [], [])); 294 | FList.Add(THTMLTag.Create('mark', MARK_TAG, [], [])); 295 | FList.Add(THTMLTag.Create('noframes', NOFRAMES_TAG, [], [])); 296 | FList.Add(THTMLTag.Create('noscript', NOSCRIPT_TAG, [], [])); 297 | FList.Add(THTMLTag.Create('nav', NAV_TAG, [], [])); 298 | FList.Add(THTMLTag.Create('object', OBJECT_TAG, [], [])); 299 | FList.Add(THTMLTag.Create('ol', OL_TAG, [], [])); 300 | FList.Add(THTMLTag.Create('optgroup', OPTGROUP_TAG, [], [])); 301 | FList.Add(THTMLTag.Create('option', OPTION_TAG, [], [])); 302 | FList.Add(THTMLTag.Create('p', P_TAG, [], [])); 303 | FList.Add(THTMLTag.Create('param', PARAM_TAG, [], [])); 304 | FList.Add(THTMLTag.Create('pre', PRE_TAG, [], [])); 305 | FList.Add(THTMLTag.Create('plaintext', PLAINTEXT_TAG, [], [])); 306 | FList.Add(THTMLTag.Create('q', Q_TAG, [], [])); 307 | FList.Add(THTMLTag.Create('s', S_TAG, [], [])); 308 | FList.Add(THTMLTag.Create('samp', SAMP_TAG, [], [])); 309 | FList.Add(THTMLTag.Create('script', SCRIPT_TAG, [], [])); 310 | FList.Add(THTMLTag.Create('select', SELECT_TAG, [], [])); 311 | FList.Add(THTMLTag.Create('small', SMALL_TAG, [], [])); 312 | FList.Add(THTMLTag.Create('span', SPAN_TAG, [], [])); 313 | FList.Add(THTMLTag.Create('strike', STRIKE_TAG, [], [])); 314 | FList.Add(THTMLTag.Create('strong', STRONG_TAG, [], [])); 315 | FList.Add(THTMLTag.Create('style', STYLE_TAG, [], [])); 316 | FList.Add(THTMLTag.Create('sub', SUB_TAG, [], [])); 317 | FList.Add(THTMLTag.Create('sup', SUP_TAG, [], [])); 318 | FList.Add(THTMLTag.Create('source', SOURCE_TAG, [], [])); 319 | FList.Add(THTMLTag.Create('section', SECTION_TAG, [], [])); 320 | FList.Add(THTMLTag.Create('table', TABLE_TAG, [], [])); 321 | FList.Add(THTMLTag.Create('tbody', TBODY_TAG, [], [])); 322 | FList.Add(THTMLTag.Create('td', TD_TAG, [], [])); 323 | FList.Add(THTMLTag.Create('textarea', TEXTAREA_TAG, [], [])); 324 | FList.Add(THTMLTag.Create('tfoot', TFOOT_TAG, [], [])); 325 | FList.Add(THTMLTag.Create('th', TH_TAG, [], [])); 326 | FList.Add(THTMLTag.Create('thead', THEAD_TAG, [], [])); 327 | FList.Add(THTMLTag.Create('title', TITLE_TAG, [], [])); 328 | FList.Add(THTMLTag.Create('tr', TR_TAG, [], [])); 329 | FList.Add(THTMLTag.Create('tt', TT_TAG, [], [])); 330 | FList.Add(THTMLTag.Create('track', TRACK_TAG, [], [])); 331 | FList.Add(THTMLTag.Create('time', TIME_TAG, [], [])); 332 | FList.Add(THTMLTag.Create('u', U_TAG, [], [])); 333 | FList.Add(THTMLTag.Create('ul', UL_TAG, [], [])); 334 | FList.Add(THTMLTag.Create('var', VAR_TAG, [], [])); 335 | FList.Add(THTMLTag.Create('video', VIDEO_TAG, [], [])); 336 | FList.Add(THTMLTag.Create('wbr', WBR_TAG, [], [])); 337 | FList.Add(THTMLTag.Create('xmp', XMP_TAG, [], [])); 338 | 339 | FUnknownTag := THTMLTag.Create('', UNKNOWN_TAG, [], []); 340 | end; 341 | 342 | destructor THTMLTagList.Destroy; 343 | var 344 | I: Integer; 345 | begin 346 | for I := FList.Count - 1 downto 0 do THTMLTag(FList[I]).Free; 347 | FList.Free; 348 | FUnknownTag.Free; 349 | 350 | inherited Destroy; 351 | end; 352 | 353 | {function THTMLTagList.GetTag(Compare: TCompareTag): THTMLTag; 354 | var 355 | I, Low, High, Rel: Integer; 356 | begin 357 | Result := nil; 358 | 359 | for i := FList.Count - 1 downto 0 do 360 | begin 361 | if Compare(FList[i]) <> 0 then 362 | begin 363 | Result := FList[i]; 364 | Exit; 365 | end; 366 | end; 367 | end;} 368 | 369 | function THTMLTagList.GetTag(Compare: TCompareTag): THTMLTag; 370 | var 371 | I, Low, High, Rel: Integer; 372 | begin 373 | Low := -1; 374 | High := FList.Count - 1; 375 | 376 | while High - Low > 1 do 377 | begin 378 | I := (High + Low) div 2; 379 | Result := FList[I]; 380 | 381 | Rel := Compare(Result); 382 | if Rel < 0 then High := I 383 | else if Rel > 0 then Low := I 384 | else Exit; 385 | end; 386 | 387 | if High >= 0 then 388 | begin 389 | Result := FList[High]; 390 | if Compare(Result) = 0 then Exit; 391 | end; 392 | 393 | Result := nil; 394 | end; 395 | 396 | function THTMLTagList.CompareName(Tag: THTMLTag): Integer; 397 | begin 398 | Result := AnsiCompareStr(FSearchName, Tag.Name) 399 | end; 400 | 401 | function THTMLTagList.CompareNumber(Tag: THTMLTag): Integer; 402 | begin 403 | Result := (FSearchNumber - Tag.Number); 404 | end; 405 | 406 | function THTMLTagList.GetTagByName(const Name: string): THTMLTag; 407 | begin 408 | FSearchName := Name; 409 | Result := GetTag(CompareName); 410 | if Result = nil then Result := FUnknownTag; 411 | end; 412 | 413 | function THTMLTagList.GetTagByNumber(Number: Integer): THTMLTag; 414 | begin 415 | FSearchNumber := Number; 416 | Result := GetTag(CompareNumber); 417 | end; 418 | 419 | function TURLSchemes.Add(const S: string): Integer; 420 | begin 421 | if Length(S) > FMaxLen then FMaxLen := Length(S); 422 | Result := inherited Add(S); 423 | end; 424 | 425 | function TURLSchemes.IsURL(const S: string): Boolean; 426 | begin 427 | Result := IndexOf(LowerCase(S)) >= 0; 428 | end; 429 | 430 | function TURLSchemes.GetScheme(const S: string): string; 431 | const 432 | SchemeChars = [Ord('A')..Ord('Z'), Ord('a')..Ord('z')]; 433 | var 434 | I: Integer; 435 | begin 436 | Result := ''; 437 | for I := 1 to MaxLen + 1 do 438 | begin 439 | if I > Length(S) then Exit; 440 | 441 | if S[I] = ':' then 442 | begin 443 | if IsURL(Copy(S, 1, I - 1)) then Result := Copy(S, 1, I - 1); 444 | Exit 445 | end 446 | end 447 | end; 448 | 449 | initialization 450 | //HTMLTagList := THTMLTagList.Create; 451 | URLSchemes := TURLSchemes.Create; 452 | URLSchemes.Add('http'); 453 | URLSchemes.Add('https'); 454 | URLSchemes.Add('ftp'); 455 | URLSchemes.Add('mailto'); 456 | URLSchemes.Add('news'); 457 | URLSchemes.Add('nntp'); 458 | URLSchemes.Add('gopher'); 459 | 460 | finalization 461 | //HTMLTagList.Free; 462 | URLSchemes.Free; 463 | 464 | end. 465 | -------------------------------------------------------------------------------- /HTMLp.Helper.pas: -------------------------------------------------------------------------------- 1 | unit HTMLp.Helper; 2 | 3 | interface 4 | 5 | uses System.Classes, System.SysUtils, System.StrUtils, System.Math, System.Variants, System.Types, 6 | System.Generics.Collections, 7 | 8 | HTMLp.DomCore, HTMLp.Formatter, 9 | HTMLp.HtmlParser; 10 | 11 | type 12 | TNode = HTMLp.DomCore.TNode; 13 | TNodeList = HTMLp.DomCore.TNodeList; 14 | TElement = HTMLp.DomCore.TElement; 15 | 16 | IHTMLParser = interface 17 | ['{A2496DE9-17B0-40FC-A804-D19363A14154}'] 18 | function Find(const selector: string): IHTMLParser; 19 | function Map(callback: TProc): IHTMLParser; 20 | {} 21 | function IsValid: Boolean; 22 | function GetRootNode: TElement; 23 | function GetFirstNode: TElement; 24 | function GetLastNode: TElement; 25 | function GetNodeCount: Integer; 26 | function GetNodeList: TNodeList; 27 | function Clear: IHTMLParser; 28 | {} 29 | property RootNode: TElement read GetRootNode; 30 | property NodeList: TNodeList read GetNodeList; 31 | property NodeCount: Integer read GetNodeCount; 32 | end; 33 | 34 | THTMLParserHelper = class(TInterfacedObject, IHTMLParser) 35 | private 36 | FDocument: TDocument; 37 | FCurrentNodeList: TNodeList; 38 | FCurrentElement: TElement; 39 | FNodeList: TList; 40 | public 41 | constructor Create(const HTML: string); overload; 42 | destructor Destroy; override; 43 | 44 | function Find(const selector: string): IHTMLParser; 45 | function SelectNode(const nodeIndex: Integer): IHTMLParser; 46 | function Map(callback: TProc): IHTMLParser; 47 | 48 | function IsValid: Boolean; 49 | function GetRootNode: TElement; 50 | function GetFirstNode: TElement; 51 | function GetLastNode: TElement; 52 | function GetNodeCount: Integer; 53 | function GetNodeList: TNodeList; 54 | function Clear: IHTMLParser; 55 | 56 | property RootNode: TElement read GetRootNode; 57 | property NodeList: TNodeList read GetNodeList; 58 | property NodeCount: Integer read GetNodeCount; 59 | end; 60 | 61 | function ParseHTML(const HTML: string): IHTMLParser; 62 | 63 | implementation 64 | 65 | function ParseHTML(const HTML: string): IHTMLParser; 66 | begin 67 | Result := THTMLParserHelper.Create(HTML); 68 | end; 69 | 70 | constructor THTMLParserHelper.Create(const HTML: string); 71 | begin 72 | inherited Create; 73 | 74 | {} 75 | with THTMLParser.Create do 76 | begin 77 | try FDocument := parseString(HTML); except end; 78 | FCurrentElement := GetRootNode; 79 | 80 | Free; 81 | end; 82 | 83 | {} 84 | FNodeList := TList.Create; 85 | end; 86 | 87 | destructor THTMLParserHelper.Destroy; 88 | begin 89 | Clear; 90 | FCurrentElement := nil; 91 | 92 | FreeAndNil(FNodeList); 93 | FreeAndNil(FDocument); 94 | 95 | inherited; 96 | end; 97 | 98 | function THTMLParserHelper.Find(const selector: string): IHTMLParser; 99 | begin 100 | Result := Self; 101 | FCurrentNodeList := nil; 102 | 103 | try 104 | if Pos('/', selector) = 1 then FCurrentNodeList := FCurrentElement.GetElementsByXPath(selector) 105 | else FCurrentNodeList := FCurrentElement.GetElementsByCSSSelector(selector); 106 | except end; 107 | 108 | {if not (Assigned(FCurrentNodeList)) then 109 | begin 110 | FCurrentNodeList := FDocument.CreateElement('div'); 111 | end; } 112 | 113 | if Assigned(FCurrentNodeList) then FNodeList.Add(FCurrentNodeList); 114 | end; 115 | 116 | function THTMLParserHelper.SelectNode(const nodeIndex: Integer): IHTMLParser; 117 | begin 118 | Result := Self; 119 | if nodeIndex >= FCurrentNodeList.Count then Exit; 120 | 121 | FCurrentElement := (FCurrentNodeList[nodeIndex] as TElement); 122 | end; 123 | 124 | function THTMLParserHelper.Map(callback: TProc): IHTMLParser; 125 | var 126 | i: Integer; 127 | begin 128 | Result := Self; 129 | if not (Assigned(FCurrentNodeList)) then Exit; 130 | 131 | for i := 0 to FCurrentNodeList.Count - 1 do callback(i, (FCurrentNodeList[i] as TElement)); 132 | end; 133 | 134 | 135 | {} 136 | 137 | function THTMLParserHelper.IsValid: Boolean; 138 | begin 139 | Result := Assigned(FCurrentElement); 140 | end; 141 | 142 | function THTMLParserHelper.GetRootNode: TElement; 143 | begin 144 | Result := FDocument.DocumentElement; 145 | end; 146 | 147 | function THTMLParserHelper.GetFirstNode: TElement; 148 | var 149 | node: TNode; 150 | begin 151 | Result := nil; 152 | 153 | node := FCurrentNodeList.GetFirst; 154 | if Assigned(node) then Result := (node as TElement); 155 | end; 156 | 157 | function THTMLParserHelper.GetLastNode: TElement; 158 | var 159 | node: TNode; 160 | begin 161 | Result := nil; 162 | 163 | node := FCurrentNodeList.GetLast; 164 | if Assigned(node) then Result := (node as TElement); 165 | end; 166 | 167 | function THTMLParserHelper.GetNodeCount: Integer; 168 | begin 169 | Result := FCurrentNodeList.Count; 170 | end; 171 | 172 | function THTMLParserHelper.GetNodeList: TNodeList; 173 | begin 174 | Result := FCurrentNodeList; 175 | end; 176 | 177 | function THTMLParserHelper.Clear: IHTMLParser; 178 | var 179 | i: Integer; 180 | begin 181 | FCurrentNodeList := nil; 182 | 183 | for i := 0 to FNodeList.Count - 1 do FNodeList[i].Destroy; 184 | FNodeList.Clear; 185 | 186 | Result := Self; 187 | end; 188 | 189 | end. 190 | 191 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 2.1, February 1999 3 | 4 | Copyright (C) 1991, 1999 Free Software Foundation, Inc. 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | [This is the first released version of the Lesser GPL. It also counts 10 | as the successor of the GNU Library Public License, version 2, hence 11 | the version number 2.1.] 12 | 13 | Preamble 14 | 15 | The licenses for most software are designed to take away your 16 | freedom to share and change it. By contrast, the GNU General Public 17 | Licenses are intended to guarantee your freedom to share and change 18 | free software--to make sure the software is free for all its users. 19 | 20 | This license, the Lesser General Public License, applies to some 21 | specially designated software packages--typically libraries--of the 22 | Free Software Foundation and other authors who decide to use it. You 23 | can use it too, but we suggest you first think carefully about whether 24 | this license or the ordinary General Public License is the better 25 | strategy to use in any particular case, based on the explanations below. 26 | 27 | When we speak of free software, we are referring to freedom of use, 28 | not price. Our General Public Licenses are designed to make sure that 29 | you have the freedom to distribute copies of free software (and charge 30 | for this service if you wish); that you receive source code or can get 31 | it if you want it; that you can change the software and use pieces of 32 | it in new free programs; and that you are informed that you can do 33 | these things. 34 | 35 | To protect your rights, we need to make restrictions that forbid 36 | distributors to deny you these rights or to ask you to surrender these 37 | rights. These restrictions translate to certain responsibilities for 38 | you if you distribute copies of the library or if you modify it. 39 | 40 | For example, if you distribute copies of the library, whether gratis 41 | or for a fee, you must give the recipients all the rights that we gave 42 | you. You must make sure that they, too, receive or can get the source 43 | code. If you link other code with the library, you must provide 44 | complete object files to the recipients, so that they can relink them 45 | with the library after making changes to the library and recompiling 46 | it. And you must show them these terms so they know their rights. 47 | 48 | We protect your rights with a two-step method: (1) we copyright the 49 | library, and (2) we offer you this license, which gives you legal 50 | permission to copy, distribute and/or modify the library. 51 | 52 | To protect each distributor, we want to make it very clear that 53 | there is no warranty for the free library. Also, if the library is 54 | modified by someone else and passed on, the recipients should know 55 | that what they have is not the original version, so that the original 56 | author's reputation will not be affected by problems that might be 57 | introduced by others. 58 | 59 | Finally, software patents pose a constant threat to the existence of 60 | any free program. We wish to make sure that a company cannot 61 | effectively restrict the users of a free program by obtaining a 62 | restrictive license from a patent holder. Therefore, we insist that 63 | any patent license obtained for a version of the library must be 64 | consistent with the full freedom of use specified in this license. 65 | 66 | Most GNU software, including some libraries, is covered by the 67 | ordinary GNU General Public License. This license, the GNU Lesser 68 | General Public License, applies to certain designated libraries, and 69 | is quite different from the ordinary General Public License. We use 70 | this license for certain libraries in order to permit linking those 71 | libraries into non-free programs. 72 | 73 | When a program is linked with a library, whether statically or using 74 | a shared library, the combination of the two is legally speaking a 75 | combined work, a derivative of the original library. The ordinary 76 | General Public License therefore permits such linking only if the 77 | entire combination fits its criteria of freedom. The Lesser General 78 | Public License permits more lax criteria for linking other code with 79 | the library. 80 | 81 | We call this license the "Lesser" General Public License because it 82 | does Less to protect the user's freedom than the ordinary General 83 | Public License. It also provides other free software developers Less 84 | of an advantage over competing non-free programs. These disadvantages 85 | are the reason we use the ordinary General Public License for many 86 | libraries. However, the Lesser license provides advantages in certain 87 | special circumstances. 88 | 89 | For example, on rare occasions, there may be a special need to 90 | encourage the widest possible use of a certain library, so that it becomes 91 | a de-facto standard. To achieve this, non-free programs must be 92 | allowed to use the library. A more frequent case is that a free 93 | library does the same job as widely used non-free libraries. In this 94 | case, there is little to gain by limiting the free library to free 95 | software only, so we use the Lesser General Public License. 96 | 97 | In other cases, permission to use a particular library in non-free 98 | programs enables a greater number of people to use a large body of 99 | free software. For example, permission to use the GNU C Library in 100 | non-free programs enables many more people to use the whole GNU 101 | operating system, as well as its variant, the GNU/Linux operating 102 | system. 103 | 104 | Although the Lesser General Public License is Less protective of the 105 | users' freedom, it does ensure that the user of a program that is 106 | linked with the Library has the freedom and the wherewithal to run 107 | that program using a modified version of the Library. 108 | 109 | The precise terms and conditions for copying, distribution and 110 | modification follow. Pay close attention to the difference between a 111 | "work based on the library" and a "work that uses the library". The 112 | former contains code derived from the library, whereas the latter must 113 | be combined with the library in order to run. 114 | 115 | GNU LESSER GENERAL PUBLIC LICENSE 116 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 117 | 118 | 0. This License Agreement applies to any software library or other 119 | program which contains a notice placed by the copyright holder or 120 | other authorized party saying it may be distributed under the terms of 121 | this Lesser General Public License (also called "this License"). 122 | Each licensee is addressed as "you". 123 | 124 | A "library" means a collection of software functions and/or data 125 | prepared so as to be conveniently linked with application programs 126 | (which use some of those functions and data) to form executables. 127 | 128 | The "Library", below, refers to any such software library or work 129 | which has been distributed under these terms. A "work based on the 130 | Library" means either the Library or any derivative work under 131 | copyright law: that is to say, a work containing the Library or a 132 | portion of it, either verbatim or with modifications and/or translated 133 | straightforwardly into another language. (Hereinafter, translation is 134 | included without limitation in the term "modification".) 135 | 136 | "Source code" for a work means the preferred form of the work for 137 | making modifications to it. For a library, complete source code means 138 | all the source code for all modules it contains, plus any associated 139 | interface definition files, plus the scripts used to control compilation 140 | and installation of the library. 141 | 142 | Activities other than copying, distribution and modification are not 143 | covered by this License; they are outside its scope. The act of 144 | running a program using the Library is not restricted, and output from 145 | such a program is covered only if its contents constitute a work based 146 | on the Library (independent of the use of the Library in a tool for 147 | writing it). Whether that is true depends on what the Library does 148 | and what the program that uses the Library does. 149 | 150 | 1. You may copy and distribute verbatim copies of the Library's 151 | complete source code as you receive it, in any medium, provided that 152 | you conspicuously and appropriately publish on each copy an 153 | appropriate copyright notice and disclaimer of warranty; keep intact 154 | all the notices that refer to this License and to the absence of any 155 | warranty; and distribute a copy of this License along with the 156 | Library. 157 | 158 | You may charge a fee for the physical act of transferring a copy, 159 | and you may at your option offer warranty protection in exchange for a 160 | fee. 161 | 162 | 2. You may modify your copy or copies of the Library or any portion 163 | of it, thus forming a work based on the Library, and copy and 164 | distribute such modifications or work under the terms of Section 1 165 | above, provided that you also meet all of these conditions: 166 | 167 | a) The modified work must itself be a software library. 168 | 169 | b) You must cause the files modified to carry prominent notices 170 | stating that you changed the files and the date of any change. 171 | 172 | c) You must cause the whole of the work to be licensed at no 173 | charge to all third parties under the terms of this License. 174 | 175 | d) If a facility in the modified Library refers to a function or a 176 | table of data to be supplied by an application program that uses 177 | the facility, other than as an argument passed when the facility 178 | is invoked, then you must make a good faith effort to ensure that, 179 | in the event an application does not supply such function or 180 | table, the facility still operates, and performs whatever part of 181 | its purpose remains meaningful. 182 | 183 | (For example, a function in a library to compute square roots has 184 | a purpose that is entirely well-defined independent of the 185 | application. Therefore, Subsection 2d requires that any 186 | application-supplied function or table used by this function must 187 | be optional: if the application does not supply it, the square 188 | root function must still compute square roots.) 189 | 190 | These requirements apply to the modified work as a whole. If 191 | identifiable sections of that work are not derived from the Library, 192 | and can be reasonably considered independent and separate works in 193 | themselves, then this License, and its terms, do not apply to those 194 | sections when you distribute them as separate works. But when you 195 | distribute the same sections as part of a whole which is a work based 196 | on the Library, the distribution of the whole must be on the terms of 197 | this License, whose permissions for other licensees extend to the 198 | entire whole, and thus to each and every part regardless of who wrote 199 | it. 200 | 201 | Thus, it is not the intent of this section to claim rights or contest 202 | your rights to work written entirely by you; rather, the intent is to 203 | exercise the right to control the distribution of derivative or 204 | collective works based on the Library. 205 | 206 | In addition, mere aggregation of another work not based on the Library 207 | with the Library (or with a work based on the Library) on a volume of 208 | a storage or distribution medium does not bring the other work under 209 | the scope of this License. 210 | 211 | 3. You may opt to apply the terms of the ordinary GNU General Public 212 | License instead of this License to a given copy of the Library. To do 213 | this, you must alter all the notices that refer to this License, so 214 | that they refer to the ordinary GNU General Public License, version 2, 215 | instead of to this License. (If a newer version than version 2 of the 216 | ordinary GNU General Public License has appeared, then you can specify 217 | that version instead if you wish.) Do not make any other change in 218 | these notices. 219 | 220 | Once this change is made in a given copy, it is irreversible for 221 | that copy, so the ordinary GNU General Public License applies to all 222 | subsequent copies and derivative works made from that copy. 223 | 224 | This option is useful when you wish to copy part of the code of 225 | the Library into a program that is not a library. 226 | 227 | 4. You may copy and distribute the Library (or a portion or 228 | derivative of it, under Section 2) in object code or executable form 229 | under the terms of Sections 1 and 2 above provided that you accompany 230 | it with the complete corresponding machine-readable source code, which 231 | must be distributed under the terms of Sections 1 and 2 above on a 232 | medium customarily used for software interchange. 233 | 234 | If distribution of object code is made by offering access to copy 235 | from a designated place, then offering equivalent access to copy the 236 | source code from the same place satisfies the requirement to 237 | distribute the source code, even though third parties are not 238 | compelled to copy the source along with the object code. 239 | 240 | 5. A program that contains no derivative of any portion of the 241 | Library, but is designed to work with the Library by being compiled or 242 | linked with it, is called a "work that uses the Library". Such a 243 | work, in isolation, is not a derivative work of the Library, and 244 | therefore falls outside the scope of this License. 245 | 246 | However, linking a "work that uses the Library" with the Library 247 | creates an executable that is a derivative of the Library (because it 248 | contains portions of the Library), rather than a "work that uses the 249 | library". The executable is therefore covered by this License. 250 | Section 6 states terms for distribution of such executables. 251 | 252 | When a "work that uses the Library" uses material from a header file 253 | that is part of the Library, the object code for the work may be a 254 | derivative work of the Library even though the source code is not. 255 | Whether this is true is especially significant if the work can be 256 | linked without the Library, or if the work is itself a library. The 257 | threshold for this to be true is not precisely defined by law. 258 | 259 | If such an object file uses only numerical parameters, data 260 | structure layouts and accessors, and small macros and small inline 261 | functions (ten lines or less in length), then the use of the object 262 | file is unrestricted, regardless of whether it is legally a derivative 263 | work. (Executables containing this object code plus portions of the 264 | Library will still fall under Section 6.) 265 | 266 | Otherwise, if the work is a derivative of the Library, you may 267 | distribute the object code for the work under the terms of Section 6. 268 | Any executables containing that work also fall under Section 6, 269 | whether or not they are linked directly with the Library itself. 270 | 271 | 6. As an exception to the Sections above, you may also combine or 272 | link a "work that uses the Library" with the Library to produce a 273 | work containing portions of the Library, and distribute that work 274 | under terms of your choice, provided that the terms permit 275 | modification of the work for the customer's own use and reverse 276 | engineering for debugging such modifications. 277 | 278 | You must give prominent notice with each copy of the work that the 279 | Library is used in it and that the Library and its use are covered by 280 | this License. You must supply a copy of this License. If the work 281 | during execution displays copyright notices, you must include the 282 | copyright notice for the Library among them, as well as a reference 283 | directing the user to the copy of this License. Also, you must do one 284 | of these things: 285 | 286 | a) Accompany the work with the complete corresponding 287 | machine-readable source code for the Library including whatever 288 | changes were used in the work (which must be distributed under 289 | Sections 1 and 2 above); and, if the work is an executable linked 290 | with the Library, with the complete machine-readable "work that 291 | uses the Library", as object code and/or source code, so that the 292 | user can modify the Library and then relink to produce a modified 293 | executable containing the modified Library. (It is understood 294 | that the user who changes the contents of definitions files in the 295 | Library will not necessarily be able to recompile the application 296 | to use the modified definitions.) 297 | 298 | b) Use a suitable shared library mechanism for linking with the 299 | Library. A suitable mechanism is one that (1) uses at run time a 300 | copy of the library already present on the user's computer system, 301 | rather than copying library functions into the executable, and (2) 302 | will operate properly with a modified version of the library, if 303 | the user installs one, as long as the modified version is 304 | interface-compatible with the version that the work was made with. 305 | 306 | c) Accompany the work with a written offer, valid for at 307 | least three years, to give the same user the materials 308 | specified in Subsection 6a, above, for a charge no more 309 | than the cost of performing this distribution. 310 | 311 | d) If distribution of the work is made by offering access to copy 312 | from a designated place, offer equivalent access to copy the above 313 | specified materials from the same place. 314 | 315 | e) Verify that the user has already received a copy of these 316 | materials or that you have already sent this user a copy. 317 | 318 | For an executable, the required form of the "work that uses the 319 | Library" must include any data and utility programs needed for 320 | reproducing the executable from it. However, as a special exception, 321 | the materials to be distributed need not include anything that is 322 | normally distributed (in either source or binary form) with the major 323 | components (compiler, kernel, and so on) of the operating system on 324 | which the executable runs, unless that component itself accompanies 325 | the executable. 326 | 327 | It may happen that this requirement contradicts the license 328 | restrictions of other proprietary libraries that do not normally 329 | accompany the operating system. Such a contradiction means you cannot 330 | use both them and the Library together in an executable that you 331 | distribute. 332 | 333 | 7. You may place library facilities that are a work based on the 334 | Library side-by-side in a single library together with other library 335 | facilities not covered by this License, and distribute such a combined 336 | library, provided that the separate distribution of the work based on 337 | the Library and of the other library facilities is otherwise 338 | permitted, and provided that you do these two things: 339 | 340 | a) Accompany the combined library with a copy of the same work 341 | based on the Library, uncombined with any other library 342 | facilities. This must be distributed under the terms of the 343 | Sections above. 344 | 345 | b) Give prominent notice with the combined library of the fact 346 | that part of it is a work based on the Library, and explaining 347 | where to find the accompanying uncombined form of the same work. 348 | 349 | 8. You may not copy, modify, sublicense, link with, or distribute 350 | the Library except as expressly provided under this License. Any 351 | attempt otherwise to copy, modify, sublicense, link with, or 352 | distribute the Library is void, and will automatically terminate your 353 | rights under this License. However, parties who have received copies, 354 | or rights, from you under this License will not have their licenses 355 | terminated so long as such parties remain in full compliance. 356 | 357 | 9. You are not required to accept this License, since you have not 358 | signed it. However, nothing else grants you permission to modify or 359 | distribute the Library or its derivative works. These actions are 360 | prohibited by law if you do not accept this License. Therefore, by 361 | modifying or distributing the Library (or any work based on the 362 | Library), you indicate your acceptance of this License to do so, and 363 | all its terms and conditions for copying, distributing or modifying 364 | the Library or works based on it. 365 | 366 | 10. Each time you redistribute the Library (or any work based on the 367 | Library), the recipient automatically receives a license from the 368 | original licensor to copy, distribute, link with or modify the Library 369 | subject to these terms and conditions. You may not impose any further 370 | restrictions on the recipients' exercise of the rights granted herein. 371 | You are not responsible for enforcing compliance by third parties with 372 | this License. 373 | 374 | 11. If, as a consequence of a court judgment or allegation of patent 375 | infringement or for any other reason (not limited to patent issues), 376 | conditions are imposed on you (whether by court order, agreement or 377 | otherwise) that contradict the conditions of this License, they do not 378 | excuse you from the conditions of this License. If you cannot 379 | distribute so as to satisfy simultaneously your obligations under this 380 | License and any other pertinent obligations, then as a consequence you 381 | may not distribute the Library at all. For example, if a patent 382 | license would not permit royalty-free redistribution of the Library by 383 | all those who receive copies directly or indirectly through you, then 384 | the only way you could satisfy both it and this License would be to 385 | refrain entirely from distribution of the Library. 386 | 387 | If any portion of this section is held invalid or unenforceable under any 388 | particular circumstance, the balance of the section is intended to apply, 389 | and the section as a whole is intended to apply in other circumstances. 390 | 391 | It is not the purpose of this section to induce you to infringe any 392 | patents or other property right claims or to contest validity of any 393 | such claims; this section has the sole purpose of protecting the 394 | integrity of the free software distribution system which is 395 | implemented by public license practices. Many people have made 396 | generous contributions to the wide range of software distributed 397 | through that system in reliance on consistent application of that 398 | system; it is up to the author/donor to decide if he or she is willing 399 | to distribute software through any other system and a licensee cannot 400 | impose that choice. 401 | 402 | This section is intended to make thoroughly clear what is believed to 403 | be a consequence of the rest of this License. 404 | 405 | 12. If the distribution and/or use of the Library is restricted in 406 | certain countries either by patents or by copyrighted interfaces, the 407 | original copyright holder who places the Library under this License may add 408 | an explicit geographical distribution limitation excluding those countries, 409 | so that distribution is permitted only in or among countries not thus 410 | excluded. In such case, this License incorporates the limitation as if 411 | written in the body of this License. 412 | 413 | 13. The Free Software Foundation may publish revised and/or new 414 | versions of the Lesser General Public License from time to time. 415 | Such new versions will be similar in spirit to the present version, 416 | but may differ in detail to address new problems or concerns. 417 | 418 | Each version is given a distinguishing version number. If the Library 419 | specifies a version number of this License which applies to it and 420 | "any later version", you have the option of following the terms and 421 | conditions either of that version or of any later version published by 422 | the Free Software Foundation. If the Library does not specify a 423 | license version number, you may choose any version ever published by 424 | the Free Software Foundation. 425 | 426 | 14. If you wish to incorporate parts of the Library into other free 427 | programs whose distribution conditions are incompatible with these, 428 | write to the author to ask for permission. For software which is 429 | copyrighted by the Free Software Foundation, write to the Free 430 | Software Foundation; we sometimes make exceptions for this. Our 431 | decision will be guided by the two goals of preserving the free status 432 | of all derivatives of our free software and of promoting the sharing 433 | and reuse of software generally. 434 | 435 | NO WARRANTY 436 | 437 | 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO 438 | WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 439 | EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR 440 | OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY 441 | KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE 442 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 443 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE 444 | LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME 445 | THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 446 | 447 | 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN 448 | WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY 449 | AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU 450 | FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR 451 | CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE 452 | LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING 453 | RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A 454 | FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF 455 | SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH 456 | DAMAGES. 457 | 458 | END OF TERMS AND CONDITIONS 459 | 460 | How to Apply These Terms to Your New Libraries 461 | 462 | If you develop a new library, and you want it to be of the greatest 463 | possible use to the public, we recommend making it free software that 464 | everyone can redistribute and change. You can do so by permitting 465 | redistribution under these terms (or, alternatively, under the terms of the 466 | ordinary General Public License). 467 | 468 | To apply these terms, attach the following notices to the library. It is 469 | safest to attach them to the start of each source file to most effectively 470 | convey the exclusion of warranty; and each file should have at least the 471 | "copyright" line and a pointer to where the full notice is found. 472 | 473 | 474 | Copyright (C) 475 | 476 | This library is free software; you can redistribute it and/or 477 | modify it under the terms of the GNU Lesser General Public 478 | License as published by the Free Software Foundation; either 479 | version 2.1 of the License, or (at your option) any later version. 480 | 481 | This library is distributed in the hope that it will be useful, 482 | but WITHOUT ANY WARRANTY; without even the implied warranty of 483 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 484 | Lesser General Public License for more details. 485 | 486 | You should have received a copy of the GNU Lesser General Public 487 | License along with this library; if not, write to the Free Software 488 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 489 | USA 490 | 491 | Also add information on how to contact you by electronic and paper mail. 492 | 493 | You should also get your employer (if you work as a programmer) or your 494 | school, if any, to sign a "copyright disclaimer" for the library, if 495 | necessary. Here is a sample; alter the names: 496 | 497 | Yoyodyne, Inc., hereby disclaims all copyright interest in the 498 | library `Frob' (a library for tweaking knobs) written by James Random 499 | Hacker. 500 | 501 | , 1 April 1990 502 | Ty Coon, President of Vice 503 | 504 | That's all there is to it! 505 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # HTMLp 2 | Delphi Dom HTML Parser and Converter. 3 | > Fork from: https://sourceforge.net/projects/htmlp/ 4 | 5 | ## Original Authors 6 | * Krzysztof Kochanski 7 | * [Sébastien Morin](https://github.com/smsisko) 8 | 9 | ## Contributors 10 | * [Roman Yankovsky](https://github.com/RomanYankovsky) 11 | -------------------------------------------------------------------------------- /example/HTMLP.dpr: -------------------------------------------------------------------------------- 1 | program HTMLP; 2 | 3 | uses 4 | Forms, 5 | MainForm in 'MainForm.pas' {HTMLForm}, 6 | HTMLp.DOMCore in '..\HTMLp.DOMCore.pas', 7 | HTMLp.Entities in '..\HTMLp.Entities.pas', 8 | HTMLp.Formatter in '..\HTMLp.Formatter.pas', 9 | HTMLp.Helper in '..\HTMLp.Helper.pas', 10 | HTMLp.HTMLParser in '..\HTMLp.HTMLParser.pas', 11 | HTMLp.HTMLReader in '..\HTMLp.HTMLReader.pas', 12 | HTMLp.HTMLTags in '..\HTMLp.HTMLTags.pas'; 13 | 14 | {$R *.RES} 15 | 16 | begin 17 | Application.Initialize; 18 | Application.CreateForm(THTMLForm, HTMLForm); 19 | Application.Run; 20 | end. 21 | -------------------------------------------------------------------------------- /example/HTMLP.dproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | {711105A8-17EB-4F45-8F90-D0147537B5C7} 4 | HTMLP.dpr 5 | True 6 | Debug 7 | 1153 8 | Application 9 | VCL 10 | 18.8 11 | Win32 12 | 13 | 14 | true 15 | 16 | 17 | true 18 | Base 19 | true 20 | 21 | 22 | true 23 | Base 24 | true 25 | 26 | 27 | true 28 | Base 29 | true 30 | 31 | 32 | true 33 | Cfg_1 34 | true 35 | true 36 | 37 | 38 | true 39 | Base 40 | true 41 | 42 | 43 | true 44 | Cfg_2 45 | true 46 | true 47 | 48 | 49 | true 50 | Cfg_2 51 | true 52 | true 53 | 54 | 55 | false 56 | false 57 | false 58 | false 59 | false 60 | 00400000 61 | HTMLP 62 | Vcl;Vcl.Imaging;Vcl.Touch;Vcl.Samples;Vcl.Shell;System;Xml;Data;Datasnap;Web;Soap;Winapi;$(DCC_Namespace) 63 | 1049 64 | CompanyName=;FileDescription=;FileVersion=1.0.0.0;InternalName=;LegalCopyright=;LegalTrademarks=;OriginalFilename=;ProductName=;ProductVersion=1.0.0.0;Comments=;CFBundleName= 65 | 66 | 67 | System.Win;Data.Win;Datasnap.Win;Web.Win;Soap.Win;Xml.Win;Bde;$(DCC_Namespace) 68 | Debug 69 | true 70 | CompanyName=;FileDescription=$(MSBuildProjectName);FileVersion=1.0.0.0;InternalName=;LegalCopyright=;LegalTrademarks=;OriginalFilename=;ProductName=$(MSBuildProjectName);ProductVersion=1.0.0.0;Comments=;ProgramID=com.embarcadero.$(MSBuildProjectName) 71 | 1033 72 | $(BDS)\bin\default_app.manifest 73 | true 74 | $(BDS)\bin\Artwork\Windows\UWP\delphi_UwpDefault_44.png 75 | $(BDS)\bin\Artwork\Windows\UWP\delphi_UwpDefault_150.png 76 | 77 | 78 | $(BDS)\bin\Artwork\Windows\UWP\delphi_UwpDefault_44.png 79 | $(BDS)\bin\Artwork\Windows\UWP\delphi_UwpDefault_150.png 80 | 81 | 82 | RELEASE;$(DCC_Define) 83 | 0 84 | false 85 | 0 86 | 87 | 88 | true 89 | PerMonitorV2 90 | 91 | 92 | DEBUG;$(DCC_Define) 93 | false 94 | true 95 | 96 | 97 | Debug 98 | 99 | 100 | true 101 | PerMonitorV2 102 | CompanyName=;FileDescription=$(MSBuildProjectName);FileVersion=1.0.0.0;InternalName=;LegalCopyright=;LegalTrademarks=;OriginalFilename=;ProductName=$(MSBuildProjectName);ProductVersion=1.0.0.0;Comments=;ProgramID=com.embarcadero.$(MSBuildProjectName) 103 | 3 104 | 105 | 106 | 107 | MainSource 108 | 109 | 110 |
HTMLForm
111 |
112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | Cfg_2 121 | Base 122 | 123 | 124 | Base 125 | 126 | 127 | Cfg_1 128 | Base 129 | 130 |
131 | 132 | Delphi.Personality.12 133 | 134 | 135 | 136 | 137 | HTMLP.dpr 138 | 139 | 140 | 141 | True 142 | True 143 | True 144 | False 145 | 146 | 147 | 148 | 149 | HTMLP.exe 150 | true 151 | 152 | 153 | 154 | 155 | 1 156 | 157 | 158 | Contents\MacOS 159 | 1 160 | 161 | 162 | 0 163 | 164 | 165 | 166 | 167 | classes 168 | 1 169 | 170 | 171 | classes 172 | 1 173 | 174 | 175 | 176 | 177 | res\xml 178 | 1 179 | 180 | 181 | res\xml 182 | 1 183 | 184 | 185 | 186 | 187 | library\lib\armeabi-v7a 188 | 1 189 | 190 | 191 | 192 | 193 | library\lib\armeabi 194 | 1 195 | 196 | 197 | library\lib\armeabi 198 | 1 199 | 200 | 201 | 202 | 203 | library\lib\armeabi-v7a 204 | 1 205 | 206 | 207 | 208 | 209 | library\lib\mips 210 | 1 211 | 212 | 213 | library\lib\mips 214 | 1 215 | 216 | 217 | 218 | 219 | library\lib\armeabi-v7a 220 | 1 221 | 222 | 223 | library\lib\arm64-v8a 224 | 1 225 | 226 | 227 | 228 | 229 | library\lib\armeabi-v7a 230 | 1 231 | 232 | 233 | 234 | 235 | res\drawable 236 | 1 237 | 238 | 239 | res\drawable 240 | 1 241 | 242 | 243 | 244 | 245 | res\values 246 | 1 247 | 248 | 249 | res\values 250 | 1 251 | 252 | 253 | 254 | 255 | res\values-v21 256 | 1 257 | 258 | 259 | res\values-v21 260 | 1 261 | 262 | 263 | 264 | 265 | res\values 266 | 1 267 | 268 | 269 | res\values 270 | 1 271 | 272 | 273 | 274 | 275 | res\drawable 276 | 1 277 | 278 | 279 | res\drawable 280 | 1 281 | 282 | 283 | 284 | 285 | res\drawable-xxhdpi 286 | 1 287 | 288 | 289 | res\drawable-xxhdpi 290 | 1 291 | 292 | 293 | 294 | 295 | res\drawable-ldpi 296 | 1 297 | 298 | 299 | res\drawable-ldpi 300 | 1 301 | 302 | 303 | 304 | 305 | res\drawable-mdpi 306 | 1 307 | 308 | 309 | res\drawable-mdpi 310 | 1 311 | 312 | 313 | 314 | 315 | res\drawable-hdpi 316 | 1 317 | 318 | 319 | res\drawable-hdpi 320 | 1 321 | 322 | 323 | 324 | 325 | res\drawable-xhdpi 326 | 1 327 | 328 | 329 | res\drawable-xhdpi 330 | 1 331 | 332 | 333 | 334 | 335 | res\drawable-mdpi 336 | 1 337 | 338 | 339 | res\drawable-mdpi 340 | 1 341 | 342 | 343 | 344 | 345 | res\drawable-hdpi 346 | 1 347 | 348 | 349 | res\drawable-hdpi 350 | 1 351 | 352 | 353 | 354 | 355 | res\drawable-xhdpi 356 | 1 357 | 358 | 359 | res\drawable-xhdpi 360 | 1 361 | 362 | 363 | 364 | 365 | res\drawable-xxhdpi 366 | 1 367 | 368 | 369 | res\drawable-xxhdpi 370 | 1 371 | 372 | 373 | 374 | 375 | res\drawable-xxxhdpi 376 | 1 377 | 378 | 379 | res\drawable-xxxhdpi 380 | 1 381 | 382 | 383 | 384 | 385 | res\drawable-small 386 | 1 387 | 388 | 389 | res\drawable-small 390 | 1 391 | 392 | 393 | 394 | 395 | res\drawable-normal 396 | 1 397 | 398 | 399 | res\drawable-normal 400 | 1 401 | 402 | 403 | 404 | 405 | res\drawable-large 406 | 1 407 | 408 | 409 | res\drawable-large 410 | 1 411 | 412 | 413 | 414 | 415 | res\drawable-xlarge 416 | 1 417 | 418 | 419 | res\drawable-xlarge 420 | 1 421 | 422 | 423 | 424 | 425 | res\values 426 | 1 427 | 428 | 429 | res\values 430 | 1 431 | 432 | 433 | 434 | 435 | 1 436 | 437 | 438 | Contents\MacOS 439 | 1 440 | 441 | 442 | 0 443 | 444 | 445 | 446 | 447 | Contents\MacOS 448 | 1 449 | .framework 450 | 451 | 452 | Contents\MacOS 453 | 1 454 | .framework 455 | 456 | 457 | 0 458 | 459 | 460 | 461 | 462 | 1 463 | .dylib 464 | 465 | 466 | 1 467 | .dylib 468 | 469 | 470 | 1 471 | .dylib 472 | 473 | 474 | Contents\MacOS 475 | 1 476 | .dylib 477 | 478 | 479 | Contents\MacOS 480 | 1 481 | .dylib 482 | 483 | 484 | 0 485 | .dll;.bpl 486 | 487 | 488 | 489 | 490 | 1 491 | .dylib 492 | 493 | 494 | 1 495 | .dylib 496 | 497 | 498 | 1 499 | .dylib 500 | 501 | 502 | Contents\MacOS 503 | 1 504 | .dylib 505 | 506 | 507 | Contents\MacOS 508 | 1 509 | .dylib 510 | 511 | 512 | 0 513 | .bpl 514 | 515 | 516 | 517 | 518 | 0 519 | 520 | 521 | 0 522 | 523 | 524 | 0 525 | 526 | 527 | 0 528 | 529 | 530 | 0 531 | 532 | 533 | Contents\Resources\StartUp\ 534 | 0 535 | 536 | 537 | Contents\Resources\StartUp\ 538 | 0 539 | 540 | 541 | 0 542 | 543 | 544 | 545 | 546 | 1 547 | 548 | 549 | 1 550 | 551 | 552 | 1 553 | 554 | 555 | 556 | 557 | 1 558 | 559 | 560 | 1 561 | 562 | 563 | 1 564 | 565 | 566 | 567 | 568 | 1 569 | 570 | 571 | 1 572 | 573 | 574 | 1 575 | 576 | 577 | 578 | 579 | 1 580 | 581 | 582 | 1 583 | 584 | 585 | 1 586 | 587 | 588 | 589 | 590 | 1 591 | 592 | 593 | 1 594 | 595 | 596 | 1 597 | 598 | 599 | 600 | 601 | 1 602 | 603 | 604 | 1 605 | 606 | 607 | 1 608 | 609 | 610 | 611 | 612 | 1 613 | 614 | 615 | 1 616 | 617 | 618 | 1 619 | 620 | 621 | 622 | 623 | 1 624 | 625 | 626 | 1 627 | 628 | 629 | 1 630 | 631 | 632 | 633 | 634 | 1 635 | 636 | 637 | 1 638 | 639 | 640 | 1 641 | 642 | 643 | 644 | 645 | 1 646 | 647 | 648 | 1 649 | 650 | 651 | 1 652 | 653 | 654 | 655 | 656 | 1 657 | 658 | 659 | 1 660 | 661 | 662 | 1 663 | 664 | 665 | 666 | 667 | 1 668 | 669 | 670 | 1 671 | 672 | 673 | 1 674 | 675 | 676 | 677 | 678 | 1 679 | 680 | 681 | 1 682 | 683 | 684 | 1 685 | 686 | 687 | 688 | 689 | 1 690 | 691 | 692 | 1 693 | 694 | 695 | 1 696 | 697 | 698 | 699 | 700 | 1 701 | 702 | 703 | 1 704 | 705 | 706 | 1 707 | 708 | 709 | 710 | 711 | 1 712 | 713 | 714 | 1 715 | 716 | 717 | 1 718 | 719 | 720 | 721 | 722 | 1 723 | 724 | 725 | 1 726 | 727 | 728 | 1 729 | 730 | 731 | 732 | 733 | 1 734 | 735 | 736 | 1 737 | 738 | 739 | 1 740 | 741 | 742 | 743 | 744 | 1 745 | 746 | 747 | 1 748 | 749 | 750 | 1 751 | 752 | 753 | 754 | 755 | 1 756 | 757 | 758 | 1 759 | 760 | 761 | 1 762 | 763 | 764 | 765 | 766 | 1 767 | 768 | 769 | 1 770 | 771 | 772 | 1 773 | 774 | 775 | 776 | 777 | 1 778 | 779 | 780 | 1 781 | 782 | 783 | 1 784 | 785 | 786 | 787 | 788 | 1 789 | 790 | 791 | 1 792 | 793 | 794 | 1 795 | 796 | 797 | 798 | 799 | 1 800 | 801 | 802 | 1 803 | 804 | 805 | 1 806 | 807 | 808 | 809 | 810 | 1 811 | 812 | 813 | 1 814 | 815 | 816 | 817 | 818 | ..\$(PROJECTNAME).app.dSYM\Contents\Resources\DWARF 819 | 1 820 | 821 | 822 | ..\$(PROJECTNAME).app.dSYM\Contents\Resources\DWARF 823 | 1 824 | 825 | 826 | 827 | 828 | 1 829 | 830 | 831 | 1 832 | 833 | 834 | 835 | 836 | ..\ 837 | 1 838 | 839 | 840 | ..\ 841 | 1 842 | 843 | 844 | 845 | 846 | 1 847 | 848 | 849 | 1 850 | 851 | 852 | 1 853 | 854 | 855 | 856 | 857 | 1 858 | 859 | 860 | 1 861 | 862 | 863 | 1 864 | 865 | 866 | 867 | 868 | ..\$(PROJECTNAME).app.dSYM\Contents\Resources\DWARF 869 | 1 870 | 871 | 872 | 873 | 874 | ..\ 875 | 1 876 | 877 | 878 | ..\ 879 | 1 880 | 881 | 882 | 883 | 884 | Contents 885 | 1 886 | 887 | 888 | Contents 889 | 1 890 | 891 | 892 | 893 | 894 | Contents\Resources 895 | 1 896 | 897 | 898 | Contents\Resources 899 | 1 900 | 901 | 902 | 903 | 904 | library\lib\armeabi-v7a 905 | 1 906 | 907 | 908 | library\lib\arm64-v8a 909 | 1 910 | 911 | 912 | 1 913 | 914 | 915 | 1 916 | 917 | 918 | 1 919 | 920 | 921 | 1 922 | 923 | 924 | Contents\MacOS 925 | 1 926 | 927 | 928 | Contents\MacOS 929 | 1 930 | 931 | 932 | 0 933 | 934 | 935 | 936 | 937 | library\lib\armeabi-v7a 938 | 1 939 | 940 | 941 | 942 | 943 | 1 944 | 945 | 946 | 1 947 | 948 | 949 | 950 | 951 | Assets 952 | 1 953 | 954 | 955 | Assets 956 | 1 957 | 958 | 959 | 960 | 961 | Assets 962 | 1 963 | 964 | 965 | Assets 966 | 1 967 | 968 | 969 | 970 | 971 | 972 | 973 | 974 | 975 | 976 | 977 | 978 | 979 | 980 | 981 | 12 982 | 983 | 984 | 985 | 986 |
987 | -------------------------------------------------------------------------------- /example/MainForm.dfm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ange007/HTMLp/706333a1b28b0b62ba54bb3eb367bfbc78b33e8f/example/MainForm.dfm -------------------------------------------------------------------------------- /example/MainForm.pas: -------------------------------------------------------------------------------- 1 | unit MainForm; 2 | 3 | interface 4 | 5 | uses 6 | Windows, Messages, SysUtils, Classes, Graphics, Controls, Forms, Dialogs, 7 | StdCtrls, ExtCtrls, IdHTTP, StrUtils, 8 | 9 | HTMLp.HTMLParser, HTMLp.Helper, Buttons; 10 | 11 | type 12 | THTMLForm = class(TForm) 13 | TopPanel: TPanel; 14 | FileNameEdit: TEdit; 15 | BrowseButton: TButton; 16 | ToTextButton: TButton; 17 | Memo: TMemo; 18 | OpenDialog: TOpenDialog; 19 | PanelXPath: TPanel; 20 | UseXPathButton: TSpeedButton; 21 | XPathEdit: TEdit; 22 | MultiThreadTest: TButton; 23 | procedure BrowseButtonClick(Sender: TObject); 24 | procedure ToTextButtonClick(Sender: TObject); 25 | procedure UseXPathButtonClick(Sender: TObject); 26 | procedure MultiThreadTestClick(Sender: TObject); 27 | public 28 | private 29 | HTMLParser: THTMLParser; 30 | end; 31 | 32 | function GetHTML(const fileName: string): string; 33 | 34 | var 35 | HTMLForm: THTMLForm; 36 | 37 | implementation 38 | 39 | {$R *.DFM} 40 | 41 | uses 42 | HTMLp.DomCore, HTMLp.Formatter; 43 | 44 | function GetHTML(const fileName: string): string; 45 | var 46 | F: TStringStream; 47 | begin 48 | if Pos('http', fileName) = 1 then 49 | begin 50 | with TIdHTTP.Create(nil) do 51 | begin 52 | HandleRedirects := True; 53 | 54 | try Result := Get(fileName); 55 | except 56 | on e: Exception do ShowMessage(e.Message); 57 | end; 58 | 59 | Free; 60 | end; 61 | end 62 | else 63 | begin 64 | F := TStringStream.Create({'', TEncoding.UTF8}); 65 | try 66 | F.LoadFromFile(fileName); 67 | Result := F.DataString; 68 | finally 69 | F.Free 70 | end; 71 | end; 72 | end; 73 | 74 | procedure THTMLForm.MultiThreadTestClick(Sender: TObject); 75 | var 76 | i: Integer; 77 | thread: TThread; 78 | begin 79 | Memo.Clear; 80 | 81 | for i := 0 to 50 do 82 | begin 83 | thread := TThread.CreateAnonymousThread(procedure( ) 84 | var 85 | HTML, body: string; 86 | HTMLDoc: TDocument; 87 | begin 88 | with THTMLParser.Create do 89 | begin 90 | HTML := GetHTML(FileNameEdit.Text); 91 | HTMLDoc := parseString(HTML); 92 | body := HTMLDoc.GetInnerHTML; 93 | 94 | FreeAndNil(HTMLDoc); 95 | Free; 96 | end; 97 | 98 | TThread.Synchronize(TThread.Current, procedure begin HTMLForm.Memo.Lines.Add(IfThen(body <> '', 'GOOD', 'BAD')); end); 99 | end); 100 | 101 | {} 102 | thread.Start; 103 | end; 104 | end; 105 | 106 | procedure THTMLForm.BrowseButtonClick(Sender: TObject); 107 | begin 108 | if OpenDialog.Execute then FileNameEdit.Text := OpenDialog.FileName 109 | end; 110 | 111 | procedure THTMLForm.UseXPathButtonClick(Sender: TObject); 112 | var 113 | HTML: string; 114 | begin 115 | HTML := GetHTML(FileNameEdit.Text); 116 | Memo.Clear; 117 | 118 | ParseHTML(HTML).Find(XPathEdit.Text).Map( procedure(AIndex: Integer; AEl: TElement) 119 | begin 120 | Memo.Lines.Add(AEl.Value); 121 | end ); 122 | 123 | Memo.SelStart := 0; 124 | Memo.SelLength := 0; 125 | end; 126 | 127 | procedure THTMLForm.ToTextButtonClick(Sender: TObject); 128 | var 129 | HTML: string; 130 | HTMLDoc: TDocument; 131 | Formatter: TBaseFormatter; 132 | begin 133 | HTML := GetHTML(FileNameEdit.Text); 134 | Memo.Clear; 135 | 136 | HTMLParser := THTMLParser.Create; 137 | try 138 | HTMLDoc := HTMLParser.parseString(HTML); 139 | finally 140 | FreeAndNil(HTMLParser); 141 | end; 142 | 143 | Formatter := TTextFormatter.Create; 144 | try 145 | Memo.Lines.Text := Formatter.getText(HTMLDoc); 146 | finally 147 | FreeAndNil(Formatter); 148 | end; 149 | 150 | FreeAndNil(HTMLDoc); 151 | 152 | Memo.SelStart := 0; 153 | Memo.SelLength := 0; 154 | end; 155 | 156 | end. 157 | -------------------------------------------------------------------------------- /example/index.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ange007/HTMLp/706333a1b28b0b62ba54bb3eb367bfbc78b33e8f/example/index.html --------------------------------------------------------------------------------