├── HtmlParser.h ├── HtmlParser.cpp ├── test ├── test-html-parser.cpp ├── testfiles │ ├── 163.com.html │ ├── qq.com.html │ ├── baidu.com.html │ ├── cnbeta.com.html │ ├── sohu.com.html │ ├── taobao.com.html │ ├── sina.com.cn.html │ └── google.com.html ├── test_html_parser.dsw └── test_html_parser.dsp └── README.md /HtmlParser.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liigo/html-parser/HEAD/HtmlParser.h -------------------------------------------------------------------------------- /HtmlParser.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liigo/html-parser/HEAD/HtmlParser.cpp -------------------------------------------------------------------------------- /test/test-html-parser.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liigo/html-parser/HEAD/test/test-html-parser.cpp -------------------------------------------------------------------------------- /test/testfiles/163.com.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liigo/html-parser/HEAD/test/testfiles/163.com.html -------------------------------------------------------------------------------- /test/testfiles/qq.com.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liigo/html-parser/HEAD/test/testfiles/qq.com.html -------------------------------------------------------------------------------- /test/testfiles/baidu.com.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liigo/html-parser/HEAD/test/testfiles/baidu.com.html -------------------------------------------------------------------------------- /test/testfiles/cnbeta.com.html: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/liigo/html-parser/HEAD/test/testfiles/cnbeta.com.html -------------------------------------------------------------------------------- /test/testfiles/sohu.com.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liigo/html-parser/HEAD/test/testfiles/sohu.com.html -------------------------------------------------------------------------------- /test/testfiles/taobao.com.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liigo/html-parser/HEAD/test/testfiles/taobao.com.html -------------------------------------------------------------------------------- /test/testfiles/sina.com.cn.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liigo/html-parser/HEAD/test/testfiles/sina.com.cn.html -------------------------------------------------------------------------------- /test/test_html_parser.dsw: -------------------------------------------------------------------------------- 1 | Microsoft Developer Studio Workspace File, Format Version 6.00 2 | # WARNING: DO NOT EDIT OR DELETE THIS WORKSPACE FILE! 
3 | 4 | ############################################################################### 5 | 6 | Project: "test_html_parser"=".\test_html_parser.dsp" - Package Owner=<4> 7 | 8 | Package=<5> 9 | {{{ 10 | }}} 11 | 12 | Package=<4> 13 | {{{ 14 | }}} 15 | 16 | ############################################################################### 17 | 18 | Global: 19 | 20 | Package=<5> 21 | {{{ 22 | }}} 23 | 24 | Package=<3> 25 | {{{ 26 | }}} 27 | 28 | ############################################################################### 29 | 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 基于节点的HTML文本解析器的设计和实现 2 | ------------------------------------ 3 | by Liigo http://blog.csdn.net/liigo/article/details/6153829 4 | 5 | 6 | 众所周知,HTML是结构化文档(Structured Document),由诸多标签(`<p>`等)嵌套形成的著名的文档对象模型(DOM, Document Object Model),是显而易见的树形多层次结构。如果带着这种思路看待HTML、编写HTML解析器,无疑将导致问题复杂化。不妨从另一视角俯视HTML文本,视其为一维线状结构:诸多单一节点的顺序排列。仔细审视任何一段HTML文本,以左右尖括号(`<`和`>`)为边界,会发现HTML文本被天然地分割为:一个标签(Tag),接一段普通文字,再一个标签,再一段普通文字…… 如下图所示: 7 | 8 | ![HTML: Tags + Texts](http://hi.csdn.net/attachment/201101/19/11443_1295450955dZ04.png) 9 | 10 | 标签有两种,开始标签(如`<p>`)和结束标签(`</p>`),它们和普通文字一起,顺序排列,共同构成了HTML文本的全部。 11 | 12 | 为了再次简化编程模型,我(liigo)继续将“开始标签”“结束标签”“普通文字”三者统一抽象归纳为“节点”(HtmlNode),相应的,“节点”有三种类型,要么是开始标签,要么是结束标签,要么是普通文字。现在,HTML在我们眼里更加单纯了,它就是“节点”的线性顺序组合,是一维的“节点”数组。如下图所示:HTML文本 = 节点1 + 节点2 + 节点3 + …… 13 | 14 | ![HTML Nodes(tags/texts)](http://hi.csdn.net/attachment/201101/19/11443_12954566710NLB.png) 15 | -------------------------------------------------------------------------------- /test/test_html_parser.dsp: -------------------------------------------------------------------------------- 1 | # Microsoft Developer Studio Project File - Name="test_html_parser" - Package Owner=<4> 2 | # Microsoft Developer Studio Generated Build File, Format Version 6.00 3 | # ** DO NOT EDIT ** 4 | 5 | # TARGTYPE "Win32 (x86) 
Console Application" 0x0103 6 | 7 | CFG=test_html_parser - Win32 Debug 8 | !MESSAGE This is not a valid makefile. To build this project using NMAKE, 9 | !MESSAGE use the Export Makefile command and run 10 | !MESSAGE 11 | !MESSAGE NMAKE /f "test_html_parser.mak". 12 | !MESSAGE 13 | !MESSAGE You can specify a configuration when running NMAKE 14 | !MESSAGE by defining the macro CFG on the command line. For example: 15 | !MESSAGE 16 | !MESSAGE NMAKE /f "test_html_parser.mak" CFG="test_html_parser - Win32 Debug" 17 | !MESSAGE 18 | !MESSAGE Possible choices for configuration are: 19 | !MESSAGE 20 | !MESSAGE "test_html_parser - Win32 Release" (based on "Win32 (x86) Console Application") 21 | !MESSAGE "test_html_parser - Win32 Debug" (based on "Win32 (x86) Console Application") 22 | !MESSAGE 23 | 24 | # Begin Project 25 | # PROP AllowPerConfigDependencies 0 26 | # PROP Scc_ProjName "" 27 | # PROP Scc_LocalPath "" 28 | CPP=cl.exe 29 | RSC=rc.exe 30 | 31 | !IF "$(CFG)" == "test_html_parser - Win32 Release" 32 | 33 | # PROP BASE Use_MFC 0 34 | # PROP BASE Use_Debug_Libraries 0 35 | # PROP BASE Output_Dir "Release" 36 | # PROP BASE Intermediate_Dir "Release" 37 | # PROP BASE Target_Dir "" 38 | # PROP Use_MFC 0 39 | # PROP Use_Debug_Libraries 0 40 | # PROP Output_Dir "Release" 41 | # PROP Intermediate_Dir "Release" 42 | # PROP Target_Dir "" 43 | # ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c 44 | # ADD CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c 45 | # ADD BASE RSC /l 0x804 /d "NDEBUG" 46 | # ADD RSC /l 0x804 /d "NDEBUG" 47 | BSC32=bscmake.exe 48 | # ADD BASE BSC32 /nologo 49 | # ADD BSC32 /nologo 50 | LINK32=link.exe 51 | # ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib 
odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386 52 | # ADD LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386 53 | 54 | !ELSEIF "$(CFG)" == "test_html_parser - Win32 Debug" 55 | 56 | # PROP BASE Use_MFC 0 57 | # PROP BASE Use_Debug_Libraries 1 58 | # PROP BASE Output_Dir "Debug" 59 | # PROP BASE Intermediate_Dir "Debug" 60 | # PROP BASE Target_Dir "" 61 | # PROP Use_MFC 0 62 | # PROP Use_Debug_Libraries 1 63 | # PROP Output_Dir "Debug" 64 | # PROP Intermediate_Dir "Debug" 65 | # PROP Target_Dir "" 66 | # ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c 67 | # ADD CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /FR /YX /FD /GZ /c 68 | # ADD BASE RSC /l 0x804 /d "_DEBUG" 69 | # ADD RSC /l 0x804 /d "_DEBUG" 70 | BSC32=bscmake.exe 71 | # ADD BASE BSC32 /nologo 72 | # ADD BSC32 /nologo 73 | LINK32=link.exe 74 | # ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept 75 | # ADD LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept 76 | 77 | !ENDIF 78 | 79 | # Begin Target 80 | 81 | # Name 
"test_html_parser - Win32 Release" 82 | # Name "test_html_parser - Win32 Debug" 83 | # Begin Group "Source Files" 84 | 85 | # PROP Default_Filter "cpp;c;cxx;rc;def;r;odl;idl;hpj;bat" 86 | # Begin Source File 87 | 88 | SOURCE=..\HtmlParser.cpp 89 | # End Source File 90 | # Begin Source File 91 | 92 | SOURCE=".\test-html-parser.cpp" 93 | # End Source File 94 | # End Group 95 | # Begin Group "Header Files" 96 | 97 | # PROP Default_Filter "h;hpp;hxx;hm;inl" 98 | # Begin Source File 99 | 100 | SOURCE=..\HtmlParser.h 101 | # End Source File 102 | # End Group 103 | # Begin Group "Resource Files" 104 | 105 | # PROP Default_Filter "ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe" 106 | # End Group 107 | # End Target 108 | # End Project 109 | -------------------------------------------------------------------------------- /test/testfiles/google.com.html: -------------------------------------------------------------------------------- 1 | Google Screen reader users, click here to turn off Google Instant.

--------------------------------------------------------------------------------