├── .gitattributes ├── .gitignore ├── README.md ├── cx-extractor.sln ├── cx-extractor ├── App.config ├── Demo.cs ├── Properties │ └── AssemblyInfo.cs ├── TextExtract.cs ├── cx-extractor.csproj └── cx-extractor.csproj.user └── 基于行块分布函数的通用网页正文抽取算法.pdf /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Windows image file caches 2 | Thumbs.db 3 | ehthumbs.db 4 | 5 | # Folder config file 6 | Desktop.ini 7 | 8 | # Recycle Bin used on file shares 9 | $RECYCLE.BIN/ 10 | 11 | # Windows Installer files 12 | *.cab 13 | *.msi 14 | *.msm 15 | *.msp 16 | 17 | # Windows shortcuts 18 | *.lnk 19 | 20 | # ========================= 21 | # Operating System Files 22 | # ========================= 23 | 24 | # OSX 25 | # ========================= 26 | 27 | .DS_Store 28 | .AppleDouble 29 | .LSOverride 30 | 31 | # Thumbnails 32 | ._* 33 | 34 | # Files that might appear on external disk 35 | .Spotlight-V100 36 | .Trashes 37 | 38 | # Directories potentially created on remote AFP share 39 | .AppleDB 40 | .AppleDesktop 41 | Network Trash Folder 42 | Temporary Items 43 | .apdisk 44 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # cx_extractor 2 | 基于行块分布函数的通用网页正文抽取 3 | 
由于本人做网络爬虫的时候使用过这个方法的java版本，现在自己把java版本的改成了C#版本，希望对大家有用。基于行块分布函数的通用网页正文抽取：线性时间、不建DOM树、与HTML标签无关。原版本的作者为哈工大的陈鑫，代码放在google 4 | code上。https://code.google.com/p/cx-extractor/ 5 | -------------------------------------------------------------------------------- /cx-extractor.sln: -------------------------------------------------------------------------------- 1 | 2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 2012 4 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "cx-extractor", "cx-extractor\cx-extractor.csproj", "{0676C874-7AE3-4501-86FF-EDCFD36C4EA5}" 5 | EndProject 6 | Global 7 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 8 | Debug|Any CPU = Debug|Any CPU 9 | Release|Any CPU = Release|Any CPU 10 | EndGlobalSection 11 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 12 | {0676C874-7AE3-4501-86FF-EDCFD36C4EA5}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 13 | {0676C874-7AE3-4501-86FF-EDCFD36C4EA5}.Debug|Any CPU.Build.0 = Debug|Any CPU 14 | {0676C874-7AE3-4501-86FF-EDCFD36C4EA5}.Release|Any CPU.ActiveCfg = Release|Any CPU 15 | {0676C874-7AE3-4501-86FF-EDCFD36C4EA5}.Release|Any CPU.Build.0 = Release|Any CPU 16 | EndGlobalSection 17 | GlobalSection(SolutionProperties) = preSolution 18 | HideSolutionNode = FALSE 19 | EndGlobalSection 20 | EndGlobal 21 | -------------------------------------------------------------------------------- /cx-extractor/App.config: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /cx-extractor/Demo.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Text; 3 | using System.IO; 4 | 5 | namespace cx_extractor 6 | { 7 | class Demo 8 | { 9 | static void Main(string[] args) 10 | { 11 | //demo 12 | StreamReader objReader = new StreamReader("E:\\Documents\\123.html", Encoding.Default); 
using System;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;

namespace cx_extractor
{
    /// <summary>
    /// Extracts the main body text of a web page by looking at how much text falls into
    /// sliding blocks of consecutive lines. Markup is removed with regular expressions;
    /// no DOM is built.
    /// </summary>
    /// <remarks>
    /// All working state lives in static fields, so overlapping calls from several threads
    /// would read and overwrite each other's state.
    /// </remarks>
    class TextExtract
    {
        // Lines of the pre-processed page, with whitespace stripped.
        private static string[] lines;

        // Number of consecutive lines summed into one block.
        private static int blocksWidth = 3;

        /* If block-shaped news headlines are not being filtered out of the extracted text,
         * increase this threshold. */
        /* A larger threshold raises precision and lowers recall; a smaller one lets more
         * noise through but still catches a body that is only a single sentence. */
        private static int threshold = 86;

        // Page currently being processed; rewritten in place by preProcess().
        private static String html;

        // Stored by parse(String, Boolean) but not read anywhere in this class.
        private static Boolean flag = false;

        // First and last line index of the text block currently being collected.
        private static int start;
        private static int end;

        // Accumulates the extracted text; cleared at the start of every getText() call.
        private static StringBuilder text = new StringBuilder();

        // indexDistribution[i] = total characters in lines[i .. i + blocksWidth - 1].
        private static List<int> indexDistribution = new List<int>();

        /// <summary>
        /// Extracts the body text of a page without any topic-page check.
        /// Equivalent to <c>parse(_html, false)</c>.
        /// </summary>
        /// <param name="_html">HTML source of the page.</param>
        /// <returns>The extracted body text, one kept line per '\n'-terminated line.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="_html"/> is null.</exception>
        public static String parse(String _html)
        {
            return parse(_html, false);
        }

        /// <summary>
        /// Extracts the body text of a page.
        /// </summary>
        /// <param name="_html">HTML source of the page.</param>
        /// <param name="_flag">
        /// Intended to request a topic-page check. The value is stored but never consulted
        /// by the code in this class, so both overloads currently return the same result.
        /// </param>
        /// <returns>The extracted body text, one kept line per '\n'-terminated line.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="_html"/> is null.</exception>
        public static String parse(String _html, Boolean _flag)
        {
            if (_html == null)
            {
                throw new ArgumentNullException("_html");
            }

            flag = _flag;
            html = _html;
            preProcess();
            return getText();
        }

        /// <summary>
        /// Strips markup from <c>html</c> in place: DOCTYPE, comments, script and style
        /// elements, character entities, all remaining tags, then '\r', '\t' and spaces.
        /// Line breaks ('\n') are kept because getText() splits on them.
        /// </summary>
        private static void preProcess()
        {
            // NOTE(review): the tag portions of the first four patterns were missing from the
            // source under review (the patterns matched only the empty string). They are
            // restored here from the intent stated in the trailing comments - confirm.
            html = Regex.Replace(html, "(?is)<!DOCTYPE.*?>", "");
            html = Regex.Replace(html, "(?is)<!--.*?-->", "");                // remove html comment
            html = Regex.Replace(html, "(?is)<script.*?>.*?</script>", "");   // remove javascript
            html = Regex.Replace(html, "(?is)<style.*?>.*?</style>", "");     // remove css
            html = Regex.Replace(html, "&.{2,5};|&#.{2,5};", "");             // remove special char
            html = Regex.Replace(html, "(?is)<.*?>", "");                     // remove remaining tags
            html = html.Replace("\r", "");
            html = html.Replace("\t", "");
            html = html.Replace(" ", "");
        }

        /// <summary>
        /// Returns true when at least one of the three blocks after <paramref name="index"/>
        /// exists and is non-empty. Positions past the end of the distribution count as empty.
        /// </summary>
        private static Boolean hasTextAhead(int index)
        {
            for (int offset = 1; offset <= 3; offset++)
            {
                int next = index + offset;
                if (next < indexDistribution.Count && indexDistribution[next] != 0)
                {
                    return true;
                }
            }
            return false;
        }

        /// <summary>
        /// Splits the pre-processed page into lines, builds the block length distribution and
        /// concatenates every run of lines that starts at a block longer than
        /// <c>threshold</c> and ends where a block, or the one after it, is empty.
        /// </summary>
        /// <returns>The collected text; lines shorter than 5 characters are dropped.</returns>
        private static String getText()
        {
            lines = html.Split('\n');
            indexDistribution.Clear();

            // Strip whitespace once per line. String.Replace("\\s", "") only removes the
            // literal two characters "\s", so a regular expression is required here.
            for (int i = 0; i < lines.Length; i++)
            {
                lines[i] = Regex.Replace(lines[i], "\\s+", "");
            }

            for (int i = 0; i < lines.Length - blocksWidth; i++)
            {
                int wordsNum = 0;
                for (int j = i; j < i + blocksWidth; j++)
                {
                    wordsNum += lines[j].Length;
                }
                indexDistribution.Add(wordsNum);
            }

            start = -1;
            end = -1;
            Boolean boolstart = false, boolend = false;
            text.Length = 0;

            for (int i = 0; i < indexDistribution.Count - 1; i++)
            {
                // A block opens on a dense position that is followed by more text.
                if (indexDistribution[i] > threshold && !boolstart)
                {
                    if (hasTextAhead(i))
                    {
                        boolstart = true;
                        start = i;
                        continue;
                    }
                }

                // A block closes at the first position where this or the next entry is empty.
                if (boolstart)
                {
                    if (indexDistribution[i] == 0
                        || indexDistribution[i + 1] == 0)
                    {
                        end = i;
                        boolend = true;
                    }
                }

                if (boolend)
                {
                    StringBuilder tmp = new StringBuilder();
                    for (int ii = start; ii <= end; ii++)
                    {
                        if (lines[ii].Length < 5)
                        {
                            continue;
                        }
                        tmp.Append(lines[ii]).Append("\n");
                    }

                    String str = tmp.ToString();

                    // NOTE(review): the open/closed state is deliberately left untouched here,
                    // exactly as before. Once a block contains a copyright marker, every later
                    // iteration rebuilds a block from the same start, so nothing after it is
                    // emitted. Confirm this is intended.
                    if (str.Contains("Copyright") || str.Contains("版权所有"))
                    {
                        continue;
                    }

                    text.Append(str);
                    boolstart = boolend = false;
                }
            }

            return text.ToString();
        }

        /// <summary>
        /// Sets the minimum block length a position must exceed to start a text block.
        /// </summary>
        /// <param name="value">New threshold, in characters.</param>
        public static void setthreshold(int value)
        {
            threshold = value;
        }
    }
}