├── .gitattributes
├── .gitignore
├── README.md
├── cx-extractor.sln
├── cx-extractor
│   ├── App.config
│   ├── Demo.cs
│   ├── Properties
│   │   └── AssemblyInfo.cs
│   ├── TextExtract.cs
│   ├── cx-extractor.csproj
│   └── cx-extractor.csproj.user
└── 基于行块分布函数的通用网页正文抽取算法.pdf
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 |
4 | # Custom for Visual Studio
5 | *.cs diff=csharp
6 |
7 | # Standard to msysgit
8 | *.doc diff=astextplain
9 | *.DOC diff=astextplain
10 | *.docx diff=astextplain
11 | *.DOCX diff=astextplain
12 | *.dot diff=astextplain
13 | *.DOT diff=astextplain
14 | *.pdf diff=astextplain
15 | *.PDF diff=astextplain
16 | *.rtf diff=astextplain
17 | *.RTF diff=astextplain
18 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Windows image file caches
2 | Thumbs.db
3 | ehthumbs.db
4 |
5 | # Folder config file
6 | Desktop.ini
7 |
8 | # Recycle Bin used on file shares
9 | $RECYCLE.BIN/
10 |
11 | # Windows Installer files
12 | *.cab
13 | *.msi
14 | *.msm
15 | *.msp
16 |
17 | # Windows shortcuts
18 | *.lnk
19 |
20 | # =========================
21 | # Operating System Files
22 | # =========================
23 |
24 | # OSX
25 | # =========================
26 |
27 | .DS_Store
28 | .AppleDouble
29 | .LSOverride
30 |
31 | # Thumbnails
32 | ._*
33 |
34 | # Files that might appear on external disk
35 | .Spotlight-V100
36 | .Trashes
37 |
38 | # Directories potentially created on remote AFP share
39 | .AppleDB
40 | .AppleDesktop
41 | Network Trash Folder
42 | Temporary Items
43 | .apdisk
44 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # cx_extractor
2 | 基于行块分布函数的通用网页正文抽取
3 | 由于本人做网络爬虫的时候使用过这个方法的 Java 版本,现在自己把 Java 版本改写成了 C# 版本,希望对大家有用。基于行块分布函数的通用网页正文抽取:线性时间、不建 DOM 树、与 HTML 标签无关。原版本的作者为哈工大的陈鑫,代码放在
4 | Google Code 上:https://code.google.com/p/cx-extractor/
5 |
--------------------------------------------------------------------------------
/cx-extractor.sln:
--------------------------------------------------------------------------------
1 |
2 | Microsoft Visual Studio Solution File, Format Version 12.00
3 | # Visual Studio 2012
4 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "cx-extractor", "cx-extractor\cx-extractor.csproj", "{0676C874-7AE3-4501-86FF-EDCFD36C4EA5}"
5 | EndProject
6 | Global
7 | GlobalSection(SolutionConfigurationPlatforms) = preSolution
8 | Debug|Any CPU = Debug|Any CPU
9 | Release|Any CPU = Release|Any CPU
10 | EndGlobalSection
11 | GlobalSection(ProjectConfigurationPlatforms) = postSolution
12 | {0676C874-7AE3-4501-86FF-EDCFD36C4EA5}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
13 | {0676C874-7AE3-4501-86FF-EDCFD36C4EA5}.Debug|Any CPU.Build.0 = Debug|Any CPU
14 | {0676C874-7AE3-4501-86FF-EDCFD36C4EA5}.Release|Any CPU.ActiveCfg = Release|Any CPU
15 | {0676C874-7AE3-4501-86FF-EDCFD36C4EA5}.Release|Any CPU.Build.0 = Release|Any CPU
16 | EndGlobalSection
17 | GlobalSection(SolutionProperties) = preSolution
18 | HideSolutionNode = FALSE
19 | EndGlobalSection
20 | EndGlobal
21 |
--------------------------------------------------------------------------------
/cx-extractor/App.config:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/cx-extractor/Demo.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Text;
3 | using System.IO;
4 |
5 | namespace cx_extractor
6 | {
/// <summary>
/// Console demo: reads one HTML file, runs it through
/// <c>TextExtract.parse</c> and prints the extracted body text.
/// </summary>
class Demo
{
    /// <summary>
    /// Sample page used when no path is supplied on the command line.
    /// </summary>
    private const string DefaultInputPath = "E:\\Documents\\123.html";

    /// <summary>
    /// Program entry point.
    /// </summary>
    /// <param name="args">
    /// Optional. <c>args[0]</c> is the path of the HTML file to process;
    /// when omitted, <see cref="DefaultInputPath"/> is used.
    /// </param>
    static void Main(string[] args)
    {
        string inputPath = (args != null && args.Length > 0) ? args[0] : DefaultInputPath;

        // File.ReadAllText opens, reads and always closes the file, even when
        // reading fails, so no reader handle can be leaked.
        string pageSource = File.ReadAllText(inputPath, Encoding.Default);

        Console.Write(TextExtract.parse(pageSource));

        // Wait for input so the console window stays open after printing.
        Console.Read();
    }
}
19 | }
20 |
--------------------------------------------------------------------------------
/cx-extractor/Properties/AssemblyInfo.cs:
--------------------------------------------------------------------------------
1 | using System.Reflection;
2 | using System.Runtime.CompilerServices;
3 | using System.Runtime.InteropServices;
4 |
5 | // General information about an assembly is controlled through the following
6 | // set of attributes. Change these attribute values to modify the information
7 | // associated with the assembly.
8 | [assembly: AssemblyTitle("cx-extractor")]
9 | [assembly: AssemblyDescription("")]
10 | [assembly: AssemblyConfiguration("")]
11 | [assembly: AssemblyCompany("")]
12 | [assembly: AssemblyProduct("cx-extractor")]
13 | [assembly: AssemblyCopyright("Copyright © 2015")]
14 | [assembly: AssemblyTrademark("")]
15 | [assembly: AssemblyCulture("")]
16 |
17 | // Setting ComVisible to false makes the types in this assembly
18 | // invisible to COM components. If a type in this assembly must be accessed from COM,
19 | // set the ComVisible attribute to true on that type.
20 | [assembly: ComVisible(false)]
21 |
22 | // If this project is exposed to COM, the following GUID is used as the ID of the type library
23 | [assembly: Guid("ea2ae4d3-c437-4333-8705-97eac5d0e6e9")]
24 |
25 | // The version information of an assembly consists of the following four values:
26 | //
27 | // Major version
28 | // Minor version
29 | // Build number
30 | // Revision
31 | //
32 | // You can specify all of these values, or use the defaults for the build number and revision
33 | // by using '*' as shown below:
34 | // [assembly: AssemblyVersion("1.0.*")]
35 | [assembly: AssemblyVersion("1.0.0.0")]
36 | [assembly: AssemblyFileVersion("1.0.0.0")]
37 |
--------------------------------------------------------------------------------
/cx-extractor/TextExtract.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Text;
4 | using System.Text.RegularExpressions;
5 |
6 | namespace cx_extractor
7 | {
/// <summary>
/// Extracts the main body text of a web page using a line-block distribution:
/// markup is stripped, the remaining text is split into lines, and runs of
/// lines whose combined length exceeds a threshold are collected as body text.
/// All working state is held in static fields shared by every call.
/// </summary>
class TextExtract
{
    private static string[] lines;

    // Number of consecutive lines summed into one block.
    private static int blocksWidth = 3;

    /* When block-like news headlines in the page are not being filtered out, increase this threshold. */
    /* A larger threshold raises precision and lowers recall; a smaller one lets in more noise but */
    /* guarantees that a body consisting of a single sentence can still be extracted. */
    private static int threshold = 86;

    private static String html;

    // Stored by parse(); no code in this class reads it.
    private static Boolean flag = false;

    private static int start;
    private static int end;
    private static StringBuilder text = new StringBuilder();

    // indexDistribution[i] = total character count of lines i .. i + blocksWidth - 1.
    private static List<int> indexDistribution = new List<int>();

    /// <summary>
    /// Extracts the body text of a page without checking whether the page is a
    /// directory-type page, i.e. the caller already knows it is a topic page.
    /// </summary>
    /// <param name="_html">HTML source of the page.</param>
    /// <returns>The extracted body text.</returns>
    public static String parse(String _html)
    {
        return parse(_html, false);
    }

    /// <summary>
    /// Extracts the body text of a page.
    /// </summary>
    /// <param name="_html">HTML source of the page; null is treated as an empty page.</param>
    /// <param name="_flag">
    /// Intended to request a topic-page check. The value is only stored; no code
    /// in this class consults it, so it currently does not change the result.
    /// </param>
    /// <returns>The extracted body text; empty when no block qualifies.</returns>
    public static String parse(String _html, Boolean _flag)
    {
        flag = _flag;
        html = _html ?? String.Empty;
        preProcess();
        return getText();
    }

    /// <summary>
    /// Strips everything that is not visible text from <c>html</c>: doctype,
    /// comments, scripts, style sheets, character entities and remaining tags,
    /// then removes carriage returns, tabs and spaces.
    /// </summary>
    private static void preProcess()
    {
        html = Regex.Replace(html, "(?is)<!DOCTYPE.*?>", "");
        html = Regex.Replace(html, "(?is)<!--.*?-->", "");                // remove html comment
        html = Regex.Replace(html, "(?is)<script.*?>.*?</script>", "");   // remove javascript
        html = Regex.Replace(html, "(?is)<style.*?>.*?</style>", "");     // remove css
        html = Regex.Replace(html, "&.{2,5};|&#.{2,5};", "");             // remove special char
        html = Regex.Replace(html, "(?is)<.*?>", "");
        html = html.Replace("\r", "");
        html = html.Replace("\t", "");
        html = html.Replace(" ", "");
    }

    /// <summary>
    /// Returns the block length at <paramref name="index"/>, or 0 when the index
    /// lies past the end of <c>indexDistribution</c>.
    /// </summary>
    private static int densityAt(int index)
    {
        return index < indexDistribution.Count ? indexDistribution[index] : 0;
    }

    /// <summary>
    /// Builds the line-block distribution of the pre-processed page and
    /// concatenates every dense run of lines into the result.
    /// </summary>
    /// <returns>The collected body lines, each terminated by '\n'.</returns>
    private static String getText()
    {
        lines = html.Split('\n');
        indexDistribution.Clear();

        for (int i = 0; i < lines.Length - blocksWidth; i++)
        {
            int wordsNum = 0;
            for (int j = i; j < i + blocksWidth; j++)
            {
                // String.Replace is a literal replace, so a regex is needed to
                // strip whitespace characters.
                lines[j] = Regex.Replace(lines[j], "\\s", "");
                wordsNum += lines[j].Length;
            }
            indexDistribution.Add(wordsNum);
        }

        start = -1;
        end = -1;
        Boolean boolstart = false, boolend = false;
        text.Length = 0;

        for (int i = 0; i < indexDistribution.Count - 1; i++)
        {
            if (indexDistribution[i] > threshold && !boolstart)
            {
                // A run only starts when at least one of the next three blocks
                // has text; positions past the end count as empty.
                if (densityAt(i + 1) != 0
                    || densityAt(i + 2) != 0
                    || densityAt(i + 3) != 0)
                {
                    boolstart = true;
                    start = i;
                    continue;
                }
            }

            if (boolstart)
            {
                // The run ends at the first empty block (here or right after).
                if (indexDistribution[i] == 0
                    || indexDistribution[i + 1] == 0)
                {
                    end = i;
                    boolend = true;
                }
            }

            if (boolend)
            {
                StringBuilder tmp = new StringBuilder();
                for (int ii = start; ii <= end; ii++)
                {
                    // Lines shorter than 5 characters are dropped from the output.
                    if (lines[ii].Length < 5)
                        continue;
                    tmp.Append(lines[ii]).Append('\n');
                }

                String str = tmp.ToString();
                if (str.Contains("Copyright") || str.Contains("版权所有"))
                {
                    // The run flags were never reset on this path, so every later
                    // iteration rebuilt a block that still contained this text and
                    // discarded it again. Stopping here yields the same result.
                    // NOTE(review): confirm that dropping everything after a
                    // copyright block is the intended behaviour.
                    break;
                }

                text.Append(str);
                boolstart = boolend = false;
            }
        }
        return text.ToString();
    }

    /// <summary>
    /// Sets the minimum block length a block must exceed to start a body-text run.
    /// </summary>
    /// <param name="value">New threshold, in characters per block.</param>
    public static void setthreshold(int value)
    {
        threshold = value;
    }
}
129 | }
130 |
--------------------------------------------------------------------------------
/cx-extractor/cx-extractor.csproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | Debug
6 | AnyCPU
7 | {0676C874-7AE3-4501-86FF-EDCFD36C4EA5}
8 | Exe
9 | Properties
10 | cx_extractor
11 | cx-extractor
12 | v4.5
13 | 512
14 | publish\
15 | true
16 | Disk
17 | false
18 | Foreground
19 | 7
20 | Days
21 | false
22 | false
23 | true
24 | 0
25 | 1.0.0.%2a
26 | false
27 | false
28 | true
29 |
30 |
31 | AnyCPU
32 | true
33 | full
34 | false
35 | bin\Debug\
36 | DEBUG;TRACE
37 | prompt
38 | 4
39 |
40 |
41 | AnyCPU
42 | pdbonly
43 | true
44 | bin\Release\
45 | TRACE
46 | prompt
47 | 4
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 | False
69 | Microsoft .NET Framework 4.5 %28x86 和 x64%29
70 | true
71 |
72 |
73 | False
74 | .NET Framework 3.5 SP1
75 | false
76 |
77 |
78 |
79 |
86 |
--------------------------------------------------------------------------------
/cx-extractor/cx-extractor.csproj.user:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 | zh-CN
11 | false
12 |
13 |
--------------------------------------------------------------------------------
/基于行块分布函数的通用网页正文抽取算法.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chrislinan/cx-extractor/866fae7d5e883bb671209ed35a6191ebce00373b/基于行块分布函数的通用网页正文抽取算法.pdf
--------------------------------------------------------------------------------