├── README.md ├── References └── HtmlAgilityPack │ └── HtmlAgilityPack.dll ├── Rhythm.Staticize ├── IBehavior.cs ├── StaticizeStepChangedEventArgs.cs ├── Validation │ ├── IValidation.cs │ ├── ValidationType.cs │ ├── GenerationSuccessfulValidation.cs │ ├── ValidationDelegateTaken.cs │ ├── ValidationGroup.cs │ ├── ValidationResult.cs │ ├── ValidationExtensions.cs │ ├── InternalALinkValidation.cs │ ├── XPathValidation.cs │ ├── ReferenceResourcesExistingValidation.cs │ └── ValidationProjection.cs ├── Extensions │ ├── CollectionExtensions.cs │ ├── UrlExtensions.cs │ └── IOExtensions.cs ├── ResourcesDownload │ ├── JavascriptResourcesDownloadBehavior.cs │ ├── CssResourcesDownloadBehavior.cs │ ├── ImageResourcesDownloadBehavior.cs │ ├── ResourcesDownloadException.cs │ └── ResourcesDownloadBaseBehavior.cs ├── Properties │ └── AssemblyInfo.cs ├── ResourcesManager.cs ├── StaticizeStep.cs ├── HtmlStaticizeContext.cs ├── IUriResourcesFromLocalFileSystemReslover.cs ├── Rhythm.Staticize.csproj ├── StaticizeStepStatus.cs └── Staticizer.cs ├── Rhythm.Staticize.Tests ├── StaticizeCoreTest.cs ├── Properties │ └── AssemblyInfo.cs ├── Rhythm.Staticize.Tests.csproj └── StaticizeTest.cs ├── Staticize.sln ├── .gitattributes └── .gitignore /README.md: -------------------------------------------------------------------------------- 1 | # Staticize 2 | 3 | 4 | -------------------------------------------------------------------------------- /References/HtmlAgilityPack/HtmlAgilityPack.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RockNHawk/Staticize/HEAD/References/HtmlAgilityPack/HtmlAgilityPack.dll -------------------------------------------------------------------------------- /Rhythm.Staticize/IBehavior.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Text; 5 | 6 | namespace 
using System;
using Microsoft.VisualStudio.TestTools.UnitTesting;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Web;
using System.Net;
using Rhythm.Staticize;

namespace Rhythm.Staticize
{
    /// <summary>
    /// Unit tests for the URI helper methods in <see cref="UrlExtensions"/>.
    /// </summary>
    [TestClass]
    public class StaticizeCoreTest
    {
        /// <summary>
        /// Checks that the directory path and the parent address of a page URI
        /// are both derived by cutting the URI after its last slash.
        /// </summary>
        [TestMethod]
        public void UrlExtensionsTest()
        {
            var uri = new Uri("http://localhost/a/b/c/1.html");

            // MSTest's Assert.AreEqual takes (expected, actual). Passing them in
            // that order keeps the "Expected:<...>. Actual:<...>." failure message truthful.
            string parentDirectory = UrlExtensions.GetFileDirectory(uri);
            Assert.AreEqual("/a/b/c/", parentDirectory);

            string parentAddress = UrlExtensions.GetParent(uri);
            Assert.AreEqual("http://localhost/a/b/c/", parentAddress);
        }
    }
}
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace Rhythm.Staticize
{
    /// <summary>
    /// Extension methods for generic collections.
    /// </summary>
    public static class CollectionExtensions
    {
        /// <summary>
        /// Appends every element of <paramref name="collection"/> to <paramref name="source"/>.
        /// </summary>
        /// <typeparam name="T">Element type of both collections.</typeparam>
        /// <param name="source">The list that receives the elements.</param>
        /// <param name="collection">The elements to append.</param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="source"/> or <paramref name="collection"/> is null.
        /// </exception>
        public static void AddRange<T>(this IList<T> source, IEnumerable<T> collection)
        {
            // An extension method can be invoked on a null receiver; fail with a
            // named argument instead of a NullReferenceException further down.
            if (source == null)
            {
                throw new ArgumentNullException("source");
            }
            if (collection == null)
            {
                throw new ArgumentNullException("collection");
            }

            // List<T> has its own bulk AddRange, which also copes with self-append.
            var list = source as List<T>;
            if (list != null)
            {
                list.AddRange(collection);
                return;
            }

            // Appending a list to itself would modify it while it is being
            // enumerated, so take a snapshot first.
            if (ReferenceEquals(source, collection))
            {
                collection = new List<T>(collection);
            }

            foreach (T item in collection)
            {
                source.Add(item);
            }
        }
    }
}
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace Rhythm.Staticize
{
    /// <summary>
    /// The category a validation belongs to (tag integrity, link integrity,
    /// resource-file integrity, ...). The display names are the labels attached
    /// to each member through <c>DisplayAttribute</c>.
    /// </summary>
    public enum ValidationType
    {
        /// <summary>
        /// Any other kind of validation.
        /// </summary>
        [System.ComponentModel.DataAnnotations.DisplayAttribute(Name = "其它")]
        Other = 0,

        /// <summary>
        /// Tag integrity.
        /// </summary>
        [System.ComponentModel.DataAnnotations.DisplayAttribute(Name = "标签完整性")]
        Tag = 1,

        /// <summary>
        /// Link integrity.
        /// </summary>
        [System.ComponentModel.DataAnnotations.DisplayAttribute(Name = "链接完整性")]
        Link = 2,

        /// <summary>
        /// Resource-file integrity.
        /// </summary>
        [System.ComponentModel.DataAnnotations.DisplayAttribute(Name = "资源文件完整性")]
        Resource = 3,

        /// <summary>
        /// Content correctness.
        /// </summary>
        [System.ComponentModel.DataAnnotations.DisplayAttribute(Name = "内容正确性")]
        Content = 4,
    }
}
AssemblyTitle("Staticize")] 9 | [assembly: AssemblyDescription("")] 10 | [assembly: AssemblyConfiguration("")] 11 | [assembly: AssemblyCompany("Deepst")] 12 | [assembly: AssemblyProduct("Staticize")] 13 | [assembly: AssemblyCopyright("Copyright © Deepst 2013")] 14 | [assembly: AssemblyTrademark("")] 15 | [assembly: AssemblyCulture("")] 16 | 17 | // 将 ComVisible 设置为 false 使此程序集中的类型 18 | // 对 COM 组件不可见。如果需要从 COM 访问此程序集中的类型, 19 | // 则将该类型上的 ComVisible 特性设置为 true。 20 | [assembly: ComVisible(false)] 21 | 22 | // 如果此项目向 COM 公开,则下列 GUID 用于类型库的 ID 23 | [assembly: Guid("023aa712-4366-47b8-b8c8-f7230b800240")] 24 | 25 | // 程序集的版本信息由下面四个值组成: 26 | // 27 | // 主版本 28 | // 次版本 29 | // 生成号 30 | // 修订号 31 | // 32 | // 可以指定所有这些值,也可以使用“生成号”和“修订号”的默认值, 33 | // 方法是按如下所示使用“*”: 34 | // [assembly: AssemblyVersion("1.0.*")] 35 | [assembly: AssemblyVersion("1.0.0.0")] 36 | [assembly: AssemblyFileVersion("1.0.0.0")] 37 | -------------------------------------------------------------------------------- /Rhythm.Staticize.Tests/Properties/AssemblyInfo.cs: -------------------------------------------------------------------------------- 1 | using System.Reflection; 2 | using System.Runtime.CompilerServices; 3 | using System.Runtime.InteropServices; 4 | 5 | // 有关程序集的常规信息通过以下特性集 6 | // 控制。更改这些特性值可修改 7 | // 与程序集关联的信息。 8 | [assembly: AssemblyTitle("Rhythm.Staticize.Tests")] 9 | [assembly: AssemblyDescription("")] 10 | [assembly: AssemblyConfiguration("")] 11 | [assembly: AssemblyCompany("Microsoft")] 12 | [assembly: AssemblyProduct("Rhythm.Staticize.Tests")] 13 | [assembly: AssemblyCopyright("Copyright © Microsoft 2013")] 14 | [assembly: AssemblyTrademark("")] 15 | [assembly: AssemblyCulture("")] 16 | 17 | // 将 ComVisible 设置为 false 会使此程序集中的类型 18 | // 对 COM 组件不可见。如果需要从 COM 访问此程序集中的类型, 19 | // 请将该类型上的 ComVisible 特性设置为 true。 20 | [assembly: ComVisible(false)] 21 | 22 | // 如果此项目向 COM 公开,则下列 GUID 用于类型库的 ID 23 | [assembly: Guid("8d3af334-e9bf-4b55-b18d-4a0db4fc3274")] 24 | 25 | // 程序集的版本信息由以下四个值组成: 26 | // 
27 | // 主版本 28 | // 次版本 29 | // 生成号 30 | // 修订号 31 | // 32 | // 可以指定所有这些值,也可以使用“生成号”和“修订号”的默认值, 33 | // 方法是按如下所示使用“*”: 34 | // [assembly: AssemblyVersion("1.0.*")] 35 | [assembly: AssemblyVersion("1.0.0.0")] 36 | [assembly: AssemblyFileVersion("1.0.0.0")] 37 | -------------------------------------------------------------------------------- /Rhythm.Staticize/ResourcesManager.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Text; 5 | 6 | namespace Rhythm.Staticize 7 | { 8 | 9 | /// 10 | /// 提供对被静态化网页总引用资源的计数支持。 11 | /// 12 | public class ResourcesManager 13 | { 14 | /// 15 | /// 初始化 ResourcesManager 的新实例 16 | /// 17 | public ResourcesManager() 18 | { 19 | ReferenceCsses = new List(3); 20 | ReferenceJavascripts = new List(3); 21 | ReferenceImages = new List(5); 22 | NotExistsFiles = new Dictionary(); 23 | } 24 | 25 | /// 26 | /// 获取页面 HTML 中引用的 CSS 27 | /// 28 | public IList ReferenceCsses { get; private set; } 29 | 30 | /// 31 | /// 获取页面 HTML 中引用的 JS 32 | /// 33 | public IList ReferenceJavascripts { get; private set; } 34 | 35 | /// 36 | /// 获取页面 HTML 中引用的图片 37 | /// 38 | public IList ReferenceImages { get; private set; } 39 | 40 | /// 41 | /// 获取页面 HTML 中有引用但实际不存在的图片。 42 | /// 43 | public IDictionary NotExistsFiles { get; private set; } 44 | } 45 | 46 | } -------------------------------------------------------------------------------- /Rhythm.Staticize/ResourcesDownload/CssResourcesDownloadBehavior.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Text; 5 | using System.Threading.Tasks; 6 | 7 | namespace Rhythm.Staticize 8 | { 9 | /// 10 | /// 提供对被爬行页面中引用的 CSS 资源的下载支持。 11 | /// 12 | public class CssResourcesDownloadBehavior : ResourcesDownloadBaseBehavior 13 | { 14 | public 
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace Rhythm.Staticize
{
    /// <summary>
    /// Provides download support for the image resources referenced by a crawled page.
    /// </summary>
    public class ImageResourcesDownloadBehavior : ResourcesDownloadBaseBehavior
    {
        /// <summary>
        /// Initializes a new instance of ImageResourcesDownloadBehavior.
        /// The base behavior is configured with the XPath "//img[@src]".
        /// </summary>
        /// <param name="outputBaseDirectory">Output folder for the images.</param>
        /// <param name="resourceFileReslover">
        /// Optional resolver, forwarded unchanged to the base class.
        /// NOTE(review): how the base class treats null is not visible here; confirm.
        /// </param>
        public ImageResourcesDownloadBehavior(String outputBaseDirectory, IUriResourcesFromLocalFileSystemReslover resourceFileReslover = null)
            : base(outputBaseDirectory, "//img[@src]", resourceFileReslover)
        {
        }

        /// <summary>
        /// Callback invoked once the resource URLs have been parsed successfully
        /// and are about to be downloaded.
        /// </summary>
        /// <param name="resourceUris">The parsed resource URLs.</param>
        /// <param name="context">Context of the page being staticized.</param>
        protected override void OnResourceParsed(Uri[] resourceUris, HtmlStaticizeContext context)
        {
            // Record the image URIs on the page context before delegating to the base class.
            context.Resources.ReferenceImages.AddRange(resourceUris);
            base.OnResourceParsed(resourceUris, context);
        }

    }
}
-------------------------------------------------------------------------------- /Rhythm.Staticize/Validation/GenerationSuccessfulValidation.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Text; 5 | 6 | namespace Rhythm.Staticize 7 | { 8 | class GenerationSuccessfulValidation : IValidation 9 | { 10 | /// 11 | /// 获取 GenerationValidation 的实例。 12 | /// 13 | public static readonly IValidation Instance = new GenerationSuccessfulValidation(); 14 | 15 | string IValidation.Validate(HtmlAgilityPack.HtmlDocument document, HtmlStaticizeContext status) 16 | { 17 | var errorMessage = new StringBuilder(); 18 | if (status.GenerationError != null) 19 | { 20 | var err = status.GenerationError; 21 | errorMessage.AppendFormat("生成HTML期间发生错误:{0}\r\n{1}\r\n", err.Message, err.ToString()); 22 | } 23 | if (status.DocumentLoadError != null) 24 | { 25 | var err = status.DocumentLoadError; 26 | errorMessage.AppendFormat("加载HTML文档树期间发生错误:{0}\r\n{1}\r\n", err.Message, err.ToString()); 27 | } 28 | return errorMessage.Length == 0 ? 
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace Rhythm.Staticize
{
    /// <summary>
    /// Helpers for deriving the containing directory of a page URI.
    /// </summary>
    public static class UrlExtensions
    {
        // Characters that start the query or fragment part of a URI string.
        private static readonly char[] QueryOrFragmentStart = { '?', '#' };

        /// <summary>
        /// Gets the directory path that contains the page addressed by <paramref name="uri"/>.
        /// Only absolute page URIs are supported, because the parent address is
        /// re-parsed with <c>new Uri(string)</c>.
        /// </summary>
        /// <param name="uri">Absolute URI of a page, e.g. http://host/a/b/1.html.</param>
        /// <returns>The local path of the parent address, e.g. /a/b/.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="uri"/> is null.</exception>
        public static string GetFileDirectory(this Uri uri)
        {
            // input:  http://xxx/xx.html
            // return: /
            var parent = new Uri(uri.GetParent());
            return parent.LocalPath;
        }

        /// <summary>
        /// Gets the address one level above <paramref name="uri"/>: everything up to
        /// and including the last slash of its path.
        /// </summary>
        /// <param name="uri">The URI whose parent address is wanted.</param>
        /// <returns>
        /// The parent address including the trailing slash, or the unchanged URI
        /// string when it contains no slash before its query or fragment.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="uri"/> is null.</exception>
        public static string GetParent(this Uri uri)
        {
            if (uri == null)
            {
                throw new ArgumentNullException("uri");
            }

            // input:  http://xxx/xx.html
            // return: http://xxx/
            string uriString = uri.ToString();

            // Only the path may be searched for the last slash. A slash inside the
            // query or fragment (e.g. "?next=/x/y") is not a directory boundary.
            int pathEnd = uriString.IndexOfAny(QueryOrFragmentStart);
            string pathPart = pathEnd >= 0 ? uriString.Substring(0, pathEnd) : uriString;

            int lastSlash = pathPart.LastIndexOf('/');
            if (lastSlash == -1)
            {
                return uriString;
            }
            return pathPart.Substring(0, lastSlash + 1);
        }

    }
}
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace Rhythm.Staticize
{
    /// <summary>
    /// The key steps of the staticize process. The display names are the labels
    /// attached to each member through <c>DisplayAttribute</c>.
    /// </summary>
    public enum StaticizeStep
    {
        /// <summary>
        /// Initializing.
        /// </summary>
        [System.ComponentModel.DataAnnotations.DisplayAttribute(Name = "正在初始化")]
        Initialize = 0,

        /// <summary>
        /// Generating HTML.
        /// </summary>
        [System.ComponentModel.DataAnnotations.DisplayAttribute(Name = "正在生成 HTML")]
        GenerationHtml = 1,

        /// <summary>
        /// HTML generation completed.
        /// </summary>
        [System.ComponentModel.DataAnnotations.DisplayAttribute(Name = "HTML 生成完成")]
        GenerationHtmlCompleted = 2,

        /// <summary>
        /// Validating.
        /// </summary>
        [System.ComponentModel.DataAnnotations.DisplayAttribute(Name = "正在验证")]
        Validation = 3,

        /// <summary>
        /// Validation completed.
        /// </summary>
        [System.ComponentModel.DataAnnotations.DisplayAttribute(Name = "验证完成")]
        ValidationCompleted = 4,

        /// <summary>
        /// Completed.
        /// </summary>
        [System.ComponentModel.DataAnnotations.DisplayAttribute(Name = "已完成")]
        Completed = 5,

        /// <summary>
        /// The staticize process was terminated unexpectedly: it stopped while
        /// running, possibly because of a thread crash or a machine shutdown.
        /// </summary>
        [System.ComponentModel.DataAnnotations.DisplayAttribute(Name = "静态化过程被意外终止", Description = "静态化执行过程中意外停止了,可能是线程 Crash 或计算机关机造成的。")]
        Crashed = 6,
    }

}
-------------------------------------------------------------------------------- /Staticize.sln: -------------------------------------------------------------------------------- 1 | 2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 2012 4 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Rhythm.Staticize.Tests", "Rhythm.Staticize.Tests\Rhythm.Staticize.Tests.csproj", "{BB81F20F-68E2-415B-B885-30DEA4B7CD4D}" 5 | EndProject 6 | Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "References", "References", "{9210C7F1-32B1-4C5B-8D9E-F38C06E96AE7}" 7 | EndProject 8 | Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "HtmlAgilityPack", "HtmlAgilityPack", "{9B4B0CB9-A03F-4680-AB08-150C30F07AF0}" 9 | ProjectSection(SolutionItems) = preProject 10 | References\HtmlAgilityPack\HtmlAgilityPack.dll = References\HtmlAgilityPack\HtmlAgilityPack.dll 11 | References\HtmlAgilityPack\HtmlAgilityPack.XML = References\HtmlAgilityPack\HtmlAgilityPack.XML 12 | EndProjectSection 13 | EndProject 14 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Rhythm.Staticize", "Rhythm.Staticize\Rhythm.Staticize.csproj", "{C1EBD5EC-6861-4C79-A954-6B58EC944FF8}" 15 | EndProject 16 | Global 17 | 18 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 19 | Debug|Any CPU = Debug|Any CPU 20 | Release|Any CPU = Release|Any CPU 21 | EndGlobalSection 22 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 23 | {BB81F20F-68E2-415B-B885-30DEA4B7CD4D}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 24 | {BB81F20F-68E2-415B-B885-30DEA4B7CD4D}.Debug|Any CPU.Build.0 = Debug|Any CPU 25 | {BB81F20F-68E2-415B-B885-30DEA4B7CD4D}.Release|Any CPU.ActiveCfg = Release|Any CPU 26 | {BB81F20F-68E2-415B-B885-30DEA4B7CD4D}.Release|Any CPU.Build.0 = Release|Any CPU 27 | {C1EBD5EC-6861-4C79-A954-6B58EC944FF8}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 28 | {C1EBD5EC-6861-4C79-A954-6B58EC944FF8}.Debug|Any CPU.Build.0 = Debug|Any CPU 29 | 
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace Rhythm.Staticize
{
    /// <summary>
    /// Path helpers that treat both '/' and '\' as directory separators.
    /// </summary>
    public static class IOExtensions
    {
        // The two characters accepted as directory separators throughout this class.
        private static readonly char[] DirectorySeparators = { '/', '\\' };

        /// <summary>
        /// Tells whether <paramref name="chr"/> is a forward slash or a backslash.
        /// </summary>
        /// <param name="chr">The character to test.</param>
        /// <returns>true for '/' or '\'; otherwise false.</returns>
        public static bool IsDirectorySeparator(this char chr)
        {
            return chr == '/' || chr == '\\';
        }

        /// <summary>
        /// Finds the position of the first directory separator in <paramref name="path"/>.
        /// </summary>
        /// <param name="path">The path to search; may be null.</param>
        /// <returns>
        /// The zero-based index of the first '/' or '\', or -1 when
        /// <paramref name="path"/> is null or contains neither.
        /// </returns>
        public static int IndexOfDirectorySeparator(this string path)
        {
            if (path == null)
            {
                return -1;
            }
            // IndexOfAny reports the earliest occurrence of either separator.
            // Taking the larger of two separate IndexOf results would skip the
            // first separator whenever the path mixes both kinds.
            return path.IndexOfAny(DirectorySeparators);
        }


        /// <summary>
        /// Removes every leading slash, forward or backward.
        /// </summary>
        /// <param name="path">The path to trim; may be null.</param>
        /// <returns>
        /// <paramref name="path"/> without leading separators, or null when
        /// <paramref name="path"/> is null.
        /// </returns>
        public static string RemoveStartDirectorySeparator(this string path)
        {
            return path == null ? null : path.TrimStart(DirectorySeparators);
        }

        /// <summary>
        /// Removes every trailing slash, forward or backward.
        /// </summary>
        /// <param name="path">The path to trim; may be null.</param>
        /// <returns>
        /// <paramref name="path"/> without trailing separators, or null when
        /// <paramref name="path"/> is null.
        /// </returns>
        public static string RemoveLastDirectorySeparator(this string path)
        {
            return path == null ? null : path.TrimEnd(DirectorySeparators);
        }

    }
}
this.documentValidation = documentValidation; 26 | this.errorMessage = errorMessage; 27 | this.Name = name; 28 | this.Type = validationType; 29 | if (documentValidation == null) 30 | { 31 | throw new ArgumentNullException("documentValidation"); 32 | } 33 | if (errorMessage == null) 34 | { 35 | throw new ArgumentNullException("errorMessage"); 36 | } 37 | } 38 | 39 | /// 40 | /// 执行验证。 41 | /// 42 | /// 被验证的 HtmlDocument 43 | /// 验证通过则返回true。 44 | public string Validate(HtmlAgilityPack.HtmlDocument document, HtmlStaticizeContext status) 45 | { 46 | if (document == null) 47 | { 48 | throw new ArgumentNullException("document"); 49 | } 50 | return documentValidation.Invoke(document) ? null : errorMessage; 51 | } 52 | 53 | /// 54 | /// 获取验证不通过时的错误提示信息。 55 | /// 56 | public String ErrorMessage { get { return errorMessage; } } 57 | 58 | /// 59 | /// 获取此验证的名称信息(用于向用户界面显示)。 60 | /// 61 | public String Name { get; private set; } 62 | 63 | public ValidationType Type { get; set; } 64 | 65 | public override string ToString() 66 | { 67 | return this.Name; 68 | } 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Set default behavior to automatically normalize line endings. 3 | ############################################################################### 4 | * text=auto 5 | 6 | ############################################################################### 7 | # Set default behavior for command prompt diff. 8 | # 9 | # This is need for earlier builds of msysgit that does not have it on by 10 | # default for csharp files. 
11 | # Note: This is only used by command line 12 | ############################################################################### 13 | #*.cs diff=csharp 14 | 15 | ############################################################################### 16 | # Set the merge driver for project and solution files 17 | # 18 | # Merging from the command prompt will add diff markers to the files if there 19 | # are conflicts (Merging from VS is not affected by the settings below, in VS 20 | # the diff markers are never inserted). Diff markers may cause the following 21 | # file extensions to fail to load in VS. An alternative would be to treat 22 | # these files as binary and thus will always conflict and require user 23 | # intervention with every merge. To do so, just uncomment the entries below 24 | ############################################################################### 25 | #*.sln merge=binary 26 | #*.csproj merge=binary 27 | #*.vbproj merge=binary 28 | #*.vcxproj merge=binary 29 | #*.vcproj merge=binary 30 | #*.dbproj merge=binary 31 | #*.fsproj merge=binary 32 | #*.lsproj merge=binary 33 | #*.wixproj merge=binary 34 | #*.modelproj merge=binary 35 | #*.sqlproj merge=binary 36 | #*.wwaproj merge=binary 37 | 38 | ############################################################################### 39 | # behavior for image files 40 | # 41 | # image files are treated as binary by default. 42 | ############################################################################### 43 | #*.jpg binary 44 | #*.png binary 45 | #*.gif binary 46 | 47 | ############################################################################### 48 | # diff behavior for common document formats 49 | # 50 | # Convert binary document formats to text before diffing them. This feature 51 | # is only available from the command line. Turn it on by uncommenting the 52 | # entries below. 
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace Rhythm.Staticize
{
    /// <summary>
    /// Defines how a resource URI is mapped onto a file of the local file system.
    /// </summary>
    public interface IUriResourcesFromLocalFileSystemReslover
    {
        /// <summary>
        /// Converts a URI into a local path.
        /// </summary>
        /// <param name="uri">The link of a resource.</param>
        /// <returns>The local path of that resource.</returns>
        String ResloveLocalPath(Uri uri);

        /// <summary>
        /// When the site being staticized and the staticize program run on the
        /// same machine, tries to copy the file directly from the local disk.
        /// </summary>
        /// <param name="uri">The link of the resource.</param>
        /// <param name="saveFilePath">The destination file path.</param>
        /// <returns>Whether the copy succeeded.</returns>
        Boolean TryCopyFromLocal(Uri uri, String saveFilePath);
    }

    /// <summary>
    /// Default implementation of <see cref="IUriResourcesFromLocalFileSystemReslover"/>:
    /// resources are looked up relative to the application's base directory.
    /// </summary>
    public class DefaultUriToLocalFilePathReslover : IUriResourcesFromLocalFileSystemReslover
    {
        /// <summary>
        /// Gets a shared instance of DefaultUriToLocalFilePathReslover.
        /// </summary>
        public static readonly IUriResourcesFromLocalFileSystemReslover Instance = new DefaultUriToLocalFilePathReslover();

        // Directory that resource paths are resolved against.
        private readonly String baseDir = AppDomain.CurrentDomain.BaseDirectory;

        /// <summary>
        /// Converts a URI into a path without leading directory separators.
        /// </summary>
        /// <param name="uri">The link of a resource.</param>
        /// <returns>The local path of that resource.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="uri"/> is null.</exception>
        public string ResloveLocalPath(Uri uri)
        {
            return FormatUriToLocalPath(uri);
        }

        // Turns the URI into a path with no leading '/' or '\'.
        static string FormatUriToLocalPath(Uri uri)
        {
            if (uri == null)
            {
                throw new ArgumentNullException("uri");
            }

            String localPath = uri.IsAbsoluteUri ? uri.LocalPath : uri.ToString();

            // Strip ALL leading separators. This is safe on an empty string, where
            // indexing [0] would throw. It also matters because Path.Combine returns
            // its second argument alone when that argument is rooted: a path such as
            // "//x" with only one separator removed would escape the base directory.
            return localPath.TrimStart('/', '\\');
        }

        /// <summary>
        /// When the site being staticized and the staticize program run on the
        /// same machine, tries to copy the file directly from the local disk.
        /// The destination is not overwritten: an existing file makes the copy
        /// fail and the method return false.
        /// </summary>
        /// <param name="uri">The link of the resource.</param>
        /// <param name="saveFilePath">The destination file path.</param>
        /// <returns>Whether the copy succeeded.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="uri"/> is null.</exception>
        public bool TryCopyFromLocal(Uri uri, string saveFilePath)
        {
            String resourceFilePath = System.IO.Path.Combine(baseDir, FormatUriToLocalPath(uri));
            if (!System.IO.File.Exists(resourceFilePath))
            {
                return false;
            }

            // Best effort: the failures File.Copy is documented to raise mean
            // "could not copy locally", which callers learn from the false result.
            try
            {
                System.IO.File.Copy(resourceFilePath, saveFilePath);
                return true;
            }
            catch (System.IO.IOException)
            {
                return false;
            }
            catch (UnauthorizedAccessException)
            {
                return false;
            }
            catch (ArgumentException)
            {
                return false;
            }
            catch (NotSupportedException)
            {
                return false;
            }
            catch (System.Security.SecurityException)
            {
                return false;
            }
        }

    }

}
Coverage Tool 73 | *.dotCover 74 | 75 | # NCrunch 76 | *.ncrunch* 77 | .*crunch*.local.xml 78 | 79 | # Installshield output folder 80 | [Ee]xpress/ 81 | 82 | # DocProject is a documentation generator add-in 83 | DocProject/buildhelp/ 84 | DocProject/Help/*.HxT 85 | DocProject/Help/*.HxC 86 | DocProject/Help/*.hhc 87 | DocProject/Help/*.hhk 88 | DocProject/Help/*.hhp 89 | DocProject/Help/Html2 90 | DocProject/Help/html 91 | 92 | # Click-Once directory 93 | publish/ 94 | 95 | # Publish Web Output 96 | *.Publish.xml 97 | 98 | # NuGet Packages Directory 99 | ## TODO: If you have NuGet Package Restore enabled, uncomment the next line 100 | #packages/ 101 | 102 | # Windows Azure Build Output 103 | csx 104 | *.build.csdef 105 | 106 | # Windows Store app package directory 107 | AppPackages/ 108 | 109 | # Others 110 | sql/ 111 | *.Cache 112 | ClientBin/ 113 | [Ss]tyle[Cc]op.* 114 | ~$* 115 | *~ 116 | *.dbmdl 117 | *.[Pp]ublish.xml 118 | *.pfx 119 | *.publishsettings 120 | 121 | # RIA/Silverlight projects 122 | Generated_Code/ 123 | 124 | # Backup & report files from converting an old project file to a newer 125 | # Visual Studio version. 
Backup files are not needed, because we have git ;-) 126 | _UpgradeReport_Files/ 127 | Backup*/ 128 | UpgradeLog*.XML 129 | UpgradeLog*.htm 130 | 131 | # SQL Server files 132 | App_Data/*.mdf 133 | App_Data/*.ldf 134 | 135 | 136 | #LightSwitch generated files 137 | GeneratedArtifacts/ 138 | _Pvt_Extensions/ 139 | ModelManifest.xml 140 | 141 | # ========================= 142 | # Windows detritus 143 | # ========================= 144 | 145 | # Windows image file caches 146 | Thumbs.db 147 | ehthumbs.db 148 | 149 | # Folder config file 150 | Desktop.ini 151 | 152 | # Recycle Bin used on file shares 153 | $RECYCLE.BIN/ 154 | 155 | # Mac desktop service store files 156 | .DS_Store 157 | -------------------------------------------------------------------------------- /Rhythm.Staticize/Validation/ValidationGroup.cs: -------------------------------------------------------------------------------- 1 | //using System; 2 | //using System.Collections.Generic; 3 | //using System.Linq; 4 | //using System.Text; 5 | //using System.Threading.Tasks; 6 | 7 | //namespace Rhythm.Staticize 8 | //{ 9 | // /// 10 | // /// 提供对被爬行页面的 HtmlDocument 验证支持。 11 | // /// 12 | // [Obsolete ] 13 | // public class ValidationGroup 14 | // { 15 | // List m_Validations; 16 | 17 | // /// 18 | // /// 使用lambda表达式或委托创建验证规则。 19 | // /// 20 | // /// 一个委托,用于验证 HtmlDocument 是否符合规则,返回值为Boolean。 21 | // /// 验证不符合规则时的提示信息。 22 | // public ValidationGroup Add(String name, System.Func documentValidation, String errorMessage) 23 | // { 24 | // if (documentValidation == null) 25 | // { 26 | // throw new ArgumentNullException("documentValidation"); 27 | // } 28 | // if (errorMessage == null) 29 | // { 30 | // throw new ArgumentNullException("errorMessage"); 31 | // } 32 | // if (m_Validations == null) 33 | // { 34 | // m_Validations = new List(); 35 | // } 36 | // m_Validations.Add(new ValidationDelegateTaken(name, documentValidation: documentValidation, errorMessage: errorMessage)); 37 | // return this; 38 | // } 39 | 
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace Rhythm.Staticize
{
    /// <summary>
    /// The outcome of one validation that was run against a staticized page.
    /// </summary>
    public class ValidationResult
    {
        /// <summary>
        /// Creates an empty result; all properties are filled in by the caller.
        /// </summary>
        public ValidationResult()
        {
        }

        /// <summary>
        /// Identifier of this result.
        /// </summary>
        public int Id { get; set; }

        /// <summary>
        /// Address of the validated page (absolute path).
        /// </summary>
        public Uri Uri { get; set; }

        /// <summary>
        /// Kind of validation that produced this result.
        /// </summary>
        public virtual ValidationType ValidationType { get; set; }

        /// <summary>
        /// Short title of the validation, shown in the user interface.
        /// </summary>
        public virtual string Name { get; set; }

        /// <summary>
        /// Result message of the validation, shown in the user interface.
        /// </summary>
        public virtual string Message { get; set; }

        /// <summary>
        /// Some validations may raise an exception; this property records its details.
        /// </summary>
        public virtual System.Exception Exception { get; set; }

        /// <summary>
        /// Formats the failure as text, or returns an empty string when there is no message.
        /// </summary>
        public override string ToString()
        {
            string text = Message;
            return string.IsNullOrEmpty(text)
                ? ""
                : string.Format("验证[{0}]不通过:\r\n{1}\r\n", Name, text);
        }
    }
}
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace Rhythm.Staticize
{
    /// <summary>
    /// Helpers for running validations and for collecting and saving their results.
    /// </summary>
    public static class ValidationExtensions
    {
        /// <summary>
        /// Runs every validation against the document and collects the failures.
        /// </summary>
        /// <param name="validations">The validations to run; may be null.</param>
        /// <param name="document">The loaded HTML document.</param>
        /// <param name="context">The staticize context of the page being validated.</param>
        /// <returns>
        /// One <see cref="ValidationResult"/> per validation that returned a non-empty message,
        /// or null when <paramref name="validations"/> is null.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="document"/> or <paramref name="context"/> is null.</exception>
        public static IList<ValidationResult> Validate(this IEnumerable<IValidation> validations, HtmlAgilityPack.HtmlDocument document, HtmlStaticizeContext context)
        {
            if (document == null)
            {
                throw new ArgumentNullException("document");
            }
            if (context == null)
            {
                throw new ArgumentNullException("context");
            }
            if (validations == null)
            {
                // Kept as null for compatibility; GetValidationResults tolerates a null list.
                return null;
            }
            var validationResult = new List<ValidationResult>();
            foreach (var vd in validations)
            {
                // A validation reports failure by returning a non-empty message.
                var errorMessage = vd.Validate(document, context);
                if (string.IsNullOrEmpty(errorMessage))
                {
                    continue;
                }
                validationResult.Add(new ValidationResult
                {
                    Uri = context.Uri,
                    ValidationType = vd.Type,
                    Name = vd.Name,
                    Message = errorMessage,
                });
            }
            return validationResult;
        }

        /// <summary>
        /// Flattens the validation results of all page contexts into one sequence.
        /// </summary>
        /// <param name="staticizeContext">The page contexts; may be null.</param>
        /// <returns>All validation results; empty (never null) when there are none.</returns>
        public static IEnumerable<ValidationResult> GetValidationResults(this IEnumerable<HtmlStaticizeContext> staticizeContext)
        {
            List<ValidationResult> all = new List<ValidationResult>();
            if (staticizeContext == null)
            {
                // Same leniency as Save: a null source simply yields nothing.
                return all;
            }
            foreach (var item in staticizeContext)
            {
                if (item == null || item.ValidationResults == null)
                {
                    continue;
                }
                // AddRange copes with an empty sequence, so no separate Count() pass is needed.
                all.AddRange(item.ValidationResults);
            }
            return all;
        }

        /// <summary>
        /// Appends a text report of all results that carry a message to the given file.
        /// Nothing is written when there is nothing to report.
        /// </summary>
        /// <param name="validateResults">The results to report; may be null.</param>
        /// <param name="filePath">The file the report is appended to.</param>
        public static void Save(this IEnumerable<ValidationResult> validateResults, String filePath)
        {
            if (validateResults == null)
            {
                return;
            }
            System.Text.StringBuilder builder = new System.Text.StringBuilder();
            foreach (var item in validateResults)
            {
                if (item == null)
                {
                    continue;
                }
                var message = item.Message;
                if (string.IsNullOrEmpty(message))
                {
                    continue;
                }
                builder.Append("=======================\r\n");
                builder.AppendFormat("对页面 {0} 的验证结果:\r\n\r\n", item.Uri);
                builder.AppendFormat("验证 [{0}] 不通过:\r\n{1}\r\n", item.Name, message);
            }
            if (builder.Length == 0)
            {
                return;
            }
            System.IO.File.AppendAllText(filePath, builder.ToString());
        }

    }
}
DefaultUriToLocalFilePathReslover.Instance; 24 | if (searchBaseDirectory == null) 25 | { 26 | throw new ArgumentNullException("outputBaseDirectory"); 27 | } 28 | } 29 | 30 | public string Validate(HtmlAgilityPack.HtmlDocument document, HtmlStaticizeContext status) 31 | { 32 | //得到当前页面的目录 33 | string documentDir = status.Uri.GetFileDirectory(); 34 | var errorMessage = new StringBuilder(); 35 | var htmlNode = document.DocumentNode; 36 | var nodes = htmlNode.SelectNodes("//a"); 37 | if (nodes == null || nodes.Count == 0) 38 | { 39 | return null; 40 | } 41 | foreach (var aNode in nodes) 42 | { 43 | string href = aNode.GetAttributeValue("href", null); 44 | if (string.IsNullOrWhiteSpace(href) || href[0] == '#') 45 | { 46 | continue; 47 | } 48 | //如果href是相对当前页面来说的: 49 | if (!href[0].IsDirectorySeparator()) 50 | { 51 | href = documentDir + href; 52 | } 53 | 54 | Uri uri; 55 | if (!Uri.TryCreate(href, UriKind.RelativeOrAbsolute, out uri)) 56 | { 57 | continue; 58 | } 59 | //这里可以增加对站内域名的判断 60 | if (uri.IsAbsoluteUri && !string.IsNullOrEmpty(uri.Host)) 61 | { 62 | continue; 63 | } 64 | string local = fileReslover.ResloveLocalPath(uri); 65 | if (string.IsNullOrEmpty(local)) 66 | { 67 | continue; 68 | } 69 | string localPath = System.IO.Path.Combine(searchDirectory, local); 70 | bool isExisting; 71 | if (!files.TryGetValue(localPath, out isExisting)) 72 | { 73 | isExisting = System.IO.File.Exists(localPath); 74 | try 75 | { 76 | files.Add(localPath, isExisting); 77 | } 78 | catch (Exception) 79 | { 80 | } 81 | } 82 | if (!isExisting) 83 | { 84 | errorMessage.AppendFormat("本地不存在链接 \"{0}\" 所指向的文件 \"{1}\"。", uri.ToString(), localPath); 85 | } 86 | } 87 | return errorMessage.Length == 0 ? 
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using Rhythm.Staticize;

namespace Rhythm.Staticize
{
    /// <summary>
    /// Validates that specific DOM elements of an HTML document sit at the expected position.
    /// The expected position is the element's XPath in a reference template document, and the
    /// check compares that XPath with the one found in the validated document.
    /// </summary>
    public class XPathValidation : IValidation
    {
        // element id -> XPath of that element in the reference template
        Dictionary<String, String> elementXPath;

        HtmlAgilityPack.HtmlDocument truthDocument;

        /// <summary>
        /// Creates the validation from a template file.
        /// </summary>
        /// <param name="templateFile">The reference template file.</param>
        /// <param name="elementIds">Ids of the page elements whose position is checked.</param>
        /// <exception cref="ArgumentNullException"><paramref name="templateFile"/> or <paramref name="elementIds"/> is null.</exception>
        public XPathValidation(String templateFile, params String[] elementIds)
        {
            if (templateFile == null)
            {
                throw new ArgumentNullException("templateFile");
            }
            if (elementIds == null)
            {
                throw new ArgumentNullException("elementIds");
            }
            this.truthDocument = new HtmlAgilityPack.HtmlDocument();
            this.truthDocument.Load(templateFile);
            Init(elementIds);
        }

        /// <summary>
        /// Creates the validation from an already loaded template document.
        /// </summary>
        /// <param name="templateDocument">The reference template document.</param>
        /// <param name="elementIds">Ids of the page elements whose position is checked.</param>
        /// <exception cref="ArgumentNullException"><paramref name="templateDocument"/> or <paramref name="elementIds"/> is null.</exception>
        public XPathValidation(HtmlAgilityPack.HtmlDocument templateDocument, params String[] elementIds)
        {
            if (templateDocument == null)
            {
                throw new ArgumentNullException("templateDocument");
            }
            if (elementIds == null)
            {
                throw new ArgumentNullException("elementIds");
            }
            this.truthDocument = templateDocument;
            Init(elementIds);
        }

        // Records the template XPath of every requested element id.
        void Init(String[] elementIds)
        {
            elementXPath = new Dictionary<String, String>(elementIds.Length);
            foreach (var id in elementIds)
            {
                AddXPathCheck(id);
            }
        }

        // Ids that are missing from the template, or that were already recorded, are ignored.
        void AddXPathCheck(String id)
        {
            var element = truthDocument.GetElementbyId(id);
            if (element == null)
            {
                return;
            }
            if (elementXPath.ContainsKey(id))
            {
                return;
            }
            elementXPath.Add(id, element.XPath);
        }

        /// <summary>
        /// Compares the XPath of every tracked element in <paramref name="document"/> with the template.
        /// </summary>
        /// <param name="document">The document to validate.</param>
        /// <param name="status">The staticize context of the page (not used by this validation).</param>
        /// <returns>Null when every element matches; otherwise the collected error text.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="document"/> is null.</exception>
        public string Validate(HtmlAgilityPack.HtmlDocument document, HtmlStaticizeContext status)
        {
            if (document == null)
            {
                throw new ArgumentNullException("document");
            }
            var errorMessageBuilder = new StringBuilder();
            foreach (var expected in this.elementXPath)
            {
                String id = expected.Key;
                String truthXPath = expected.Value;
                var element = document.GetElementbyId(id);
                if (element == null)
                {
                    errorMessageBuilder.AppendFormat("\r\n元素 \"{0}\" 在文档中不存在。", id);
                    continue;
                }
                if (element.XPath != truthXPath)
                {
                    errorMessageBuilder.AppendFormat("\r\n元素 \"{0}\" XPath 不匹配,应为\"{1}\",但实际为\"{2}\"。\r\n行号:{3}\r\n源HTML:\r\n{4}\r\n", id, truthXPath, element.XPath, element.Line.ToString(), element.OuterHtml);
                }
            }
            return errorMessageBuilder.Length == 0 ? null : errorMessageBuilder.ToString();
        }


        /// <summary>
        /// Short title of this validation.
        /// </summary>
        public string Name { get { return "页面元素XPath与模板XPath是否相符。"; } }

        /// <summary>
        /// Kind of this validation.
        /// </summary>
        public ValidationType Type { get { return ValidationType.Tag; } }


        public override string ToString() { return ((IValidation)this).Name; }

    }
}
30 | this.Errors = new List(); 31 | ValidationErrors = new List(); 32 | } 33 | 34 | internal void AddGeneratedPageCount() 35 | { 36 | volatileGeneratedPageCount++; 37 | generatedPageCount = volatileGeneratedPageCount; 38 | } 39 | 40 | internal void AddValidatedPageCount() 41 | { 42 | validatedPageCount++; 43 | } 44 | 45 | //public StaticizeStepStatus(string id) 46 | //{ 47 | // if (id == null) 48 | // { 49 | // throw new ArgumentNullException("id"); 50 | // } 51 | // this.Id = id; 52 | //} 53 | 54 | //public string Id { get; internal set; } 55 | internal void Init(HtmlStaticizeContext[] entries) 56 | { 57 | contexts = new Dictionary(entries.Length); 58 | for (int j = 0; j < entries.Length; j++) 59 | { 60 | contexts.Add(entries[j].uri, entries[j]); 61 | } 62 | Urls = contexts.Keys; 63 | Items = entries; 64 | } 65 | 66 | /// 67 | /// 获取验证错误信息 68 | /// 69 | public IList ValidationErrors { get; private set; } 70 | 71 | public HtmlStaticizeContext this[Uri uri] { get { return contexts[uri]; } } 72 | 73 | public ICollection Urls { get; private set; } 74 | 75 | /// 76 | /// 获取所有页面的静态化上下文 77 | /// 78 | public ICollection Items { get; private set; } 79 | 80 | /// 81 | /// 获取静态化过程中发生的异常信息 82 | /// 83 | public IList Errors { get; internal set; } 84 | 85 | public IEnumerator GetEnumerator() 86 | { 87 | return contexts.Values.GetEnumerator(); 88 | } 89 | 90 | System.Collections.IEnumerator System.Collections.IEnumerable.GetEnumerator() 91 | { 92 | return contexts.Values.GetEnumerator(); 93 | } 94 | 95 | StaticizeStep step; 96 | /// 97 | /// 获取当前静态化过程正处于哪个步骤 98 | /// 99 | public StaticizeStep Step 100 | { 101 | get { return step; } 102 | // 步骤变更后,会触发 StepChanged 事件 103 | internal set 104 | { 105 | var previus = step; 106 | step = value; 107 | if (previus != value) 108 | { 109 | var @event = StepChanged; 110 | if (@event != null) 111 | { 112 | @event(this, new StaticizeStepChangedEventArgs { Step = step, }); 113 | } 114 | } 115 | } 116 | } 117 | 118 | /// 119 | /// 阶段性状态变更事件。 120 
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace Rhythm.Staticize
{
    /// <summary>
    /// Checks that the resource files referenced by the HTML exist in the file system.
    /// It takes the CSS, JavaScript and image references recorded for the page and looks for
    /// the corresponding files under the local output directory.
    /// </summary>
    public class ReferenceResourcesExistingValidation : IValidation
    {
        // physical path -> whether the file was found when it was first checked.
        // Validations can run for several pages at once, so the cache must be thread-safe.
        readonly ConcurrentDictionary<String, Boolean> exisitingFiles = new ConcurrentDictionary<String, Boolean>();
        readonly String outputDir;
        readonly IUriResourcesFromLocalFileSystemReslover fileReslover;

        /// <summary>
        /// Creates the validation.
        /// </summary>
        /// <param name="resourceBaseDir">Base directory the resources were saved to; files are searched relative to it.</param>
        /// <param name="resourceFileReslover">Maps URIs to local file-system paths; the default resolver is used when null.</param>
        /// <exception cref="ArgumentNullException"><paramref name="resourceBaseDir"/> is null.</exception>
        public ReferenceResourcesExistingValidation(String resourceBaseDir, IUriResourcesFromLocalFileSystemReslover resourceFileReslover = null)
        {
            if (resourceBaseDir == null)
            {
                throw new ArgumentNullException("resourceBaseDir");
            }
            this.outputDir = resourceBaseDir;
            this.fileReslover = resourceFileReslover ?? DefaultUriToLocalFilePathReslover.Instance;
        }

        /// <summary>
        /// Validates the CSS, JavaScript and image references of the page.
        /// </summary>
        /// <param name="document">The loaded HTML document (not read by this validation).</param>
        /// <param name="status">The staticize context holding the page's resource lists.</param>
        /// <returns>Null when every referenced file exists; otherwise the combined error text.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="status"/> is null.</exception>
        public string Validate(HtmlAgilityPack.HtmlDocument document, HtmlStaticizeContext status)
        {
            if (status == null)
            {
                throw new ArgumentNullException("status");
            }
            var resources = status.Resources;
            String resultCss = Validate(resources.ReferenceCsses, status);
            String resultJs = Validate(resources.ReferenceJavascripts, status);
            String resultImage = Validate(resources.ReferenceImages, status);
            bool allFound =
                String.IsNullOrEmpty(resultCss) &&
                String.IsNullOrEmpty(resultImage) &&
                String.IsNullOrEmpty(resultJs);
            return allFound ? null : String.Format("{0}\r\n{1}\r\n{2}", resultCss, resultJs, resultImage);
        }

        /// <summary>
        /// Checks one list of resource URIs; returns null when all of them exist locally.
        /// </summary>
        string Validate(IList<Uri> list, HtmlStaticizeContext status)
        {
            if (list == null)
            {
                return null;
            }
            Boolean isValid = true;
            var errorMessage = new StringBuilder();
            foreach (var uri in list)
            {
                String fileName = fileReslover.ResloveLocalPath(uri);
                if (String.IsNullOrEmpty(fileName))
                {
                    continue;
                }
                String physicalFilePath = System.IO.Path.Combine(outputDir, fileName);

                // Hit the disk only the first time a path is seen; the outcome is cached.
                bool fileExists;
                if (!exisitingFiles.TryGetValue(physicalFilePath, out fileExists))
                {
                    fileExists = System.IO.File.Exists(physicalFilePath);
                    // TryAdd never throws when another thread stored the path first.
                    exisitingFiles.TryAdd(physicalFilePath, fileExists);
                }
                if (fileExists)
                {
                    continue;
                }

                isValid = false;
                // NOTE(review): depending on the type of NotExistsFiles, Add may throw when the
                // same URI is recorded twice - confirm against its declaration.
                status.Resources.NotExistsFiles.Add(uri, physicalFilePath);
                errorMessage.AppendFormat("资源 \"{0}\" 未能在本地预期的路径 \"{1}\" 中找到。\r\n", uri.ToString(), physicalFilePath);
                var ex = status.GenerationError;
                if (ex != null)
                {
                    // A failed request is a likely cause of a missing file, so report it as well.
                    errorMessage.AppendLine("这可能是由于请求文件时发生异常造成的,以下是异常信息:");
                    errorMessage.AppendFormat("{0}:\r\n{1}\r\n\r\n", ex.Message, ex.ToString());
                }
            }
            return isValid ? null : errorMessage.ToString();
        }

        /// <summary>
        /// Short title of this validation.
        /// </summary>
        public string Name { get { return "网页引用的资源文件是否存在"; } }

        /// <summary>
        /// Kind of this validation.
        /// </summary>
        public ValidationType Type { get { return ValidationType.Resource; } }


    }
}
System.Collections.Generic; 3 | using System.Linq; 4 | using System.Text; 5 | using System.Threading.Tasks; 6 | 7 | namespace Rhythm.Staticize 8 | { 9 | /// 10 | /// 提供对常用 HTML 验证方法的支持。 11 | /// 12 | public static class ValidationProjection 13 | { 14 | /// 15 | /// 验证 HTML 中是否包含指定内容。 16 | /// 17 | /// 18 | /// 19 | /// 20 | public static IValidation Contains(String value, String errorMessage = null) 21 | { 22 | return new ValidationDelegateTaken("页面是否存在指定内容", ValidationType.Content, (doc) => 23 | { 24 | var htmlNode = doc.DocumentNode; 25 | return htmlNode.InnerHtml.Contains(value); 26 | }, errorMessage ?? string.Format("页面中不存在预期的内容:\"{0}\"。", value)); 27 | } 28 | 29 | /// 30 | /// 验证 HTML 网页标题是否等于预期的标题。 31 | /// 32 | /// 预期的标题 33 | /// 34 | /// 35 | public static IValidation TitleEquals(String excepted, String errorMessage = null) 36 | { 37 | return new ValidationDelegateTaken("网页标题", ValidationType.Content, (doc) => 38 | { 39 | var htmlNode = doc.DocumentNode; 40 | var titleNode = htmlNode.SelectSingleNode(@"html/head/title"); 41 | if (titleNode == null) 42 | { 43 | return false; 44 | } 45 | return excepted == titleNode.InnerHtml; 46 | }, errorMessage ?? string.Format("预期的标题 \"{0}\"。", excepted)); 47 | } 48 | 49 | /// 50 | /// 验证 HTML DOM 中是否存在指定的元素。 51 | /// 52 | /// 53 | /// 54 | /// 55 | public static IValidation HasElement(String elementId, String errorMessage = null) 56 | { 57 | return new ValidationDelegateTaken("页面元素是否存在", ValidationType.Tag, (doc) => 58 | { 59 | var htmlNode = doc.DocumentNode; 60 | var element = doc.GetElementbyId(elementId); 61 | return element != null; 62 | }, errorMessage ?? 
string.Format("页面中不存在元素\"{0}\"。", elementId)); 63 | } 64 | 65 | /// 66 | /// 验证 HTML Docuemnt 是否包含指定的 CSS 文件。 67 | /// 68 | /// 69 | /// 错误信息。 70 | /// 71 | public static IValidation HasCssLink(String cssHref, String errorMessage = null) 72 | { 73 | return new ValidationDelegateTaken("CSS 标签是否存在", ValidationType.Tag , (doc) => 74 | { 75 | var htmlNode = doc.DocumentNode; 76 | var nodes = htmlNode.SelectNodes(@"//link[@rel='stylesheet']"); 77 | if (nodes == null) 78 | { 79 | return false; 80 | } 81 | return nodes.Where(n => cssHref == n.GetAttributeValue("href", null)).Count() > 0; 82 | }, errorMessage ?? string.Format("CSS 标签 \"{0}\" 不存在。", cssHref)); 83 | } 84 | 85 | /// 86 | /// 验证 HTML Docuemnt 是否包含指定的 JS 文件。 87 | /// 88 | /// 89 | /// 错误信息。 90 | /// 91 | public static IValidation HasScriptLink(String jsSrc, String errorMessage = null) 92 | { 93 | return new ValidationDelegateTaken("JS 标签是否存在", ValidationType.Tag, (doc) => 94 | { 95 | var htmlNode = doc.DocumentNode; 96 | var nodes = htmlNode.SelectNodes(@"//script[@src]"); 97 | if (nodes == null) 98 | { 99 | return false; 100 | } 101 | return nodes.Where(n => jsSrc == n.GetAttributeValue("src", null)).Count() > 0; 102 | }, errorMessage ?? 
using System;
using Microsoft.VisualStudio.TestTools.UnitTesting;
using System.Linq;
using System.Collections.Generic;
using Rhythm;
using Rhythm.Staticize;

namespace Rhythm.Staticize
{
    /// <summary>
    /// End-to-end sample: downloads a handful of live pages and writes them as static HTML
    /// into a fresh batch folder under the application base directory.
    /// </summary>
    [TestClass]
    public class StaticizeTest
    {
        /// <summary>
        /// Per-test initialization.
        /// </summary>
        [TestInitialize()]
        public void Initialize() { }

        /// <summary>
        /// Per-test cleanup.
        /// </summary>
        [TestCleanup]
        public void Cleanup() { }


        [TestMethod]
        public void StaticizeTest1()
        {
            // Batch number
            String batchId = CreateBatchId();

            // Output folder
            string outputDirectory = System.IO.Path.Combine(AppDomain.CurrentDomain.BaseDirectory, batchId);
            System.IO.Directory.CreateDirectory(outputDirectory);

            // Generate static HTML files for the URLs below; the files end up under bin.
            var urls = new[] {
                "http://www.zhihu.com/question/25519625",
                "http://www.zhihu.com/question/27232313",
                "http://www.zhihu.com/question/31291872",
                "http://www.zhihu.com/question/31293043",
                "http://www.zhihu.com/question/31318753",
                "http://cn.bing.com/",
                "http://36kr.com/"
            };

            // Note: images, CSS and JS referenced with a relative path (no http://host/) are
            // downloaded into the folder automatically by Staticize. Absolute references such as
            // http://img3.douban.com/misc/mixed_static/7011201580a8cbed.css are not downloaded.
            List<KeyValuePair<Uri, String>> pages = new List<KeyValuePair<Uri, String>>(urls.Length);
            for (int i = 0; i < urls.Length; i++)
            {
                // Every URL is included; the file names are numbered from 1.
                string outputFile = System.IO.Path.Combine(outputDirectory, string.Concat("zihu-", (i + 1).ToString(), ".html"));
                pages.Add(new KeyValuePair<Uri, String>(new Uri(urls[i]), outputFile));
            }

            CreateDirectory(pages, outputDirectory);

            Staticizer staticize = new Staticizer();

            staticize.AddBehavior(
                new ImageResourcesDownloadBehavior(outputDirectory)
               );

            //staticize.AddValidation(
            //    // check that the CSS links exist
            //    ValidationProjection.HasCssLink("/resources/css/jquery-ui-themes.css"),
            //    ValidationProjection.HasCssLink("/resources/css/axure_rp_page.css"),
            //    // check that the main page DOM element (by id) exists
            //    ValidationProjection.HasElement("main_container"),
            //    // check that the JS links exist
            //    ValidationProjection.HasScriptLink("/data/sitemap.js"),
            //    ValidationProjection.HasScriptLink("/resources/scripts/jquery-1.7.1.min.js"),
            //    ValidationProjection.HasScriptLink("/resources/scripts/axutils.js"),
            //    ValidationProjection.HasScriptLink("/resources/scripts/jquery-ui-1.8.10.custom.min.js"),
            //    ValidationProjection.HasScriptLink("/resources/scripts/axurerp_beforepagescript.js"),
            //    ValidationProjection.HasScriptLink("/resources/scripts/messagecenter.js")
            //    );

            //staticize.AddValidation(
            //    // check that the resources referenced by the HTML document exist
            //    ValidationProjection.ResourcesExisting(outputDirectory),
            //    // XPath
            //    ValidationProjection.XPathEquals("main_template.html", "main_container"),
            //    ValidationProjection.InternalALinkExisting(outputDirectory)
            //    );

            var stepTaken = new StaticizeStepStatus();

            var staticizeResults = staticize.Staticize(pages, stepTaken);
            Assert.IsNotNull(staticizeResults);

            var validationResults = staticizeResults.GetValidationResults();
            validationResults.Save(System.IO.Path.Combine(outputDirectory, "validationResults.txt"));
        }

        /// <summary>
        /// Pairs an address with the output file path derived from its URI path.
        /// </summary>
        public KeyValuePair<Uri, String> CreateUri(string address, String outputDirectory)
        {
            var uri = new Uri(address);
            string fileName = DefaultUriToLocalFilePathReslover.Instance.ResloveLocalPath(uri);
            return new KeyValuePair<Uri, String>(uri, System.IO.Path.Combine(outputDirectory, fileName));
        }

        // Creates, under the output directory, the folder each page URI points into.
        void CreateDirectory(IEnumerable<KeyValuePair<Uri, String>> pages, string outputDirectory)
        {
            foreach (var item in pages)
            {
                string pageDir = item.Key.GetFileDirectory();
                var dir = System.IO.Path.Combine(outputDirectory, pageDir.RemoveLastDirectorySeparator());
                System.IO.Directory.CreateDirectory(dir);
            }
        }

        /// <summary>
        /// Builds a batch id of the form "yyyyMMddHHmmss-xxxxxx", where the suffix is the first
        /// six hex digits of a new GUID.
        /// </summary>
        public static string CreateBatchId()
        {
            // The "N" format yields 32 hex digits without dashes, so nothing has to be replaced.
            String batchId = String.Format("{0}-{1}", System.DateTime.Now.ToString("yyyyMMddHHmmss"), Guid.NewGuid().ToString("N").Substring(0, 6));
            return batchId;
        }



    }
}
ResourcesDownloadBaseBehavior(String outputBaseDirectory, String resourcesHtmlNodeSelectPath, IUriResourcesFromLocalFileSystemReslover resourceFileReslover = null) 25 | { 26 | this.outputDirectory = outputBaseDirectory; 27 | this.resourcesNodeSelectPath = resourcesHtmlNodeSelectPath; 28 | this.fileReslover = resourceFileReslover ?? new DefaultUriToLocalFilePathReslover(); 29 | if (outputBaseDirectory == null) 30 | { 31 | throw new ArgumentNullException("outputBaseDirectory"); 32 | } 33 | if (resourcesHtmlNodeSelectPath == null) 34 | { 35 | throw new ArgumentNullException("resourcesHtmlNodeSelectPath"); 36 | } 37 | } 38 | 39 | Dictionary files = new Dictionary(100); 40 | Dictionary directories = new Dictionary(20); 41 | public void Process(HtmlAgilityPack.HtmlDocument document, HtmlStaticizeContext context) 42 | { 43 | Uri documentUri = context.Uri; 44 | String baseUrl = documentUri.GetParent();// GetParent(documentUri); 45 | 46 | var htmlNode = document.DocumentNode; 47 | var nodes = htmlNode.SelectNodes(resourcesNodeSelectPath); 48 | if (nodes == null || nodes.Count == 0) 49 | { 50 | return; 51 | } 52 | var srcAttributes = GetSrcAttributes(nodes); 53 | if (srcAttributes == null || srcAttributes.Count() == 0) 54 | { 55 | return; 56 | } 57 | var parsedSrcUris = ParseResourcesUris(documentUri, baseUrl, srcAttributes); 58 | if (parsedSrcUris == null || parsedSrcUris.Length == 0) 59 | { 60 | return; 61 | } 62 | 63 | OnResourceParsed(parsedSrcUris, context); 64 | 65 | for (int i = 0; i < parsedSrcUris.Length; i++) 66 | { 67 | var uri = parsedSrcUris[i]; 68 | String localPath = fileReslover.ResloveLocalPath(uri); 69 | if (String.IsNullOrEmpty(localPath)) 70 | { 71 | continue; 72 | } 73 | var localDirectory = System.IO.Path.Combine(outputDirectory, System.IO.Path.GetDirectoryName(localPath)); 74 | if (!directories.ContainsKey(localDirectory) && !System.IO.Directory.Exists(localDirectory)) 75 | { 76 | System.IO.Directory.CreateDirectory(localDirectory); 77 | //并发 patch 78 
| try 79 | { 80 | directories.Add(localDirectory, null); 81 | } 82 | catch (Exception) 83 | { 84 | } 85 | } 86 | 87 | String saveFile = System.IO.Path.Combine(outputDirectory, localPath); 88 | //并发 patch 89 | try 90 | { 91 | //已存在相同文件,则跳过。为避免并发写同一个文件。 92 | if (files.ContainsKey(saveFile)) 93 | { 94 | continue; 95 | } 96 | files.Add(saveFile, null); 97 | } 98 | catch (Exception) 99 | { 100 | } 101 | if (System.IO.File.Exists(saveFile)) 102 | { 103 | continue; 104 | } 105 | if (this.fileReslover.TryCopyFromLocal(uri, saveFile)) 106 | { 107 | continue; 108 | } 109 | using (System.Net.WebClient wc = new System.Net.WebClient()) 110 | { 111 | try 112 | { 113 | wc.DownloadFile(uri, saveFile); 114 | } 115 | catch (Exception ex) 116 | { 117 | //修复WebClient 文件不存在仍然本地保存了一个空文件 118 | System.IO.File.Delete(saveFile); 119 | context.Errors.Add(new ResourcesDownloadException(String.Format(@"下载资源 ""{0}"" 时发生异常。", uri.ToString()), ex) 120 | { 121 | Url = uri, 122 | }); 123 | continue; 124 | } 125 | } 126 | } 127 | } 128 | 129 | ///// 130 | ///// 获取页面的Uri所在的目录名称。(仅支持页面) 131 | ///// 132 | ///// 133 | ///// 上一级的URI Address。 134 | //static string GetParent(Uri uri) 135 | //{ 136 | // String documentUriString = uri.ToString(); 137 | // int lastSlash = documentUriString.LastIndexOf('/'); 138 | // String documenBaseDir = documentUriString.Substring(0, lastSlash); 139 | // if (lastSlash == -1) 140 | // { 141 | // documenBaseDir = documentUriString; 142 | // } 143 | // else 144 | // { 145 | // //特殊处理 146 | // //如果最后还是“/”(有的URL不标准,路径中有两个“//”,如http://localhost:90/Admin/Blogs///17.html) 147 | // while ( 148 | // documenBaseDir[documenBaseDir.Length - 1] == '/' || 149 | // documenBaseDir[documenBaseDir.Length - 1] == '\\' 150 | // ) 151 | // { 152 | // documenBaseDir = documenBaseDir.Substring(0, documenBaseDir.Length - 1); 153 | // } 154 | // } 155 | // return documenBaseDir; 156 | //} 157 | 158 | /// 159 | /// 获取资源 Html Node 的“src”属性。 160 | /// 161 | /// Html Node集合 162 | /// 排除空了值的“src”属性集合。 
163 | protected virtual string[] GetSrcAttributes(HtmlAgilityPack.HtmlNodeCollection nodes) 164 | { 165 | var srcAttributes = (from node in nodes 166 | where !String.IsNullOrWhiteSpace(node.GetAttributeValue("src", null)) 167 | select node.GetAttributeValue("src", null)).ToArray(); 168 | return srcAttributes; 169 | } 170 | 171 | /// 172 | /// 从“src”属性的值创建Uri对象。 173 | /// 174 | /// “src”属性集合。 175 | /// Uri对象集合。 176 | Uri[] ParseResourcesUris(Uri documentUri, string documenBaseDir, string[] srcAttributes) 177 | { 178 | Uri parseUri = null; 179 | var parsedImgSrcUris = (from src in srcAttributes 180 | select Uri.TryCreate(src, UriKind.RelativeOrAbsolute, out parseUri) ? parseUri : null).Where(m => m != null).ToArray(); 181 | 182 | var filter = (from uri in parsedImgSrcUris 183 | where uri.IsAbsoluteUri == false || (uri.IsAbsoluteUri && uri.Host == documentUri.Host) 184 | select uri.IsAbsoluteUri ? uri : CreateAbsoluteUri(documentUri, documenBaseDir, uri)).Distinct().ToArray(); 185 | return filter; 186 | } 187 | 188 | /// 189 | /// 当资源URL被正确解析,即将被下载时回调 190 | /// 191 | /// 当资源URL(集合) 192 | /// 193 | protected virtual void OnResourceParsed(Uri[] resourceUris, HtmlStaticizeContext context) 194 | { 195 | 196 | } 197 | 198 | 199 | Uri CreateAbsoluteUri(Uri documentUri, string documenBaseDir, Uri uri) 200 | { 201 | String str = uri.ToString(); 202 | if (str[0] == '/') 203 | { 204 | String format = String.Format("{0}://{1}:{2}{3}", documentUri.Scheme, documentUri.Host, documentUri.Port, uri.ToString()); 205 | return new Uri(format); 206 | } 207 | else 208 | { 209 | 210 | String format = String.Format("{0}/{1}", documenBaseDir, uri.ToString()); 211 | return new Uri(format); 212 | } 213 | } 214 | 215 | 216 | } 217 | 218 | 219 | } 220 | -------------------------------------------------------------------------------- /Rhythm.Staticize/Staticizer.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using 
System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Web;
using System.Net;
//using HtmlAgilityPack;

namespace Rhythm.Staticize
{
    /// <summary>
    /// Event delegate for changes of the staticize key step (StaticizeStep).
    /// </summary>
    /// <param name="sender">Object representing the current status.</param>
    /// <param name="e">Arguments of the step-changed event.</param>
    public delegate void StaticizeStepChangedEventHandler(StaticizeStepStatus sender, StaticizeStepChangedEventArgs e);

    /// <summary>
    /// Entry point of the staticize library.
    /// </summary>
    public class Staticizer
    {
        /// <summary>
        /// Runs the staticize process: downloads every page, then runs the registered behaviors
        /// and validations on each page that was generated and loaded successfully.
        /// </summary>
        /// <param name="pages">Pages to staticize. Key is the absolute URL of the page, Value the local path it is saved to. URLs and paths must be unique.</param>
        /// <param name="stepTaken">Status object updated while the process runs; lets other threads observe progress.</param>
        /// <returns>The same instance that was passed as <paramref name="stepTaken"/>.</returns>
        /// <exception cref="ArgumentNullException">pages or stepTaken is null.</exception>
        public StaticizeStepStatus Staticize(IEnumerable<KeyValuePair<Uri, string>> pages, StaticizeStepStatus stepTaken)
        {
            if (pages == null)
            {
                throw new ArgumentNullException("pages");
            }
            if (stepTaken == null)
            {
                throw new ArgumentNullException("stepTaken");
            }
            #region Initialization
            stepTaken.Step = StaticizeStep.Initialize;

            // Enumerate the input exactly once: counting and then iterating a lazy sequence
            // separately could yield two different lengths and overrun the array below.
            var pageList = pages.ToList();
            int pageCount = pageList.Count;
            stepTaken.pageCount = pageCount;
            // One context object per page.
            HtmlStaticizeContext[] entries = new HtmlStaticizeContext[pageCount];
            for (int i = 0; i < pageCount; i++)
            {
                var address = pageList[i];
                entries[i] = new HtmlStaticizeContext
                {
                    uri = address.Key,
                    fileName = address.Value,
                };
            }
            stepTaken.Init(entries);

            // Register the built-in validation once, even when Staticize is called repeatedly
            // on the same Staticizer instance.
            if (m_Validations == null || !m_Validations.Contains(GenerationSuccessfulValidation.Instance))
            {
                AddValidation(GenerationSuccessfulValidation.Instance);
            }

            #endregion


            stepTaken.Step = StaticizeStep.GenerationHtml;

            // Generate the HTML files.
            Generate(entries, stepTaken);
            stepTaken.Step = StaticizeStep.GenerationHtmlCompleted;

            #region Validation

            stepTaken.Step = StaticizeStep.Validation;
            if (
                (m_Behaviors != null && m_Behaviors.Count > 0)
                || (m_Validations != null && m_Validations.Count > 0)
                )
            {
                for (int j = 0; j < entries.Length; j++)
                {
                    var entry = entries[j];
                    // A non-null generationError means the HTML could not be generated.
                    if (entry.generationError != null)
                    {
                        RecordFailure(entry, stepTaken, "生成HTML期间发生错误:{0}\r\n{1}\r\n", entry.generationError);
                        continue;
                    }
                    // Load the document DOM; a load failure is reported like a generation failure.
                    var doc = new HtmlAgilityPack.HtmlDocument();
                    try
                    {
                        doc.Load(entry.fileName, System.Text.Encoding.UTF8);
                    }
                    catch (Exception ex)
                    {
                        entry.DocumentLoadError = ex;
                        RecordFailure(entry, stepTaken, "加载HTML文档树期间发生错误:{0}\r\n{1}\r\n", ex);
                        continue;
                    }

                    // Behaviors run before validations.
                    if (m_Behaviors != null && m_Behaviors.Count > 0)
                    {
                        for (int k = 0; k < m_Behaviors.Count; k++)
                        {
                            m_Behaviors[k].Process(doc, entry);
                        }
                    }
                    if (m_Validations != null && m_Validations.Count > 0)
                    {
                        Validate(doc, entry, stepTaken);
                    }
                    stepTaken.AddValidatedPageCount();
                }
            }
            stepTaken.Step = StaticizeStep.ValidationCompleted;

            #endregion

            // Copy the errors collected on each page context into the overall status.
            {
                var all = stepTaken.Errors;
                for (int i = 0; i < entries.Length; i++)
                {
                    var items = entries[i].Errors;
                    if (items != null && items.Count > 0)
                    {
                        all.AddRange(items);
                    }
                }
            }

            stepTaken.Step = StaticizeStep.Completed;
            return stepTaken;
        }

        /// <summary>
        /// Records a page whose HTML could not be generated or loaded: stores a single failure
        /// result on the page, adds it to the overall validation errors and counts the page as
        /// validated. The counter goes through AddValidatedPageCount on every path, matching the
        /// successful path of Staticize (the generation-failure path used to increment the field directly).
        /// </summary>
        /// <param name="messageFormat">Format string receiving the exception message ({0}) and its full text ({1}).</param>
        static void RecordFailure(HtmlStaticizeContext entry, StaticizeStepStatus stepTaken, string messageFormat, Exception ex)
        {
            var vd = new ValidationResult()
            {
                ValidationType = ValidationType.Tag,
                Uri = entry.uri,
                Name = "页面HTML是否成功生成。",
                Message = string.Format(messageFormat, ex.Message, ex.ToString()),
                Exception = ex,
            };
            entry.validationResults = new ValidationResult[] { vd };
            stepTaken.ValidationErrors.Add(vd);
            stepTaken.AddValidatedPageCount();
        }

        /// <summary>
        /// Runs the registered validations on one page and stores the results on both the page
        /// context and the overall status.
        /// </summary>
        /// <param name="doc">Loaded HTML document of the page.</param>
        /// <param name="context">Context of the page being validated.</param>
        /// <param name="stepTaken">Overall status that accumulates the validation results.</param>
        void Validate(HtmlAgilityPack.HtmlDocument doc, HtmlStaticizeContext context, StaticizeStepStatus stepTaken)
        {
            if (this.m_Validations == null)
            {
                return;
            }
            var result = m_Validations.Validate(doc, context);
            if (result == null || result.Count == 0)
            {
                return;
            }
            if (context.validationResults == null)
            {
                context.validationResults = result;
            }
            else
            {
                context.validationResults.AddRange(result);
            }
            stepTaken.ValidationErrors.AddRange(result);
        }

        /// <summary>
        /// Downloads every page to its local file, in parallel. A failed download is stored on the
        /// page context (generationError and Errors) instead of being thrown.
        /// </summary>
        /// <param name="entries">Page contexts to download.</param>
        /// <param name="step">Status whose generated-page counter is advanced per successful download.</param>
        void Generate(HtmlStaticizeContext[] entries, StaticizeStepStatus step)
        {
            System.Threading.Tasks.Parallel.ForEach(entries, (entry) =>
            {
                using (var wc = new WebClient())
                {
                    try
                    {
                        wc.DownloadFile(entry.uri, entry.fileName);
                        step.AddGeneratedPageCount();
                    }
                    catch (Exception ex)
                    {
                        // Some URLs may fail; keep the error on the page so the validation
                        // phase can report it.
                        entry.generationError = ex;
                        entry.Errors.Add(ex);
                    }
                }
            });
        }


        List<IBehavior> m_Behaviors;
        /// <summary>
        /// Adds behaviors. A behavior runs after the HTML has been generated and loaded, and
        /// before the validations, so it can read the HTML document.
        /// </summary>
        /// <param name="behaviors">Behaviors to add.</param>
        /// <returns>This instance, to allow chaining.</returns>
        /// <exception cref="ArgumentNullException">behaviors is null.</exception>
        public Staticizer AddBehavior(params IBehavior[] behaviors)
        {
            if (behaviors == null)
            {
                throw new ArgumentNullException("behaviors");
            }
            if (this.m_Behaviors == null)
            {
                this.m_Behaviors = new List<IBehavior>(behaviors);
            }
            else
            {
                this.m_Behaviors.AddRange(behaviors);
            }
            return this;
        }

        List<IValidation> m_Validations;


        /// <summary>
        /// Adds custom validation rules.
        /// </summary>
        /// <param name="validations">Custom validation rules.</param>
        /// <returns>This instance, to allow chaining.</returns>
        /// <exception cref="ArgumentNullException">validations is null.</exception>
        public Staticizer AddValidation(params IValidation[] validations)
        {
            if (validations == null)
            {
                throw new ArgumentNullException("validations");
            }
            if (this.m_Validations == null)
            {
                this.m_Validations = new List<IValidation>(validations);
            }
            else
            {
                this.m_Validations.AddRange(validations);
            }
            return this;
        }

    }
}
--------------------------------------------------------------------------------