├── .idea ├── encodings.xml ├── misc.xml ├── modules.xml ├── simHtml.iml ├── vcs.xml └── workspace.xml ├── .travis.yml ├── README.md ├── README_zh.md ├── example ├── data │ ├── exp1.html │ └── exp2.html └── test.go ├── go.mod ├── go.sum └── simHtml ├── calSim.go ├── getSim.go └── utils.go /.idea/encodings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/simHtml.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.idea/workspace.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 15 | 16 | 22 | 23 | 24 | 25 | rune 26 | 27 | 28 | string 29 | 30 | 31 | 32 | 33 | 35 | 36 | 38 | 39 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 118 | 119 | 130 | 144 | 145 | 146 | true 147 | https://goproxy.io 148 | 149 | 
-------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: go 2 | go: 3 | - 1.13.x 4 | env: 5 | - GO111MODULE=on 6 | os: 7 | - linux 8 | - osx 9 | - windows 10 | sudo: false 11 | install: true -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # simHtml 2 | 3 | [![Build Status](https://travis-ci.com/cckuailong/simHtml.svg?branch=master)](https://travis-ci.com/cckuailong/simHtml) 4 | 5 | [English ReadMe](https://github.com/cckuailong/simHtml/blob/master/README.md) || 6 | [中文 ReadMe](https://github.com/cckuailong/simHtml/blob/master/README_zh.md) 7 | 8 | ## Introduction 9 | 10 | This package provides functions to measure the similarity between web pages. 11 | 12 | ## Install 13 | 14 | The quick way:: 15 | 16 | go get -u github.com/cckuailong/simHtml 17 | 18 | ## How does it work? 19 | 20 | ### Structural Similarity 21 | 22 | Uses sequence comparison of the HTML tags to compute the similarity. 23 | 24 | We do not implement the similarity based on tree edit distance because it is slower than sequence comparison. 25 | 26 | 27 | ### Style Similarity 28 | 29 | Calculates the similarity of the `class` and `style` attributes. 30 | 31 | 32 | ### Joint Similarity (Structural Similarity and Style Similarity) 33 | 34 | The joint similarity metric is calculated as:: 35 | 36 | k * structural_similarity(document_1, document_2) + (1 - k) * style_similarity(document_1, document_2) 37 | 38 | All the similarity metrics take values between 0 and 1. 39 | 40 | ### Recommendations for joint similarity 41 | 42 | Using `k=0.3` gives us better results. The style similarity gives more information about the similarity than the structural similarity. 
43 | 44 | ## Functions 45 | 46 | - GetSimFromFile(file1, file2 string) float64 47 | 48 | ``` 49 | In [1]: 1.html's content is 50 | ''' 51 | 52 |

First Document

53 | 57 | 58 | ''' 59 | 60 | In [2]: 2.html's content is 61 | ''' 62 | 63 |

Second document Document

64 | 67 | 68 | ''' 69 | 70 | In [3]: import "github.com/cckuailong/simHtml/simHtml" 71 | 72 | In [4]: simHtml.GetSimFromFile("./1.html", "./2.html") 73 | Out[4]: 0.9727272727272727 74 | ``` 75 | 76 | - GetSimFromStr(str1, str2 string) float64 77 | 78 | - GetSimFromUrl(url1, url2 string) float64 79 | 80 | ### References 81 | 82 | - html-similarity -------------------------------------------------------------------------------- /README_zh.md: -------------------------------------------------------------------------------- 1 | # simHtml 2 | 3 | [![Build Status](https://travis-ci.com/cckuailong/simHtml.svg?branch=master)](https://travis-ci.com/cckuailong/simHtml) 4 | 5 | [English Readme](https://github.com/cckuailong/simHtml/blob/master/README.md) || 6 | [中文 Readme](https://github.com/cckuailong/simHtml/blob/master/README_zh.md) 7 | 8 | ## 介绍 9 | 10 | simHtml包提供了一些用于计算Web页面相似度的函数 11 | 12 | ## 安装 13 | 14 | 快速安装: 15 | 16 | go get -u github.com/cckuailong/simHtml 17 | 18 | ## 原理 19 | 20 | ### 网页结构相似度 21 | 22 | 使用序列比较方法(最长公共子序列)来计算dom树的相似度。 23 | 24 | ### 元素类型相似度 25 | 26 | 计算class 和 style的相似度。 27 | 28 | ### 整合 网页结构相似度 和 元素类型相似度 29 | 30 | 整合算法: 31 | 32 | k * structural_similarity(document_1, document_2) + (1 - k) * style_similarity(document_1, document_2) 33 | 34 | 相似度取值在0-1之间 35 | 36 | ### k取值建议 37 | 38 | 使用 `k=0.3` 可以获得更好的结果。 元素类型相似度 包含的信息更多,更精确。 39 | 40 | ## 函数 41 | 42 | - GetSimFromFile(file1, file2 string) float64 43 | 44 | ``` 45 | In [1]: 1.html's content is 46 | ''' 47 | 48 |

First Document

49 | 53 | 54 | ''' 55 | 56 | In [2]: 2.html's content is 57 | ''' 58 | 59 |

Second document Document

60 | 63 | 64 | ''' 65 | 66 | In [3]: import "github.com/cckuailong/simHtml/simHtml" 67 | 68 | In [4]: simHtml.GetSimFromFile("./1.html", "./2.html") 69 | Out[4]: 0.9727272727272727 70 | ``` 71 | 72 | - GetSimFromStr(str1, str2 string) float64 73 | 74 | - GetSimFromUrl(url1, url2 string) float64 75 | 76 | ### 参考 77 | 78 | - html-similarity -------------------------------------------------------------------------------- /example/data/exp1.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Hello, world! 6 | 7 | 8 | 9 |

Hello, world ya!

10 |

Lorem ipsum, dolor
11 | sit amet.

12 |
13 |
14 |

Lorem ipsum, dolor sit amet.

15 |
16 |

17 | opa 18 |

19 |
20 | DIV 21 |
22 | hahahahha 23 |
24 |
25 |
26 |

Lorem ipsum, dolor sit amet.

27 |

Hello, world!

28 |
29 | 30 | 31 | -------------------------------------------------------------------------------- /example/data/exp2.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Hello, world! 6 | 7 | 8 | 9 |

Hello, world!

10 |

Lorem ipsum, dolor
11 | sit amet.

12 |
13 |
14 |

15 | opa 16 |

17 |
18 | DIV 19 |
20 | hahahahha 21 |
22 |
23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /example/test.go: -------------------------------------------------------------------------------- 1 | package example 2 | 3 | import ( 4 | "fmt" 5 | "github.com/cckuailong/simHtml/simHtml" 6 | ) 7 | 8 | /* testFileInterface prints the similarity score that simHtml.GetSimFromFile computes for the two example HTML files under example/data. */ func testFileInterface(){ 9 | f1 := "example/data/exp1.html" 10 | f2 := "example/data/exp2.html" 11 | fmt.Println(simHtml.GetSimFromFile(f1, f2)) 12 | } 13 | 14 | /* testStrInterface prints the similarity score that simHtml.GetSimFromStr computes for two HTML strings (both are empty strings here). */ func testStrInterface(){ 15 | html1 := "" 16 | html2 := "" 17 | fmt.Println(simHtml.GetSimFromStr(html1, html2)) 18 | } 19 | 20 | /* testUrlInterface prints the similarity score that simHtml.GetSimFromUrl computes for the pages at two URLs. */ func testUrlInterface(){ 21 | url1 := "http://lovebear.top/2020/01/08/PyBgpStream_Install/" 22 | url2 := "http://lovebear.top/2020/01/06/Grafana_Install_And_Config/" 23 | fmt.Println(simHtml.GetSimFromUrl(url1, url2)) 24 | } 25 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/cckuailong/simHtml 2 | 3 | go 1.13 4 | 5 | require github.com/PuerkitoBio/goquery v1.5.0 6 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/PuerkitoBio/goquery v1.5.0 h1:uGvmFXOA73IKluu/F84Xd1tt/z07GYm8X49XKHP7EJk= 2 | github.com/PuerkitoBio/goquery v1.5.0/go.mod h1:qD2PgZ9lccMbQlc7eEOjaeRlFQON7xY8kdmcsrnKqMg= 3 | github.com/andybalholm/cascadia v1.0.0 h1:hOCXnnZ5A+3eVDX8pvgl4kofXv2ELss0bKcqRySc45o= 4 | github.com/andybalholm/cascadia v1.0.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y= 5 | github.com/cckuailong/simHtml v0.0.0-20200113160512-f1af987b2803 h1:8a9v6KbIC2PaTDZGG2DuvLXVjTA4sQhp0VuC9y3QIEs= 6 | golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= 7 | golang.org/x/net v0.0.0-20181114220301-adae6a3d119a h1:gOpx8G595UYyvj8UK4+OFyY4rx037g3fmfhe5SasG3U= 8 | 
golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= 9 | -------------------------------------------------------------------------------- /simHtml/calSim.go: -------------------------------------------------------------------------------- 1 | package simHtml 2 | 3 | import ( 4 | "github.com/PuerkitoBio/goquery" 5 | "strings" 6 | ) 7 | 8 | func getDomCssList(doc *goquery.Document) ([]string, []string){ 9 | queue := []*goquery.Selection{} 10 | dom_res, css_res := []string{}, []string{} 11 | queue = append(queue, doc.Selection) 12 | for len(queue) > 0{ 13 | cur_sel := queue[0] 14 | queue = queue[1:] 15 | if len(cur_sel.Nodes)==0{ 16 | continue 17 | } 18 | for _, c := range(cur_sel.Nodes){ 19 | dom_res = append(dom_res, c.Data) 20 | for _,item := range(c.Attr){ 21 | key := strings.ToLower(item.Key) 22 | if key == "class" || key == "style"{ 23 | css_res = append(css_res, item.Val) 24 | } 25 | } 26 | } 27 | queue = append(queue, cur_sel.Children()) 28 | } 29 | return dom_res[1:], css_res 30 | } 31 | 32 | func getSimRate(doc1, doc2 *goquery.Document) float64{ 33 | var domRate, cssRate float64 34 | domList1, cssList1 := getDomCssList(doc1) 35 | domList2, cssList2 := getDomCssList(doc2) 36 | domSimNum := LongestCommonSubsequence(domList1, domList2) 37 | cssSimNum := LongestCommonSubsequence(cssList1, cssList2) 38 | domLen := len(domList1)+len(domList2) 39 | cssLen := len(cssList1)+len(cssList2) 40 | if domLen == 0{ 41 | domRate = 0 42 | }else{ 43 | domRate = float64(2*domSimNum)/float64(domLen) 44 | } 45 | if cssLen == 0{ 46 | cssRate = 0 47 | }else{ 48 | cssRate = float64(2*cssSimNum)/float64(cssLen) 49 | } 50 | return 0.3*domRate + 0.7*cssRate 51 | } 52 | -------------------------------------------------------------------------------- /simHtml/getSim.go: -------------------------------------------------------------------------------- 1 | package simHtml 2 | 3 | import ( 4 | "bytes" 5 | "crypto/tls" 6 | "fmt" 7 | 
"github.com/PuerkitoBio/goquery" 8 | "io/ioutil" 9 | "net/http" 10 | "strings" 11 | ) 12 | 13 | // File interface 14 | func GetSimFromFile(file1, file2 string) float64{ 15 | cont1, _ := ioutil.ReadFile(file1) 16 | doc1, _ := goquery.NewDocumentFromReader(bytes.NewReader(cont1)) 17 | cont2, _ := ioutil.ReadFile(file2) 18 | doc2, _ := goquery.NewDocumentFromReader(bytes.NewReader(cont2)) 19 | return getSimRate(doc1, doc2) 20 | } 21 | 22 | // String interface 23 | func GetSimFromStr(html1, html2 string) float64{ 24 | doc1, _ := goquery.NewDocumentFromReader(strings.NewReader(html1)) 25 | doc2, _ := goquery.NewDocumentFromReader(strings.NewReader(html2)) 26 | return getSimRate(doc1, doc2) 27 | } 28 | 29 | // Url interface 30 | func GetSimFromUrl(url1, url2 string) float64{ 31 | http.DefaultClient.Transport = &http.Transport{ 32 | TLSClientConfig: &tls.Config{ 33 | InsecureSkipVerify: true, 34 | }, 35 | } 36 | resp1, err := http.Get(url1) 37 | if err != nil{ 38 | fmt.Println(err) 39 | return 0 40 | } 41 | defer resp1.Body.Close() 42 | resp2, err := http.Get(url2) 43 | if err != nil{ 44 | fmt.Println(err) 45 | return 0 46 | } 47 | defer resp2.Body.Close() 48 | doc1, _ := goquery.NewDocumentFromReader(resp1.Body) 49 | doc2, _ := goquery.NewDocumentFromReader(resp2.Body) 50 | return getSimRate(doc1, doc2) 51 | } 52 | -------------------------------------------------------------------------------- /simHtml/utils.go: -------------------------------------------------------------------------------- 1 | package simHtml 2 | 3 | func LongestCommonSubsequence(text1, text2 []string) int { 4 | m, n := len(text1), len(text2) 5 | up := make([]int, n+2) 6 | var a, b, c, tmp, maximum int 7 | for i := 1; i <= m; i++ { 8 | for j := 1; j <= n; j++ { 9 | if text1[i-1] == text2[j-1] { 10 | tmp = a + 1 11 | } else { 12 | tmp = getMax(b, c) 13 | } 14 | if tmp > maximum { 15 | maximum = tmp 16 | } 17 | c = tmp 18 | a = b 19 | up[j] = tmp 20 | b = up[j+1] 21 | } 22 | a = 0 23 | b = up[1] 24 | c 
= 0 25 | } 26 | return maximum 27 | } 28 | 29 | func getMax(a, b int) int { 30 | if a > b { 31 | return a 32 | } 33 | return b 34 | } 35 | --------------------------------------------------------------------------------