├── .idea
├── encodings.xml
├── misc.xml
├── modules.xml
├── simHtml.iml
├── vcs.xml
└── workspace.xml
├── .travis.yml
├── README.md
├── README_zh.md
├── example
├── data
│ ├── exp1.html
│ └── exp2.html
└── test.go
├── go.mod
├── go.sum
└── simHtml
├── calSim.go
├── getSim.go
└── utils.go
/.idea/encodings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/.idea/simHtml.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/workspace.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
22 |
23 |
24 |
25 | rune
26 |
27 |
28 | string
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 |
117 |
118 |
119 |
120 |
129 |
130 |
131 |
143 |
144 |
145 |
146 | true
147 | https://goproxy.io
148 |
149 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: go
2 | go:
3 | - 1.13.x
4 | env:
5 | - GO111MODULE=on
6 | os:
7 | - linux
8 | - osx
9 | - windows
10 | sudo: false
11 | install: true
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # simHtml
2 |
3 | [](https://travis-ci.com/cckuailong/simHtml)
4 |
5 | [English ReadMe](https://github.com/cckuailong/simHtml/blob/master/README.md) ||
6 | [中文 ReadMe](https://github.com/cckuailong/simHtml/blob/master/README_zh.md)
7 |
8 | ## Introduction
9 |
10 | This package provides functions to measure the similarity between web pages.
11 |
12 | ## Install
13 |
14 | The quick way:
15 |
16 | go get -u github.com/cckuailong/simHtml
17 |
18 | ## How it works?
19 |
20 | ### Structural Similarity
21 |
22 | Uses sequence comparison of the html tags to compute the similarity.
23 |
24 | We do not implement similarity based on tree edit distance because it is slower than sequence comparison.
25 |
26 |
27 | ### Style Similarity
28 |
29 | Calculates the similarity of Attributes of class and style.
30 |
31 |
32 | ### Joint Similarity (Structural Similarity and Style Similarity)
33 |
34 | The joint similarity metric is calculated as:
35 |
36 | k * structural_similarity(document_1, document_2) + (1 - k) * style_similarity(document_1, document_2)
37 |
38 | All the similarity metrics take values between 0 and 1.
39 |
40 | ### Recommendations for joint similarity
41 |
42 | Using `k=0.3` gives us better results. The style similarity gives more information about the similarity than the structural similarity does.
43 |
44 | ## Functions
45 |
46 | - GetSimFromFile(file1, file2 string) float64
47 |
48 | ```
49 | In [1]: 1.html's content is
50 | '''
51 |
52 |
First Document
53 |
57 |
58 | '''
59 |
60 | In [2]: 2.html's content is
61 | '''
62 |
63 | Second document Document
64 |
67 |
68 | '''
69 |
70 | In [3]: import "github.com/cckuailong/simHtml/simHtml"
71 |
72 | In [4]: simHtml.GetSimFromFile("./1.html", "./2.html")
73 | Out[4]: 0.9727272727272727
74 | ```
75 |
76 | - GetSimFromStr(str1, str2 string) float64
77 |
78 | - GetSimFromUrl(url1, url2 string) float64
79 |
80 | ### References
81 |
82 | - html-similarity
--------------------------------------------------------------------------------
/README_zh.md:
--------------------------------------------------------------------------------
1 | # simHtml
2 |
3 | [](https://travis-ci.com/cckuailong/simHtml)
4 |
5 | [English Readme](https://github.com/cckuailong/simHtml/blob/master/README.md) ||
6 | [中文 Readme](https://github.com/cckuailong/simHtml/blob/master/README_zh.md)
7 |
8 | ## 介绍
9 |
10 | simHtml包提供了一些用于计算Web页面相似度的函数
11 |
12 | ## 安装
13 |
14 | 快速安装:
15 |
16 | go get -u github.com/cckuailong/simHtml
17 |
18 | ## 原理
19 |
20 | ### 网页结构相似度
21 |
22 | 使用序列比较方法(最长公共子序列)来计算dom树的相似度。
23 |
24 | ### 元素类型相似度
25 |
26 | 计算class 和 style的相似度。
27 |
28 | ### 整合 网页结构相似度 和 元素类型相似度
29 |
30 | 整合算法:
31 |
32 | k * structural_similarity(document_1, document_2) + (1 - k) * style_similarity(document_1, document_2)
33 |
34 | 相似度取值在0-1之间
35 |
36 | ### k取值建议
37 |
38 | 使用 `k=0.3` 可以获得更好的结果。 元素类型相似度 包含的信息更多,更精确。
39 |
40 | ## 函数
41 |
42 | - GetSimFromFile(file1, file2 string) float64
43 |
44 | ```
45 | In [1]: 1.html's content is
46 | '''
47 |
48 | First Document
49 |
53 |
54 | '''
55 |
56 | In [2]: 2.html's content is
57 | '''
58 |
59 | Second document Document
60 |
63 |
64 | '''
65 |
66 | In [3]: import "github.com/cckuailong/simHtml/simHtml"
67 |
68 | In [4]: simHtml.GetSimFromFile("./1.html", "./2.html")
69 | Out[4]: 0.9727272727272727
70 | ```
71 |
72 | - GetSimFromStr(str1, str2 string) float64
73 |
74 | - GetSimFromUrl(url1, url2 string) float64
75 |
76 | ### 参考
77 |
78 | - html-similarity
--------------------------------------------------------------------------------
/example/data/exp1.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | Hello, world!
6 |
7 |
8 |
9 | Hello, world ya!
10 | Lorem ipsum, dolor
11 | sit amet.
12 |
13 |
14 |
Lorem ipsum, dolor sit amet.
15 |
16 |
17 | opa
18 |
19 |
20 | DIV
21 |
22 |
hahahahha
23 |
24 |
25 |
26 |
Lorem ipsum, dolor sit amet.
27 |
Hello, world!
28 |
29 |
30 |
31 |
--------------------------------------------------------------------------------
/example/data/exp2.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | Hello, world!
6 |
7 |
8 |
9 | Hello, world!
10 | Lorem ipsum, dolor
11 | sit amet.
12 |
13 |
14 |
15 | opa
16 |
17 |
18 | DIV
19 |
20 |
hahahahha
21 |
22 |
23 |
24 |
25 |
26 |
--------------------------------------------------------------------------------
/example/test.go:
--------------------------------------------------------------------------------
1 | package example
2 |
3 | import (
4 | "fmt"
5 | "github.com/cckuailong/simHtml/simHtml"
6 | )
7 |
8 | func testFileInterface(){
9 | f1 := "example/data/exp1.html"
10 | f2 := "example/data/exp2.html"
11 | fmt.Println(simHtml.GetSimFromFile(f1, f2))
12 | }
13 |
14 | func testStrInterface(){
15 | html1 := ""
16 | html2 := ""
17 | fmt.Println(simHtml.GetSimFromStr(html1, html2))
18 | }
19 |
20 | func testUrlInterface(){
21 | url1 := "http://lovebear.top/2020/01/08/PyBgpStream_Install/"
22 | url2 := "http://lovebear.top/2020/01/06/Grafana_Install_And_Config/"
23 | fmt.Println(simHtml.GetSimFromUrl(url1, url2))
24 | }
25 |
--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/cckuailong/simHtml
2 |
3 | go 1.13
4 |
5 | require github.com/PuerkitoBio/goquery v1.5.0
6 |
--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
1 | github.com/PuerkitoBio/goquery v1.5.0 h1:uGvmFXOA73IKluu/F84Xd1tt/z07GYm8X49XKHP7EJk=
2 | github.com/PuerkitoBio/goquery v1.5.0/go.mod h1:qD2PgZ9lccMbQlc7eEOjaeRlFQON7xY8kdmcsrnKqMg=
3 | github.com/andybalholm/cascadia v1.0.0 h1:hOCXnnZ5A+3eVDX8pvgl4kofXv2ELss0bKcqRySc45o=
4 | github.com/andybalholm/cascadia v1.0.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
5 | github.com/cckuailong/simHtml v0.0.0-20200113160512-f1af987b2803 h1:8a9v6KbIC2PaTDZGG2DuvLXVjTA4sQhp0VuC9y3QIEs=
6 | golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
7 | golang.org/x/net v0.0.0-20181114220301-adae6a3d119a h1:gOpx8G595UYyvj8UK4+OFyY4rx037g3fmfhe5SasG3U=
8 | golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
9 |
--------------------------------------------------------------------------------
/simHtml/calSim.go:
--------------------------------------------------------------------------------
1 | package simHtml
2 |
3 | import (
4 | "github.com/PuerkitoBio/goquery"
5 | "strings"
6 | )
7 |
8 | func getDomCssList(doc *goquery.Document) ([]string, []string){
9 | queue := []*goquery.Selection{}
10 | dom_res, css_res := []string{}, []string{}
11 | queue = append(queue, doc.Selection)
12 | for len(queue) > 0{
13 | cur_sel := queue[0]
14 | queue = queue[1:]
15 | if len(cur_sel.Nodes)==0{
16 | continue
17 | }
18 | for _, c := range(cur_sel.Nodes){
19 | dom_res = append(dom_res, c.Data)
20 | for _,item := range(c.Attr){
21 | key := strings.ToLower(item.Key)
22 | if key == "class" || key == "style"{
23 | css_res = append(css_res, item.Val)
24 | }
25 | }
26 | }
27 | queue = append(queue, cur_sel.Children())
28 | }
29 | return dom_res[1:], css_res
30 | }
31 |
32 | func getSimRate(doc1, doc2 *goquery.Document) float64{
33 | var domRate, cssRate float64
34 | domList1, cssList1 := getDomCssList(doc1)
35 | domList2, cssList2 := getDomCssList(doc2)
36 | domSimNum := LongestCommonSubsequence(domList1, domList2)
37 | cssSimNum := LongestCommonSubsequence(cssList1, cssList2)
38 | domLen := len(domList1)+len(domList2)
39 | cssLen := len(cssList1)+len(cssList2)
40 | if domLen == 0{
41 | domRate = 0
42 | }else{
43 | domRate = float64(2*domSimNum)/float64(domLen)
44 | }
45 | if cssLen == 0{
46 | cssRate = 0
47 | }else{
48 | cssRate = float64(2*cssSimNum)/float64(cssLen)
49 | }
50 | return 0.3*domRate + 0.7*cssRate
51 | }
52 |
--------------------------------------------------------------------------------
/simHtml/getSim.go:
--------------------------------------------------------------------------------
1 | package simHtml
2 |
3 | import (
4 | "bytes"
5 | "crypto/tls"
6 | "fmt"
7 | "github.com/PuerkitoBio/goquery"
8 | "io/ioutil"
9 | "net/http"
10 | "strings"
11 | )
12 |
13 | // File interface
14 | func GetSimFromFile(file1, file2 string) float64{
15 | cont1, _ := ioutil.ReadFile(file1)
16 | doc1, _ := goquery.NewDocumentFromReader(bytes.NewReader(cont1))
17 | cont2, _ := ioutil.ReadFile(file2)
18 | doc2, _ := goquery.NewDocumentFromReader(bytes.NewReader(cont2))
19 | return getSimRate(doc1, doc2)
20 | }
21 |
22 | // String interface
23 | func GetSimFromStr(html1, html2 string) float64{
24 | doc1, _ := goquery.NewDocumentFromReader(strings.NewReader(html1))
25 | doc2, _ := goquery.NewDocumentFromReader(strings.NewReader(html2))
26 | return getSimRate(doc1, doc2)
27 | }
28 |
29 | // Url interface
30 | func GetSimFromUrl(url1, url2 string) float64{
31 | http.DefaultClient.Transport = &http.Transport{
32 | TLSClientConfig: &tls.Config{
33 | InsecureSkipVerify: true,
34 | },
35 | }
36 | resp1, err := http.Get(url1)
37 | if err != nil{
38 | fmt.Println(err)
39 | return 0
40 | }
41 | defer resp1.Body.Close()
42 | resp2, err := http.Get(url2)
43 | if err != nil{
44 | fmt.Println(err)
45 | return 0
46 | }
47 | defer resp2.Body.Close()
48 | doc1, _ := goquery.NewDocumentFromReader(resp1.Body)
49 | doc2, _ := goquery.NewDocumentFromReader(resp2.Body)
50 | return getSimRate(doc1, doc2)
51 | }
52 |
--------------------------------------------------------------------------------
/simHtml/utils.go:
--------------------------------------------------------------------------------
1 | package simHtml
2 |
3 | func LongestCommonSubsequence(text1, text2 []string) int {
4 | m, n := len(text1), len(text2)
5 | up := make([]int, n+2)
6 | var a, b, c, tmp, maximum int
7 | for i := 1; i <= m; i++ {
8 | for j := 1; j <= n; j++ {
9 | if text1[i-1] == text2[j-1] {
10 | tmp = a + 1
11 | } else {
12 | tmp = getMax(b, c)
13 | }
14 | if tmp > maximum {
15 | maximum = tmp
16 | }
17 | c = tmp
18 | a = b
19 | up[j] = tmp
20 | b = up[j+1]
21 | }
22 | a = 0
23 | b = up[1]
24 | c = 0
25 | }
26 | return maximum
27 | }
28 |
// getMax returns the larger of a and b; when they are equal it returns b.
func getMax(a, b int) int {
	larger := b
	if a > b {
		larger = a
	}
	return larger
}
35 |
--------------------------------------------------------------------------------