├── .gitignore ├── go.mod ├── LICENSE.md ├── README.md ├── go.sum ├── readability_test.go ├── helpers.go └── readability.go /.gitignore: -------------------------------------------------------------------------------- 1 | /.golangcilint-* 2 | /res 3 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/cixtor/readability 2 | 3 | go 1.14 4 | 5 | require golang.org/x/net v0.8.0 6 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright (c) 2010 Arc90 Inc 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Readability 2 | 3 | Readability is a library written in Go (golang) to parse, analyze and convert HTML pages into readable content. Originally an Arc90 Experiment, it is now incorporated into Safari’s Reader View. 4 | 5 | > Despite the ubiquity of reading on the web, readers remain a neglected audience. Much of our talk about web design revolves around a sense of movement: users are thought to be finding, searching, skimming, looking. We measure how frequently they click but not how long they stay on the page. We concern ourselves with their travel and participation–how they move from page to page, who they talk to when they get there–but forget the needs of those whose purpose is to be still. Readers flourish when they have space–some distance from the hubbub of the crowds–and as web designers, there is yet much we can do to help them carve out that space. 
6 | > 7 | > [In Defense Of Readers](http://alistapart.com/articles/indefenseofreaders), by [Mandy Brown](http://www.aworkinglibrary.com/) 8 | 9 | ## Evolution of Readability Web Engines 10 | 11 | | Product | Year | Shutdown | 12 | |---------|------|----------| 13 | | [Instapaper](https://www.instapaper.com/) | 2008 | N/A | 14 | | [Arc90 Readability](https://code.google.com/archive/p/arc90labs-readability/) | 2009 | [Sep 30, 2016](https://medium.com/@readability/the-readability-bookmarking-service-will-shut-down-on-september-30-2016-1641cc18e02b) | 15 | | [Apple Readability](https://developer.apple.com/documentation/safariextensions/safarireader) | 2010 | N/A | 16 | | [Microsoft Reading View](https://docs.microsoft.com/en-us/microsoft-edge/dev-guide/browser-features/reading-view) | 2014 | N/A | 17 | | [Mozilla Readability](https://github.com/mozilla/readability) | 2015 | N/A | 18 | | [Mercury Reader](https://mercury.postlight.com/) | 2016 | [Apr 15, 2019](https://www.reddit.com/r/mac/comments/apkhzs/a/) | 19 | 20 | ## Reader Mode Parser Diversity 21 | 22 | All modern web browsers, except for Google Chrome, include an option to parse, analyze, and extract the main content from web pages to provide what is commonly known as “Reading Mode”. Reading Mode is a separate web rendering mode that strips out repeated and irrelevant content, this allows the web browser to extract the main content and display it cleanly and consistently to the user. 23 | 24 | | Vendor | Product | Parser | Environments | 25 | |--------|---------|--------|--------------| 26 | | Mozilla | Firefox | Mozilla Readability | Desktop and Android | 27 | | GNOME | Web | Mozilla Readability | Desktop | 28 | | Vivaldi | Vivaldi | Mozilla Readability | Desktop | 29 | | Yandex | Browser | Mozilla Readability | Desktop | 30 | | Samsung | Browser | Mozilla Readability | Android | 31 | | Apple | Safari | Safari Reader | macOS and iOS | 32 | | Maxthon | Maxthon | Maxthon Reader | Desktop | 33 | | Microsoft | Edge | EdgeHTML | Windows and Windows Mobile | 34 | | Microsoft | Edge Mobile | Chrome DOM Distiller | Android | 35 | | Google | Chrome | Chrome DOM Distiller | Android | 36 | | Postlight | Mercury Reader | Web Reader | Web / browser extension | 37 | | Instant Paper | Instapaper | Instaparser | Web / browser extension | 38 | | Mozilla | Pocket | Unknown | Web / browser extension | 39 | 40 | --- 41 | 42 | Ref: https://web.archive.org/web/20150817073201/http://lab.arc90.com/2009/03/02/readability/ 43 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= 2 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= 3 | golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= 4 | golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= 5 | golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= 6 | golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= 7 | golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= 8 | golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= 9 | golang.org/x/net 
v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= 10 | golang.org/x/net v0.8.0 h1:Zrh2ngAOFYneWTAIAPethzeaQLuHwhuBkuV6ZiRnUaQ= 11 | golang.org/x/net v0.8.0/go.mod h1:QVkue5JL9kW//ek3r6jTKnTFis1tRmNAW2P1shuFdJc= 12 | golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 13 | golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 14 | golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 15 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 16 | golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 17 | golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 18 | golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 19 | golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 20 | golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 21 | golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 22 | golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= 23 | golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= 24 | golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= 25 | golang.org/x/term v0.6.0/go.mod h1:m6U89DPEgQRMq3DNkDClhWw02AUbt2daBVO4cn4Hv9U= 26 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= 27 | golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= 28 | golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= 29 | golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= 30 | golang.org/x/text v0.8.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= 31 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 32 | golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= 33 | golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= 34 | golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= 35 | golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 36 | -------------------------------------------------------------------------------- /readability_test.go: -------------------------------------------------------------------------------- 1 | package readability 2 | 3 | import ( 4 | "fmt" 5 | "io/ioutil" 6 | "os" 7 | "path/filepath" 8 | "strings" 9 | "testing" 10 | 11 | "golang.org/x/net/html" 12 | ) 13 | 14 | func TestMaxElemsToParse(t *testing.T) { 15 | input := strings.NewReader(` 16 |
17 |lorem ipsum
21 | 22 | `) 23 | 24 | parser := New() 25 | parser.MaxElemsToParse = 3 26 | _, err := parser.Parse(input, "https://cixtor.com/blog") 27 | 28 | if err.Error() != "too many elements: 5" { 29 | t.Fatalf("expecting failure due to MaxElemsToParse: %s", err) 30 | } 31 | } 32 | 33 | func TestRemoveScripts(t *testing.T) { 34 | input := strings.NewReader(` 35 | 36 |lorem ipsum
41 | 42 | 45 | 46 | `) 47 | 48 | a, err := New().Parse(input, "https://cixtor.com/blog") 49 | 50 | if err != nil { 51 | t.Fatalf("parser failure: %s", err) 52 | } 53 | 54 | if a.TextContent != "lorem ipsum" { 55 | t.Fatalf("scripts were not removed: %s", a.TextContent) 56 | } 57 | } 58 | 59 | func getNodeExcerpt(node *html.Node) string { 60 | outer := outerHTML(node) 61 | outer = strings.Join(strings.Fields(outer), "\x20") 62 | if len(outer) < 500 { 63 | return outer 64 | } 65 | return outer[:500] 66 | } 67 | 68 | func errColorDiff(label string, a string, b string) error { 69 | coloredA := "" 70 | coloredB := "" 71 | for i := 0; i < len(a); i++ { 72 | if b[i] == a[i] { 73 | coloredA += a[i : i+1] 74 | coloredB += b[i : i+1] 75 | continue 76 | } 77 | coloredA += "\x1b[0;92m" + a[i:] + "\x1b[0m" 78 | coloredB += "\x1b[0;91m" + b[i:] + "\x1b[0m" 79 | break 80 | } 81 | return fmt.Errorf("%s\n- %s\n+ %s", label, coloredA, coloredB) 82 | } 83 | 84 | func compareArticleContent(result *html.Node, expected *html.Node) error { 85 | // Make sure number of nodes is same 86 | resultNodesCount := len(children(result)) 87 | expectedNodesCount := len(children(expected)) 88 | if resultNodesCount != expectedNodesCount { 89 | return fmt.Errorf( 90 | "number of nodes is different, want %d got %d", 91 | expectedNodesCount, 92 | resultNodesCount, 93 | ) 94 | } 95 | 96 | resultNode := result 97 | expectedNode := expected 98 | for resultNode != nil && expectedNode != nil { 99 | // Get node excerpt 100 | resultExcerpt := getNodeExcerpt(resultNode) 101 | expectedExcerpt := getNodeExcerpt(expectedNode) 102 | 103 | // Compare tag name 104 | resultTagName := tagName(resultNode) 105 | expectedTagName := tagName(expectedNode) 106 | if resultTagName != expectedTagName { 107 | return fmt.Errorf( 108 | "tag name is different\nwant: %s (%s)\ngot : %s (%s)", 109 | expectedTagName, 110 | expectedExcerpt, 111 | resultTagName, 112 | resultExcerpt, 113 | ) 114 | } 115 | 116 | // Compare attributes 117 | resultAttrCount := len(resultNode.Attr) 118 | expectedAttrCount := len(expectedNode.Attr) 119 | if resultAttrCount != expectedAttrCount { 120 | return fmt.Errorf( 121 | "number of attributes is different\nwant: %d (%s)\ngot : %d (%s)", 122 | expectedAttrCount, 123 | expectedExcerpt, 124 | resultAttrCount, 125 | resultExcerpt, 126 | ) 127 | } 128 | 129 | for _, resultAttr := range resultNode.Attr { 130 | expectedAttrVal := getAttribute(expectedNode, resultAttr.Key) 131 | switch resultAttr.Key { 132 | case "href", "src": 133 | resultAttr.Val = strings.TrimSuffix(resultAttr.Val, "/") 134 | expectedAttrVal = strings.TrimSuffix(expectedAttrVal, "/") 135 | } 136 | 137 | if resultAttr.Val != expectedAttrVal { 138 | return fmt.Errorf( 139 | "attribute %s is different\nwant: %s (%s)\ngot : %s (%s)", 140 | resultAttr.Key, 141 | expectedAttrVal, 142 | expectedExcerpt, 143 | resultAttr.Val, 144 | resultExcerpt, 145 | ) 146 | } 147 | } 148 | 149 | // Compare text content 150 | resultText := strings.TrimSpace(textContent(resultNode)) 151 | expectedText := strings.TrimSpace(textContent(expectedNode)) 152 | 153 | resultText = strings.Join(strings.Fields(resultText), "\x20") 154 | expectedText = strings.Join(strings.Fields(expectedText), "\x20") 155 | 156 | if resultText != expectedText { 157 | return errColorDiff( 158 | "text content is different", 159 | expectedExcerpt, 160 | resultExcerpt, 161 | ) 162 | } 163 | 164 | // Move to next node 165 | r := Readability{} 166 | resultNode = r.getNextNode(resultNode, false) 167 | expectedNode = 
r.getNextNode(expectedNode, false) 168 | } 169 | 170 | return nil 171 | } 172 | 173 | func TestParse(t *testing.T) { 174 | testDir := "scenarios" 175 | testItems, err := ioutil.ReadDir(testDir) 176 | if err != nil { 177 | t.Errorf("\nfailed to read test directory") 178 | } 179 | 180 | for _, item := range testItems { 181 | if !item.IsDir() { 182 | continue 183 | } 184 | 185 | t.Run(item.Name(), func(t1 *testing.T) { 186 | // Open test file 187 | testFilePath := filepath.Join(testDir, item.Name(), "source.html") 188 | testFile, err := os.Open(testFilePath) 189 | if err != nil { 190 | t1.Errorf("\nfailed to open test file") 191 | } 192 | defer testFile.Close() 193 | 194 | // Open expected result file 195 | expectedFilePath := filepath.Join(testDir, item.Name(), "expected.html") 196 | expectedFile, err := os.Open(expectedFilePath) 197 | if err != nil { 198 | t1.Errorf("\nfailed to open expected result file") 199 | } 200 | defer expectedFile.Close() 201 | 202 | // Parse expected result 203 | expectedHTML, err := html.Parse(expectedFile) 204 | if err != nil { 205 | t1.Errorf("\nfailed to parse expected result file") 206 | } 207 | 208 | // Get article from test file 209 | resultArticle, err := New().Parse(testFile, "http://fakehost/test/page.html") 210 | if err != nil { 211 | t1.Errorf("\nfailed to parse test file") 212 | } 213 | 214 | // Parse article into HTML 215 | resultHTML, err := html.Parse(strings.NewReader(resultArticle.Content)) 216 | if err != nil { 217 | t1.Errorf("\nfailed to parse test article into HTML") 218 | } 219 | 220 | // Compare article 221 | err = compareArticleContent(resultHTML, expectedHTML) 222 | if err != nil { 223 | t1.Errorf("\n%v", err) 224 | } 225 | }) 226 | } 227 | } 228 | -------------------------------------------------------------------------------- /helpers.go: -------------------------------------------------------------------------------- 1 | package readability 2 | 3 | import ( 4 | "bytes" 5 | "net/url" 6 | "strings" 7 | 8 | "golang.org/x/net/html" 9 | ) 10 | 11 | // firstElementChild returns the object's first child Element, or nil if there 12 | // are no child elements. 13 | func firstElementChild(node *html.Node) *html.Node { 14 | for child := node.FirstChild; child != nil; child = child.NextSibling { 15 | if child.Type == html.ElementNode { 16 | return child 17 | } 18 | } 19 | 20 | return nil 21 | } 22 | 23 | // nextElementSibling returns the Element immediately following the specified 24 | // one in its parent's children list, or nil if the specified Element is the 25 | // last one in the list. 26 | func nextElementSibling(node *html.Node) *html.Node { 27 | for sibling := node.NextSibling; sibling != nil; sibling = sibling.NextSibling { 28 | if sibling.Type == html.ElementNode { 29 | return sibling 30 | } 31 | } 32 | 33 | return nil 34 | } 35 | 36 | // appendChild adds a node to the end of the list of children of a specified 37 | // parent node. If the given child is a reference to an existing node in the 38 | // document, appendChild moves it from its current position to the new position 39 | // (there is no requirement to remove the node from its parent node before 40 | // appending it to some other node). 
41 | // 42 | // See: https://developer.mozilla.org/en-US/docs/Web/API/Node/appendChild 43 | func appendChild(node *html.Node, child *html.Node) { 44 | if child.Parent != nil { 45 | temp := cloneNode(child) 46 | node.AppendChild(temp) 47 | child.Parent.RemoveChild(child) 48 | return 49 | } 50 | 51 | node.AppendChild(child) 52 | } 53 | 54 | // childNodes returns list of a node's direct children. 55 | func childNodes(node *html.Node) []*html.Node { 56 | var list []*html.Node 57 | 58 | for c := node.FirstChild; c != nil; c = c.NextSibling { 59 | list = append(list, c) 60 | } 61 | 62 | return list 63 | } 64 | 65 | // includeNode determines if node is included inside nodeList. 66 | func includeNode(nodeList []*html.Node, node *html.Node) bool { 67 | for i := 0; i < len(nodeList); i++ { 68 | if nodeList[i] == node { 69 | return true 70 | } 71 | } 72 | 73 | return false 74 | } 75 | 76 | // cloneNode returns a duplicate of the node on which this method was called. 77 | // 78 | // See: https://developer.mozilla.org/en-US/docs/Web/API/Node/cloneNode 79 | func cloneNode(node *html.Node) *html.Node { 80 | clone := &html.Node{ 81 | Type: node.Type, 82 | DataAtom: node.DataAtom, 83 | Data: node.Data, 84 | Attr: make([]html.Attribute, len(node.Attr)), 85 | } 86 | 87 | copy(clone.Attr, node.Attr) 88 | 89 | for c := node.FirstChild; c != nil; c = c.NextSibling { 90 | clone.AppendChild(cloneNode(c)) 91 | } 92 | 93 | return clone 94 | } 95 | 96 | // createElement creates the HTML element specified by tagName. 97 | // 98 | // See: https://developer.mozilla.org/en-US/docs/Web/API/Document/createElement 99 | func createElement(tagName string) *html.Node { 100 | return &html.Node{Type: html.ElementNode, Data: tagName} 101 | } 102 | 103 | // createTextNode creates a new Text node. 104 | func createTextNode(data string) *html.Node { 105 | return &html.Node{Type: html.TextNode, Data: data} 106 | } 107 | 108 | // getElementsByTagName returns a collection of HTML elements with the given 109 | // tag name. If tag name is an asterisk, a list of all the available HTML nodes 110 | // will be returned instead. 111 | // 112 | // See: https://developer.mozilla.org/en-US/docs/Web/API/Document/getElementsByTagName 113 | func getElementsByTagName(node *html.Node, tag string) []*html.Node { 114 | var lst []*html.Node 115 | var fun func(*html.Node) 116 | 117 | fun = func(n *html.Node) { 118 | if n.Type == html.ElementNode && (tag == "*" || n.Data == tag) { 119 | lst = append(lst, n) 120 | } 121 | 122 | for c := n.FirstChild; c != nil; c = c.NextSibling { 123 | fun(c) 124 | } 125 | } 126 | 127 | fun(node) 128 | 129 | return lst 130 | } 131 | 132 | // getAttribute returns the value of a specified attribute on the element. If 133 | // the given attribute does not exist, the function returns an empty string. 134 | func getAttribute(node *html.Node, attrName string) string { 135 | for i := 0; i < len(node.Attr); i++ { 136 | if node.Attr[i].Key == attrName { 137 | return node.Attr[i].Val 138 | } 139 | } 140 | 141 | return "" 142 | } 143 | 144 | // setAttribute sets attribute for node. If attribute already exists, it will 145 | // be replaced. 
146 | func setAttribute(node *html.Node, attrName string, attrValue string) { 147 | attrIdx := -1 148 | 149 | for i := 0; i < len(node.Attr); i++ { 150 | if node.Attr[i].Key == attrName { 151 | attrIdx = i 152 | break 153 | } 154 | } 155 | 156 | if attrIdx >= 0 { 157 | node.Attr[attrIdx].Val = attrValue 158 | return 159 | } 160 | 161 | node.Attr = append(node.Attr, html.Attribute{ 162 | Key: attrName, 163 | Val: attrValue, 164 | }) 165 | } 166 | 167 | // removeAttribute removes attribute with given name. 168 | func removeAttribute(node *html.Node, attrName string) { 169 | attrIdx := -1 170 | 171 | for i := 0; i < len(node.Attr); i++ { 172 | if node.Attr[i].Key == attrName { 173 | attrIdx = i 174 | break 175 | } 176 | } 177 | 178 | if attrIdx >= 0 { 179 | a := node.Attr 180 | a = append(a[:attrIdx], a[attrIdx+1:]...) 181 | node.Attr = a 182 | } 183 | } 184 | 185 | // hasAttribute returns a Boolean value indicating whether the specified node 186 | // has the specified attribute or not. 187 | func hasAttribute(node *html.Node, attrName string) bool { 188 | for i := 0; i < len(node.Attr); i++ { 189 | if node.Attr[i].Key == attrName { 190 | return true 191 | } 192 | } 193 | 194 | return false 195 | } 196 | 197 | // outerHTML returns an HTML serialization of the element and its descendants. 198 | func outerHTML(node *html.Node) string { 199 | var buffer bytes.Buffer 200 | 201 | if err := html.Render(&buffer, node); err != nil { 202 | return "" 203 | } 204 | 205 | return buffer.String() 206 | } 207 | 208 | // innerHTML returns the HTML content (inner HTML) of an element. 209 | func innerHTML(node *html.Node) string { 210 | var err error 211 | var buffer bytes.Buffer 212 | 213 | for child := node.FirstChild; child != nil; child = child.NextSibling { 214 | if err = html.Render(&buffer, child); err != nil { 215 | return "" 216 | } 217 | } 218 | 219 | return strings.TrimSpace(buffer.String()) 220 | } 221 | 222 | // documentElement returns the root element of the document. 223 | func documentElement(doc *html.Node) *html.Node { 224 | nodes := getElementsByTagName(doc, "html") 225 | 226 | if len(nodes) > 0 { 227 | return nodes[0] 228 | } 229 | 230 | return nil 231 | } 232 | 233 | // className returns the value of the class attribute of the element. 234 | func className(node *html.Node) string { 235 | className := getAttribute(node, "class") 236 | className = strings.TrimSpace(className) 237 | className = rxNormalize.ReplaceAllString(className, "\x20") 238 | return className 239 | } 240 | 241 | // id returns the value of the id attribute of the specified element. 242 | func id(node *html.Node) string { 243 | id := getAttribute(node, "id") 244 | id = strings.TrimSpace(id) 245 | return id 246 | } 247 | 248 | // children returns an HTMLCollection of the child elements of Node. 249 | func children(node *html.Node) []*html.Node { 250 | var children []*html.Node 251 | 252 | if node == nil { 253 | return nil 254 | } 255 | 256 | for child := node.FirstChild; child != nil; child = child.NextSibling { 257 | if child.Type == html.ElementNode { 258 | children = append(children, child) 259 | } 260 | } 261 | 262 | return children 263 | } 264 | 265 | // wordCount returns number of word in str. 266 | func wordCount(str string) int { 267 | return len(strings.Fields(str)) 268 | } 269 | 270 | // indexOf returns the first index at which a given element can be found in the 271 | // array, or -1 if it is not present. 
272 | func indexOf(array []string, key string) int { 273 | for idx, val := range array { 274 | if val == key { 275 | return idx 276 | } 277 | } 278 | 279 | return -1 280 | } 281 | 282 | // replaceNode replaces a child node within the given (parent) node. 283 | // 284 | // See: https://developer.mozilla.org/en-US/docs/Web/API/Node/replaceChild 285 | func replaceNode(oldNode *html.Node, newNode *html.Node) { 286 | if oldNode.Parent == nil { 287 | return 288 | } 289 | 290 | newNode.Parent = nil 291 | newNode.PrevSibling = nil 292 | newNode.NextSibling = nil 293 | oldNode.Parent.InsertBefore(newNode, oldNode) 294 | oldNode.Parent.RemoveChild(oldNode) 295 | } 296 | 297 | // tagName returns the tag name of the element on which it’s called. 298 | // 299 | // For example, if the element is an.
424 | // Whitespace between
elements is ignored. For example:
425 | //
426 | //
foo
bar
abc
437 | replaced := false
438 |
439 | // If we find a
chain, remove the
nodes until we hit another
440 | // element or non-whitespace. This leaves behind the first
in the
441 | // chain (which will be replaced with a
later).
442 | for {
443 | next = r.nextElement(next)
444 |
445 | if next == nil || tagName(next) != "br" {
446 | break
447 | }
448 |
449 | replaced = true
450 | brSibling := next.NextSibling
451 | next.Parent.RemoveChild(next)
452 | next = brSibling
453 | }
454 |
455 | // If we removed a
chain, replace the remaining
with a
.
456 | // Add all sibling nodes as children of the
until we hit another
457 | //
chain.
458 | if replaced {
459 | p := createElement("p")
460 | replaceNode(br, p)
461 |
462 | next = p.NextSibling
463 | for next != nil {
464 | // If we have hit another
, we are done adding children
465 | // to this
.
466 | if tagName(next) == "br" {
467 | nextElem := r.nextElement(next.NextSibling)
468 | if nextElem != nil && tagName(nextElem) == "br" {
469 | break
470 | }
471 | }
472 |
473 | if !r.isPhrasingContent(next) {
474 | break
475 | }
476 |
477 | // Otherwise, make this node a child of the new
.
478 | sibling := next.NextSibling
479 | appendChild(p, next)
480 | next = sibling
481 | }
482 |
483 | for p.LastChild != nil && r.isWhitespace(p.LastChild) {
484 | p.RemoveChild(p.LastChild)
485 | }
486 |
487 | if tagName(p.Parent) == "p" {
488 | r.setNodeTag(p.Parent, "div")
489 | }
490 | }
491 | })
492 | }
493 |
494 | func (r *Readability) setNodeTag(node *html.Node, newTagName string) {
495 | if node.Type == html.ElementNode {
496 | node.Data = newTagName
497 | }
498 |
499 | // NOTES(cixtor): the original function in Readability.js is a bit longer
500 | // because it contains a fallback mechanism to set the node tag name just
501 | // in case JSDOMParser is not available, there is no need to implement this
502 | // here.
503 | }
504 |
505 | // getArticleMetadata attempts to get excerpt and byline metadata for the article.
506 | func (r *Readability) getArticleMetadata() Article {
507 | values := make(map[string]string)
508 | metaElements := getElementsByTagName(r.doc, "meta")
509 |
510 | // Find description tags.
511 | r.forEachNode(metaElements, func(element *html.Node, _ int) {
512 | elementName := getAttribute(element, "name")
513 | elementProperty := getAttribute(element, "property")
514 | content := getAttribute(element, "content")
515 | if content == "" {
516 | return
517 | }
518 | matches := []string{}
519 | name := ""
520 |
521 | if elementProperty != "" {
522 | matches = rxPropertyPattern.FindAllString(elementProperty, -1)
523 | for i := len(matches) - 1; i >= 0; i-- {
524 | // Convert to lowercase, and remove any whitespace
525 | // so we can match below.
526 | name = strings.ToLower(matches[i])
527 | name = strings.Join(strings.Fields(name), "")
528 | // multiple authors
529 | values[name] = strings.TrimSpace(content)
530 | }
531 | }
532 |
533 | if len(matches) == 0 && elementName != "" && rxNamePattern.MatchString(elementName) {
534 | // Convert to lowercase, remove any whitespace, and convert
535 | // dots to colons so we can match below.
536 | name = strings.ToLower(elementName) 537 | name = strings.Join(strings.Fields(name), "") 538 | name = strings.Replace(name, ".", ":", -1) 539 | values[name] = strings.TrimSpace(content) 540 | } 541 | }) 542 | 543 | // get title 544 | metadataTitle := "" 545 | for _, name := range []string{ 546 | "dc:title", 547 | "dcterm:title", 548 | "og:title", 549 | "weibo:article:title", 550 | "weibo:webpage:title", 551 | "title", 552 | "twitter:title", 553 | } { 554 | if value, ok := values[name]; ok { 555 | metadataTitle = value 556 | break 557 | } 558 | } 559 | 560 | if metadataTitle == "" { 561 | metadataTitle = r.getArticleTitle() 562 | } 563 | 564 | // get author 565 | metadataByline := "" 566 | for _, name := range []string{ 567 | "dc:creator", 568 | "dcterm:creator", 569 | "author", 570 | } { 571 | if value, ok := values[name]; ok { 572 | metadataByline = value 573 | break 574 | } 575 | } 576 | 577 | // get description 578 | metadataExcerpt := "" 579 | for _, name := range []string{ 580 | "dc:description", 581 | "dcterm:description", 582 | "og:description", 583 | "weibo:article:description", 584 | "weibo:webpage:description", 585 | "description", 586 | "twitter:description", 587 | } { 588 | if value, ok := values[name]; ok { 589 | metadataExcerpt = value 590 | break 591 | } 592 | } 593 | 594 | // get site name 595 | metadataSiteName := values["og:site_name"] 596 | 597 | // get image thumbnail 598 | metadataImage := "" 599 | for _, name := range []string{ 600 | "og:image", 601 | "image", 602 | "twitter:image", 603 | } { 604 | if value, ok := values[name]; ok { 605 | metadataImage = toAbsoluteURI(value, r.documentURI) 606 | break 607 | } 608 | } 609 | 610 | // get favicon 611 | metadataFavicon := r.getArticleFavicon() 612 | 613 | return Article{ 614 | Title: metadataTitle, 615 | Byline: metadataByline, 616 | Excerpt: metadataExcerpt, 617 | SiteName: metadataSiteName, 618 | Image: metadataImage, 619 | Favicon: metadataFavicon, 620 | } 621 | } 622 | 623 | // prepArticle prepares the article Node for display cleaning out any inline 624 | // CSS styles, iframes, forms and stripping extraneous paragraph tags
. 625 | func (r *Readability) prepArticle(articleContent *html.Node) { 626 | r.cleanStyles(articleContent) 627 | 628 | // Check for data tables before we continue, to avoid removing 629 | // items in those tables, which will often be isolated even 630 | // though they're visually linked to other content-ful elements 631 | // (text, images, etc.). 632 | r.markDataTables(articleContent) 633 | 634 | // Clean out junk from the article content 635 | r.cleanConditionally(articleContent, "form") 636 | r.cleanConditionally(articleContent, "fieldset") 637 | r.clean(articleContent, "object") 638 | r.clean(articleContent, "embed") 639 | r.clean(articleContent, "footer") 640 | r.clean(articleContent, "link") 641 | r.clean(articleContent, "aside") 642 | 643 | // Clean out elements have "share" in their id/class combinations 644 | // from final top candidates, which means we don't remove the top 645 | // candidates even they have "share". 646 | r.forEachNode(children(articleContent), func(topCandidate *html.Node, _ int) { 647 | r.cleanMatchedNodes(topCandidate, func(node *html.Node, nodeClassID string) bool { 648 | return rxShare.MatchString(nodeClassID) && len(textContent(node)) < r.CharThresholds 649 | }) 650 | }) 651 | 652 | // If there is only one h2 and its text content substantially 653 | // equals article title, they are probably using it as a header 654 | // and not a subheader, so remove it since we already extract 655 | // the title separately. 656 | if h2s := getElementsByTagName(articleContent, "h2"); len(h2s) == 1 { 657 | h2 := h2s[0] 658 | h2Text := textContent(h2) 659 | lengthSimilarRate := float64(len(h2Text)-len(r.articleTitle)) / float64(len(r.articleTitle)) 660 | 661 | if math.Abs(lengthSimilarRate) < 0.5 { 662 | titlesMatch := false 663 | 664 | if lengthSimilarRate > 0 { 665 | titlesMatch = strings.Contains(h2Text, r.articleTitle) 666 | } else { 667 | titlesMatch = strings.Contains(r.articleTitle, h2Text) 668 | } 669 | 670 | if titlesMatch { 671 | r.clean(articleContent, "h2") 672 | } 673 | } 674 | } 675 | 676 | r.clean(articleContent, "iframe") 677 | r.clean(articleContent, "input") 678 | r.clean(articleContent, "textarea") 679 | r.clean(articleContent, "select") 680 | r.clean(articleContent, "button") 681 | r.cleanHeaders(articleContent) 682 | 683 | // Do these last as the previous stuff may have removed junk 684 | // that will affect these 685 | r.cleanConditionally(articleContent, "table") 686 | r.cleanConditionally(articleContent, "ul") 687 | r.cleanConditionally(articleContent, "div") 688 | 689 | // Remove extra paragraphs 690 | r.removeNodes(getElementsByTagName(articleContent, "p"), func(p *html.Node) bool { 691 | imgCount := len(getElementsByTagName(p, "img")) 692 | embedCount := len(getElementsByTagName(p, "embed")) 693 | objectCount := len(getElementsByTagName(p, "object")) 694 | 695 | // Nasty iframes have been removed, only remain embedded videos. 
696 | iframeCount := len(getElementsByTagName(p, "iframe")) 697 | totalCount := imgCount + embedCount + objectCount + iframeCount 698 | 699 | return totalCount == 0 && r.getInnerText(p, false) == "" 700 | }) 701 | 702 | r.forEachNode(getElementsByTagName(articleContent, "br"), func(br *html.Node, _ int) { 703 | next := r.nextElement(br.NextSibling) 704 | 705 | if next != nil && tagName(next) == "p" { 706 | br.Parent.RemoveChild(br) 707 | } 708 | }) 709 | 710 | // Remove single-cell tables 711 | r.forEachNode(getElementsByTagName(articleContent, "table"), func(table *html.Node, _ int) { 712 | tbody := table 713 | 714 | if r.hasSingleTagInsideElement(table, "tbody") { 715 | tbody = firstElementChild(table) 716 | } 717 | 718 | if r.hasSingleTagInsideElement(tbody, "tr") { 719 | row := firstElementChild(tbody) 720 | 721 | if r.hasSingleTagInsideElement(row, "td") { 722 | cell := firstElementChild(row) 723 | 724 | newTag := "div" 725 | 726 | if r.everyNode(childNodes(cell), r.isPhrasingContent) { 727 | newTag = "p" 728 | } 729 | 730 | r.setNodeTag(cell, newTag) 731 | 732 | replaceNode(table, cell) 733 | } 734 | } 735 | }) 736 | } 737 | 738 | // grabArticle uses a variety of metrics (content score, classname, element 739 | // types), find the content that is most likely to be the stuff a user wants to 740 | // read. Then return it wrapped up in a div. 741 | func (r *Readability) grabArticle() *html.Node { 742 | for { 743 | doc := cloneNode(r.doc) 744 | 745 | var page *html.Node 746 | if nodes := getElementsByTagName(doc, "body"); len(nodes) > 0 { 747 | page = nodes[0] 748 | } 749 | 750 | // We can not grab an article if we do not have a page. 751 | if page == nil { 752 | return nil 753 | } 754 | 755 | // First, node prepping. Trash nodes that look cruddy (like ones with 756 | // the class name "comment", etc), and turn divs into P tags where they 757 | // have been used inappropriately (as in, where they contain no other 758 | // block level elements). 759 | var elementsToScore []*html.Node 760 | var node = documentElement(doc) 761 | 762 | for node != nil { 763 | matchString := className(node) + "\x20" + id(node) 764 | 765 | if !r.isProbablyVisible(node) { 766 | node = r.removeAndGetNext(node) 767 | continue 768 | } 769 | 770 | // Remove Node if it is a Byline. 771 | if r.checkByline(node, matchString) { 772 | node = r.removeAndGetNext(node) 773 | continue 774 | } 775 | 776 | // Remove unlikely candidates. 777 | nodeTagName := tagName(node) 778 | if r.flags.stripUnlikelys { 779 | if rxUnlikelyCandidates.MatchString(matchString) && 780 | !rxOkMaybeItsACandidate.MatchString(matchString) && 781 | !r.hasAncestorTag(node, "table", 3, nil) && 782 | nodeTagName != "body" && 783 | nodeTagName != "a" { 784 | node = r.removeAndGetNext(node) 785 | continue 786 | } 787 | } 788 | 789 | // Remove DIV, SECTION and HEADER nodes without any content. 790 | switch nodeTagName { 791 | case "div", 792 | "section", 793 | "header", 794 | "h1", 795 | "h2", 796 | "h3", 797 | "h4", 798 | "h5", 799 | "h6": 800 | if r.isElementWithoutContent(node) { 801 | node = r.removeAndGetNext(node) 802 | continue 803 | } 804 | } 805 | 806 | if indexOf(r.TagsToScore, nodeTagName) != -1 { 807 | elementsToScore = append(elementsToScore, node) 808 | } 809 | 810 | // Convert
elements that do not contain other block-level elements into
elements.
811 | if nodeTagName == "div" {
812 | // Put phrasing content into paragraphs.
813 | var p *html.Node
814 | childNode := node.FirstChild
815 |
816 | for childNode != nil {
817 | nextSibling := childNode.NextSibling
818 |
819 | if r.isPhrasingContent(childNode) {
820 | if p != nil {
821 | appendChild(p, childNode)
822 | } else if !r.isWhitespace(childNode) {
823 | p = createElement("p")
824 | appendChild(p, cloneNode(childNode))
825 | replaceNode(childNode, p)
826 | }
827 | } else if p != nil {
828 | for p.LastChild != nil && r.isWhitespace(p.LastChild) {
829 | p.RemoveChild(p.LastChild)
830 | }
831 | p = nil
832 | }
833 |
834 | childNode = nextSibling
835 | }
836 |
837 | // Sites like http://mobile.slate.com encloses each paragraph
838 | // with a DIV element. DIVs with only a P element inside and no
839 | // text content can be safely converted into plain P elements to
840 | // avoid confusing the scoring algorithm with DIVs with are, in
841 | // practice, paragraphs.
842 | if r.hasSingleTagInsideElement(node, "p") && r.getLinkDensity(node) < 0.25 {
843 | newNode := children(node)[0]
844 | replaceNode(node, newNode)
845 | node = newNode
846 | elementsToScore = append(elementsToScore, node)
847 | } else if !r.hasChildBlockElement(node) {
848 | r.setNodeTag(node, "p")
849 | elementsToScore = append(elementsToScore, node)
850 | }
851 | }
852 |
853 | node = r.getNextNode(node, false)
854 | }
855 |
856 | // Loop through all paragraphs and assign a score to them based on how
857 | // much relevant content they have. Then add their score to their parent
858 | // node. A score is determined by things like number of commas, class
859 | // names, etc. Maybe eventually link density.
860 | var candidates []*html.Node
861 | r.forEachNode(elementsToScore, func(elementToScore *html.Node, _ int) {
862 | if elementToScore.Parent == nil || tagName(elementToScore.Parent) == "" {
863 | return
864 | }
865 |
866 | // If this paragraph is less than 25 characters, don't even count it.
867 | innerText := r.getInnerText(elementToScore, true)
868 | if len(innerText) < 25 {
869 | return
870 | }
871 |
872 | // Exclude nodes with no ancestor.
873 | ancestors := r.getNodeAncestors(elementToScore, 3)
874 | if len(ancestors) == 0 {
875 | return
876 | }
877 |
878 | // Add a point for the paragraph itself as a base.
879 | contentScore := 1
880 |
881 | // Add points for any commas within this paragraph.
882 | contentScore += strings.Count(innerText, ",")
883 |
884 | // For every 100 characters in this paragraph, add another point. Up to 3 points.
885 | contentScore += int(math.Min(math.Floor(float64(len(innerText))/100.0), 3.0))
886 |
887 | // Initialize and score ancestors.
888 | r.forEachNode(ancestors, func(ancestor *html.Node, level int) {
889 | if tagName(ancestor) == "" || ancestor.Parent == nil || ancestor.Parent.Type != html.ElementNode {
890 | return
891 | }
892 |
893 | if !r.hasContentScore(ancestor) {
894 | r.initializeNode(ancestor)
895 | candidates = append(candidates, ancestor)
896 | }
897 |
898 | // Node score divider:
899 | // - parent: 1 (no division)
900 | // - grandparent: 2
901 | // - great grandparent+: ancestor level * 3
902 | scoreDivider := 1
903 | switch level {
904 | case 0:
905 | scoreDivider = 1
906 | case 1:
907 | scoreDivider = 2
908 | default:
909 | scoreDivider = level * 3
910 | }
911 |
912 | ancestorScore := r.getContentScore(ancestor)
913 | ancestorScore += float64(contentScore) / float64(scoreDivider)
914 |
915 | r.setContentScore(ancestor, ancestorScore)
916 | })
917 | })
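// For illustration only (not part of the original source), a minimal sketch of the
// scoring rule applied above, assuming a
with 250 characters and two commas:
// base 1 + 2 commas + min(floor(250/100), 3) = 5 points.
// The parent receives 5/1, the grandparent 5/2, and deeper ancestors 5/(level*3).
//
// score := 1 + strings.Count(text, ",") + int(math.Min(math.Floor(float64(len(text))/100.0), 3.0))
// parentGain, grandparentGain := float64(score), float64(score)/2.0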
918 |
919 | // These lines are a bit different compared to Readability.js.
920 | //
921 | // In Readability.js, they fetch NTopCandidates utilising array method
922 | // like `splice` and `pop`. In Go, array method like that is not as
923 | // simple, especially since we are working with pointer. So, here we
924 | // simply sort top candidates, and limit it to max NTopCandidates.
925 |
926 | // Scale the final candidates score based on link density. Good
927 | // content should have a relatively small link density (5% or
928 | // less) and be mostly unaffected by this operation.
929 | for i := 0; i < len(candidates); i++ {
930 | candidate := candidates[i]
931 | candidateScore := r.getContentScore(candidate) * (1 - r.getLinkDensity(candidate))
932 | r.setContentScore(candidate, candidateScore)
933 | }
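// As a rough worked example (illustrative, not from the source): a candidate with a
// raw score of 40 and a link density of 0.10 keeps 40 * (1 - 0.10) = 36 points,
// while the same node with 60% of its text inside links drops to 40 * 0.4 = 16.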
934 |
935 | // After we have calculated scores, sort through all of the possible
936 | // candidate nodes we found and find the one with the highest score.
937 | sort.Slice(candidates, func(i int, j int) bool {
938 | return r.getContentScore(candidates[i]) > r.getContentScore(candidates[j])
939 | })
940 |
941 | var topCandidates []*html.Node
942 |
943 | if len(candidates) > r.NTopCandidates {
944 | topCandidates = candidates[:r.NTopCandidates]
945 | } else {
946 | topCandidates = candidates
947 | }
948 |
949 | var topCandidate, parentOfTopCandidate *html.Node
950 | neededToCreateTopCandidate := false
951 | if len(topCandidates) > 0 {
952 | topCandidate = topCandidates[0]
953 | }
954 |
955 | // If we still have no top candidate, just use the body as a last
956 | // resort. We also have to copy the body node so it is something
957 | // we can modify.
958 | if topCandidate == nil || tagName(topCandidate) == "body" {
959 | // Move all of the page's children into topCandidate
960 | topCandidate = createElement("div")
961 | neededToCreateTopCandidate = true
962 | // Move everything (not just elements, also text nodes etc.)
963 | // into the container so we even include text directly in the body:
964 | kids := childNodes(page)
965 | for i := 0; i < len(kids); i++ {
966 | appendChild(topCandidate, kids[i])
967 | }
968 |
969 | appendChild(page, topCandidate)
970 | r.initializeNode(topCandidate)
971 | } else if topCandidate != nil {
972 | // Find a better top candidate node if it contains (at least three)
973 | // nodes which belong to `topCandidates` array and whose scores are
974 | // quite close to the current `topCandidate` node.
975 | topCandidateScore := r.getContentScore(topCandidate)
976 | var alternativeCandidateAncestors [][]*html.Node
977 | for i := 1; i < len(topCandidates); i++ {
978 | if r.getContentScore(topCandidates[i])/topCandidateScore >= 0.75 {
979 | topCandidateAncestors := r.getNodeAncestors(topCandidates[i], 0)
980 | alternativeCandidateAncestors = append(alternativeCandidateAncestors, topCandidateAncestors)
981 | }
982 | }
983 |
984 | minimumTopCandidates := 3
985 | if len(alternativeCandidateAncestors) >= minimumTopCandidates {
986 | parentOfTopCandidate = topCandidate.Parent
987 | for parentOfTopCandidate != nil && tagName(parentOfTopCandidate) != "body" {
988 | listContainingThisAncestor := 0
989 | for ancestorIndex := 0; ancestorIndex < len(alternativeCandidateAncestors) && listContainingThisAncestor < minimumTopCandidates; ancestorIndex++ {
990 | if includeNode(alternativeCandidateAncestors[ancestorIndex], parentOfTopCandidate) {
991 | listContainingThisAncestor++
992 | }
993 | }
994 |
995 | if listContainingThisAncestor >= minimumTopCandidates {
996 | topCandidate = parentOfTopCandidate
997 | break
998 | }
999 |
1000 | parentOfTopCandidate = parentOfTopCandidate.Parent
1001 | }
1002 | }
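// Hedged sketch of the idea above, under assumed scores: if the runners-up score at
// least 75% of the top candidate (e.g. 30, 28, 27 against 35) and at least three of
// them share a common ancestor below, that ancestor becomes the new top
// candidate, so content split across sibling containers is unified.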
1003 |
1004 | if !r.hasContentScore(topCandidate) {
1005 | r.initializeNode(topCandidate)
1006 | }
1007 |
1008 | // Because of our bonus system, parents of candidates might
1009 | // have scores themselves. They get half of the node's score. There
1010 | // won't be nodes with higher scores than our topCandidate,
1011 | // but if we see the score going *up* in the first few steps
1012 | // up the tree, that's a decent sign that there might be more
1013 | // content lurking in other places that we want to unify in.
1014 | // The sibling stuff below does some of that - but only if
1015 | // we've looked high enough up the DOM tree.
1016 | parentOfTopCandidate = topCandidate.Parent
1017 | lastScore := r.getContentScore(topCandidate)
1018 | // The scores shouldn't get too low.
1019 | scoreThreshold := lastScore / 3.0
1020 | for parentOfTopCandidate != nil && tagName(parentOfTopCandidate) != "body" {
1021 | if !r.hasContentScore(parentOfTopCandidate) {
1022 | parentOfTopCandidate = parentOfTopCandidate.Parent
1023 | continue
1024 | }
1025 |
1026 | parentScore := r.getContentScore(parentOfTopCandidate)
1027 | if parentScore < scoreThreshold {
1028 | break
1029 | }
1030 |
1031 | if parentScore > lastScore {
1032 | // Alright! We found a better parent to use.
1033 | topCandidate = parentOfTopCandidate
1034 | break
1035 | }
1036 |
1037 | lastScore = parentScore
1038 | parentOfTopCandidate = parentOfTopCandidate.Parent
1039 | }
1040 |
1041 | // If the top candidate is the only child, use parent
1042 | // instead. This will help sibling joining logic when
1043 | // adjacent content is actually located in parent's
1044 | // sibling node.
1045 | parentOfTopCandidate = topCandidate.Parent
1046 | for parentOfTopCandidate != nil && tagName(parentOfTopCandidate) != "body" && len(children(parentOfTopCandidate)) == 1 {
1047 | topCandidate = parentOfTopCandidate
1048 | parentOfTopCandidate = topCandidate.Parent
1049 | }
1050 |
1051 | if !r.hasContentScore(topCandidate) {
1052 | r.initializeNode(topCandidate)
1053 | }
1054 | }
1055 |
1056 | // Now that we have the top candidate, look through its siblings
1057 | // for content that might also be related. Things like preambles,
1058 | // content split by ads that we removed, etc.
1059 | articleContent := createElement("div")
1060 | siblingScoreThreshold := math.Max(10, r.getContentScore(topCandidate)*0.2)
1061 |
1062 | // Keep potential top candidate's parent node to try to get text direction of it later.
1063 | topCandidateScore := r.getContentScore(topCandidate)
1064 | topCandidateClassName := className(topCandidate)
1065 |
1066 | parentOfTopCandidate = topCandidate.Parent
1067 | siblings := children(parentOfTopCandidate)
1068 | for s := 0; s < len(siblings); s++ {
1069 | sibling := siblings[s]
1070 | appendNode := false
1071 |
1072 | if sibling == topCandidate {
1073 | appendNode = true
1074 | } else {
1075 | contentBonus := float64(0)
1076 |
1077 | // Give a bonus if sibling nodes and top candidates have the same classname
1078 | if className(sibling) == topCandidateClassName && topCandidateClassName != "" {
1079 | contentBonus += topCandidateScore * 0.2
1080 | }
1081 |
1082 | if r.hasContentScore(sibling) && r.getContentScore(sibling)+contentBonus >= siblingScoreThreshold {
1083 | appendNode = true
1084 | } else if tagName(sibling) == "p" {
1085 | linkDensity := r.getLinkDensity(sibling)
1086 | nodeContent := r.getInnerText(sibling, true)
1087 | nodeLength := len(nodeContent)
1088 |
1089 | if nodeLength > 80 && linkDensity < 0.25 {
1090 | appendNode = true
1091 | } else if nodeLength < 80 && nodeLength > 0 && linkDensity == 0 &&
1092 | rxSentencePeriod.MatchString(nodeContent) {
1093 | appendNode = true
1094 | }
1095 | }
1096 | }
1097 |
1098 | if appendNode {
1099 | // We have a node that is not a common block level element,
1100 | // like a FORM or TD tag. Turn it into a DIV so it does not get
1101 | // filtered out later by accident.
1102 | if indexOf(alterToDivExceptions, tagName(sibling)) == -1 {
1103 | r.setNodeTag(sibling, "div")
1104 | }
1105 |
1106 | appendChild(articleContent, sibling)
1107 | }
1108 | }
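// Worked example of the sibling threshold (assumed numbers, for illustration): with a
// top candidate scoring 50, siblingScoreThreshold = max(10, 50*0.2) = 10. A sibling
// scoring 8 with the same class name gains 50*0.2 = 10 bonus points, reaching 18 and
// getting appended; a plainof 100+ characters with link density below 0.25 is
// appended even without a score.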
1109 |
1110 | // So we have all of the content that we need. Now we clean
1111 | // it up for presentation.
1112 | r.prepArticle(articleContent)
1113 |
1114 | if neededToCreateTopCandidate {
1115 | // We already created a fake DIV thing, and there would not have
1116 | // been any siblings left for the previous loop, so there is no
1117 | // point trying to create a new DIV and then move all the children
1118 | // over. Just assign IDs and CSS class names here. No need to append
1119 | // because that already happened anyway.
1120 | //
1121 | // By the way, this line is different with Readability.js.
1122 | //
1123 | // In Readability.js, when using `appendChild`, the node is still
1124 | // referenced. Meanwhile here, our `appendChild` will clone the
1125 | // node, put it in the new place, then delete the original.
1126 | firstChild := firstElementChild(articleContent)
1127 | if firstChild != nil && tagName(firstChild) == "div" {
1128 | setAttribute(firstChild, "id", "readability-page-1")
1129 | setAttribute(firstChild, "class", "page")
1130 | }
1131 | } else {
1132 | div := createElement("div")
1133 |
1134 | setAttribute(div, "id", "readability-page-1")
1135 | setAttribute(div, "class", "page")
1136 |
1137 | childs := childNodes(articleContent)
1138 |
1139 | for i := 0; i < len(childs); i++ {
1140 | appendChild(div, childs[i])
1141 | }
1142 |
1143 | appendChild(articleContent, div)
1144 | }
1145 |
1146 | parseSuccessful := true
1147 |
1148 | // Now that we've gone through the full algorithm, check to see if we
1149 | // got any meaningful content. If we did not, we may need to re-run
1150 | // grabArticle with different flags set. This gives us a higher
1151 | // likelihood of finding the content, and the sieve approach gives us a
1152 | // higher likelihood of finding the -right- content.
1153 | textLength := len(r.getInnerText(articleContent, true))
1154 | if textLength < r.CharThresholds {
1155 | parseSuccessful = false
1156 |
1157 | if r.flags.stripUnlikelys {
1158 | r.flags.stripUnlikelys = false
1159 | r.attempts = append(r.attempts, parseAttempt{
1160 | articleContent: articleContent,
1161 | textLength: textLength,
1162 | })
1163 | } else if r.flags.useWeightClasses {
1164 | r.flags.useWeightClasses = false
1165 | r.attempts = append(r.attempts, parseAttempt{
1166 | articleContent: articleContent,
1167 | textLength: textLength,
1168 | })
1169 | } else if r.flags.cleanConditionally {
1170 | r.flags.cleanConditionally = false
1171 | r.attempts = append(r.attempts, parseAttempt{
1172 | articleContent: articleContent,
1173 | textLength: textLength,
1174 | })
1175 | } else {
1176 | r.attempts = append(r.attempts, parseAttempt{
1177 | articleContent: articleContent,
1178 | textLength: textLength,
1179 | })
1180 |
1181 | // No luck after removing flags, just return the
1182 | // longest text we found during the different loops.
1183 | sort.Slice(r.attempts, func(i, j int) bool {
1184 | return r.attempts[i].textLength > r.attempts[j].textLength
1185 | })
1186 |
1187 | // But first check if we actually have something
1188 | if r.attempts[0].textLength == 0 {
1189 | return nil
1190 | }
1191 |
1192 | articleContent = r.attempts[0].articleContent
1193 | parseSuccessful = true
1194 | }
1195 | }
1196 |
1197 | if parseSuccessful {
1198 | return articleContent
1199 | }
1200 | }
1201 | }
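// A condensed view of the retry loop above (sketch, not original code): each failed
// pass relaxes one flag before re-running, and the last resort returns the longest
// attempt recorded so far.
//
// for _, relax := range []func(){
// func() { r.flags.stripUnlikelys = false },
// func() { r.flags.useWeightClasses = false },
// func() { r.flags.cleanConditionally = false },
// } {
// if textLength >= r.CharThresholds { break }
// relax() // record the attempt, then loop again with looser rules
// }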
1202 |
1203 | // initializeNode initializes a node with the readability score. Also checks
1204 | // the className/id for special names to add to its score.
1205 | func (r *Readability) initializeNode(node *html.Node) {
1206 | contentScore := float64(r.getClassWeight(node))
1207 |
1208 | switch tagName(node) {
1209 | case "div":
1210 | contentScore += 5
1211 | case "pre", "td", "blockquote":
1212 | contentScore += 3
1213 | case "address", "ol", "ul", "dl", "dd", "dt", "li", "form":
1214 | contentScore -= 3
1215 | case "h1", "h2", "h3", "h4", "h5", "h6", "th":
1216 | contentScore -= 5
1217 | }
1218 |
1219 | r.setContentScore(node, contentScore)
1220 | }
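// For example (illustrative numbers): awith class="article-body" could start at
// +5 (div) plus a positive class weight, while an
inside it starts at -3, so
// container elements are favored as score accumulators over list markup.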
1221 |
1222 | // removeAndGetNext removes the node and returns its next node.
1223 | func (r *Readability) removeAndGetNext(node *html.Node) *html.Node {
1224 | nextNode := r.getNextNode(node, true)
1225 |
1226 | if node.Parent != nil {
1227 | node.Parent.RemoveChild(node)
1228 | }
1229 |
1230 | return nextNode
1231 | }
1232 |
1233 | // getNextNode traverses the DOM from node to node, starting at the node passed
1234 | // in. Pass true for the second parameter to indicate this node itself (and its
1235 | // kids) are going away, and we want the next node over. Calling this in a loop
1236 | // will traverse the DOM depth-first.
1237 | //
1238 | // In Readability.js, ignoreSelfAndKids defaults to false.
1239 | func (r *Readability) getNextNode(node *html.Node, ignoreSelfAndKids bool) *html.Node {
1240 | // First check for kids if those are not being ignored
1241 | if firstChild := firstElementChild(node); !ignoreSelfAndKids && firstChild != nil {
1242 | return firstChild
1243 | }
1244 |
1245 | // Then for siblings...
1246 | if sibling := nextElementSibling(node); sibling != nil {
1247 | return sibling
1248 | }
1249 |
1250 | // And finally, move up the parent chain *and* find a sibling
1251 | // (because this is depth-first traversal, we will have already
1252 | // seen the parent nodes themselves).
1253 | for {
1254 | node = node.Parent
1255 | if node == nil || nextElementSibling(node) != nil {
1256 | break
1257 | }
1258 | }
1259 |
1260 | if node != nil {
1261 | return nextElementSibling(node)
1262 | }
1263 |
1264 | return nil
1265 | }
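// Minimal usage sketch (assumes a parsed document `doc`; not part of the original file):
// walk every element depth-first, the same way grabArticle does.
//
// for node := documentElement(doc); node != nil; node = r.getNextNode(node, false) {
// _ = tagName(node) // visit node
// }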
1266 |
1267 | // isValidByline checks whether the input string could be a byline.
1268 | func (r *Readability) isValidByline(byline string) bool {
1269 | byline = strings.TrimSpace(byline)
1270 | return len(byline) > 0 && len(byline) < 100
1271 | }
1272 |
1273 | // checkByline determines if a node is used as byline.
1274 | func (r *Readability) checkByline(node *html.Node, matchString string) bool {
1275 | if r.articleByline != "" {
1276 | return false
1277 | }
1278 |
1279 | rel := getAttribute(node, "rel")
1280 | itemprop := getAttribute(node, "itemprop")
1281 | nodeText := textContent(node)
1282 | if (rel == "author" || strings.Contains(itemprop, "author") || rxByline.MatchString(matchString)) && r.isValidByline(nodeText) {
1283 | nodeText = strings.TrimSpace(nodeText)
1284 | nodeText = strings.Join(strings.Fields(nodeText), "\x20")
1285 | r.articleByline = nodeText
1286 | return true
1287 | }
1288 |
1289 | return false
1290 | }
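// For instance, markup such as the following is detected as a byline and removed from
// the article body (example input, not from the source):
//
//
// because rel="author" matches and the trimmed text is between 1 and 99 characters.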
1291 |
1292 | // getNodeAncestors gets the node's direct parent and grandparents.
1293 | //
1294 | // In Readability.js, maxDepth defaults to 0.
1295 | func (r *Readability) getNodeAncestors(node *html.Node, maxDepth int) []*html.Node {
1296 | level := 0
1297 | ancestors := []*html.Node{}
1298 |
1299 | for node.Parent != nil {
1300 | level++
1301 | ancestors = append(ancestors, node.Parent)
1302 |
1303 | if maxDepth > 0 && level == maxDepth {
1304 | break
1305 | }
1306 |
1307 | node = node.Parent
1308 | }
1309 |
1310 | return ancestors
1311 | }
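// Example (assumed structure): for apinside
,
// getNodeAncestors(p, 3) returns [div, article, body]; with maxDepth 0 it keeps
// climbing until the root.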
1312 |
1313 | // setContentScore sets the readability score for a node.
1314 | func (r *Readability) setContentScore(node *html.Node, score float64) {
1315 | setAttribute(node, "data-readability-score", fmt.Sprintf("%.4f", score))
1316 | }
1317 |
1318 | // hasContentScore checks if node has readability score.
1319 | func (r *Readability) hasContentScore(node *html.Node) bool {
1320 | return hasAttribute(node, "data-readability-score")
1321 | }
1322 |
1323 | // getContentScore gets the readability score of a node.
1324 | func (r *Readability) getContentScore(node *html.Node) float64 {
1325 | strScore := getAttribute(node, "data-readability-score")
1326 | strScore = strings.TrimSpace(strScore)
1327 |
1328 | if strScore == "" {
1329 | return 0
1330 | }
1331 |
1332 | score, err := strconv.ParseFloat(strScore, 64)
1333 |
1334 | if err != nil {
1335 | return 0
1336 | }
1337 |
1338 | return score
1339 | }
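// Sketch of the round trip (illustrative): scores live in a data attribute on the
// node itself, so they survive node moves and clones.
//
// r.setContentScore(node, 12.5) // writes data-readability-score="12.5000"
// _ = r.hasContentScore(node) // true
// _ = r.getContentScore(node) // 12.5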
1340 |
1341 | // removeScripts removes script tags from the document.
1342 | func (r *Readability) removeScripts(doc *html.Node) {
1343 | r.removeNodes(getElementsByTagName(doc, "script"), nil)
1344 | r.removeNodes(getElementsByTagName(doc, "noscript"), nil)
1345 | }
1346 |
1347 | // hasSingleTagInsideElement checks if the node has only whitespace and a single
1348 | // element with given tag. Returns false if the DIV Node contains non-empty text
1349 | // nodes or if it contains no element with given tag or more than 1 element.
1350 | func (r *Readability) hasSingleTagInsideElement(element *html.Node, tag string) bool {
1351 | // There should be exactly 1 element child with given tag
1352 | if childs := children(element); len(childs) != 1 || tagName(childs[0]) != tag {
1353 | return false
1354 | }
1355 |
1356 | // And there should be no text nodes with real content
1357 | return !r.someNode(childNodes(element), func(node *html.Node) bool {
1358 | return node.Type == html.TextNode && rxHasContent.MatchString(textContent(node))
1359 | })
1360 | }
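// Example (hypothetical input): hasSingleTagInsideElement returns true forfoo
bar
lorem
and false for
lorem
extra, because the
// stray text node and the second element both disqualify it.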
1361 |
1362 | // isElementWithoutContent determines if node is empty. A node is considered
1363 | // empty if there is nothing inside or if the only things inside are HTML break
1364 | // tags
and HTML horizontal rule tags
.