├── LICENSE
├── README.md
└── textextract.go

/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2018 Emir Uz

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# textextract

textextract is a tiny library (about a hundred lines of Go) that identifies where the article content is in an HTML page (as opposed to navigation, headers, footers, ads, etc.), extracts it and returns it as a string.

It is a tree-search-and-score algorithm with a very simple scoring rule, and it is surprisingly effective.

## What it's for

Use it when you're doing semantic analysis on crawled pages and need the article content to feed into some other process, such as a semantic extractor or classifier. It preserves the rendering order of the text, but it does not preserve white space.

## How it works

1. It parses the HTML into a node tree using the golang.org/x/net/html package.

2. It then walks the tree depth-first and scores each node en route. The score of the parent node is pushed down as the basis for the score of each child node. The scoring formula is WORDCOUNT - WORDCOUNTINANCHOR^2, where WORDCOUNT is the number of words in the node that are not hyperlinked and WORDCOUNTINANCHOR is the number of words in the node that are hyperlinked. The anchor penalty is actually calculated as 1 + WORDCOUNTINANCHOR^2, because anchors often wrap things other than words, in which case WORDCOUNTINANCHOR^2 alone would be zero; the constant makes every anchor cost something. (See the sketch after this list.)

3. As it goes, it adds nodes that score at or below the minimum to a toDelete slice. When the recursion is finished, it deletes all nodes in the toDelete slice.

4. Finally, the filtered tree is walked again, depth-first, and all remaining text nodes are written to a string, one text node per line. Text inside the title tag is skipped.
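
For illustration, the per-text-node scoring rule can be written as a standalone function. This is only a sketch: in the library itself the logic is inlined in the tree walk, and the name `wordScore` is invented here.

    package main

    import "fmt"

    // wordScore is the per-text-node rule described above: plain words add
    // linearly, hyperlinked words subtract quadratically, plus a constant 1
    // so that even word-free anchors are penalised.
    func wordScore(words int, hyperlinked bool) int {
        if hyperlinked {
            return -(1 + words*words)
        }
        return words
    }

    func main() {
        fmt.Println(wordScore(8, false)) // 8: a plain eight-word sentence
        fmt.Println(wordScore(8, true))  // -65: the same words inside an anchor
        fmt.Println(wordScore(0, true))  // -1: an anchor wrapping no words
    }

A node full of plain prose therefore scores highly, while link-dense regions such as menus and footers quickly go negative.
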
## How to install it

    go get github.com/emiruz/textextract

## How to use it

    import "github.com/emiruz/textextract"

    func main() {
        textextract.MinScore = 5 // the default is 5.
        extractedText, err := textextract.ExtractFromHtml(yourUTF8HTMLString)
        // handle err, then use extractedText ...
    }

## License

MIT Licensed, do as you will with it.

## Bugs

Please submit them as issues on the repository.

## TODO

1. Add tests.

2. Add comments.
--------------------------------------------------------------------------------
/textextract.go:
--------------------------------------------------------------------------------
package textextract

import (
	"bytes"
	"errors"
	"fmt"
	"regexp"
	"strings"

	"golang.org/x/net/html"
)

// MinScore is the score at or below which a node and its subtree are discarded.
var MinScore = 5

// markupRe is compiled once at package load instead of on every call.
var markupRe = regexp.MustCompile(`<[^>]*>|\n|\t| +`)

// isInAnchor reports whether n has an <a> element among its ancestors.
func isInAnchor(n *html.Node) bool {
	if n.Parent == nil {
		return false
	}
	if n.Parent.Data == "a" {
		return true
	}
	return isInAnchor(n.Parent)
}

// normaliseText strips markup remnants, collapses white space into single
// spaces and trims the result.
func normaliseText(t string) string {
	s := markupRe.ReplaceAllString(t, " ")
	// A second pass collapses the space runs introduced by the first.
	s = markupRe.ReplaceAllString(s, " ")
	return strings.TrimSpace(s)
}

// filter scores every node depth-first and removes the subtrees that score
// at or below minScore.
func filter(doc *html.Node, minScore int) *html.Node {
	type NodePair struct {
		Parent *html.Node
		Child  *html.Node
	}
	toDelete := []NodePair{}

	var f func(n *html.Node, score int) int
	f = func(n *html.Node, score int) int {
		if n.Type == html.TextNode {
			// Fields returns 0 for empty text, unlike strings.Split.
			count := len(strings.Fields(normaliseText(n.Data)))
			switch {
			case n.Parent.Data == "script":
			case n.Parent.Data == "style":
			case n.Parent.Data == "link":
				// script, style and link content never counts as text.
			case isInAnchor(n):
				// Hyperlinked words are penalised quadratically, plus a
				// constant 1 so word-free anchors are penalised too.
				// (count*count, not count^2: ^ is XOR in Go.)
				score -= 1 + count*count
			default:
				score += count
			}
			return score
		}

		// The parent's score is pushed down as the basis for each child.
		ownScore := score
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			score += f(c, ownScore)
		}

		if score <= minScore && n.Data != "a" {
			toDelete = append(toDelete, NodePair{n.Parent, n})
		}
		return score
	}
	f(doc, 0)

	// Children are queued before their parents, so removal is safe.
	for _, x := range toDelete {
		if x.Parent != nil {
			x.Parent.RemoveChild(x.Child)
		}
	}
	return doc
}

// ExtractFromHtml parses a UTF-8 HTML string, filters out the low-scoring
// nodes and returns the remaining text, one text node per line.
func ExtractFromHtml(htmlUTF8Str string) (string, error) {
	doc, err := html.Parse(strings.NewReader(htmlUTF8Str))
	if err != nil {
		return "", errors.New("could not parse HTML string")
	}
	doc = filter(doc, MinScore)

	var buffer bytes.Buffer
	var f func(n *html.Node)
	f = func(n *html.Node) {
		d := normaliseText(n.Data)
		if n.Type == html.TextNode && d != "" {
			switch n.Parent.Data {
			case "title":
				// The page title is chrome, not article content; skip it.
			default:
				buffer.WriteString(fmt.Sprintf("\n%s", d))
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			f(c)
		}
	}
	f(doc)
	return buffer.String(), nil
}
--------------------------------------------------------------------------------
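
Not part of the original repository: a minimal test sketch for ExtractFromHtml, offered as a starting point for the "Add tests" TODO item. The sample HTML, the repeated filler text and the test name are invented for illustration.

    package textextract

    import (
        "strings"
        "testing"
    )

    // TestExtractFromHtml is an illustrative sketch: it checks that article
    // text survives extraction while link-heavy navigation is removed.
    func TestExtractFromHtml(t *testing.T) {
        page := `<html><body>
            <nav><a href="/">home</a> <a href="/about">about</a></nav>
            <article><p>` + strings.Repeat("article words ", 10) + `</p></article>
            </body></html>`
        out, err := ExtractFromHtml(page)
        if err != nil {
            t.Fatal(err)
        }
        if !strings.Contains(out, "article words") {
            t.Errorf("expected article text in output, got: %q", out)
        }
        if strings.Contains(out, "home") {
            t.Errorf("expected navigation links to be removed, got: %q", out)
        }
    }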