├── LICENSE
├── README.md
└── textextract.go

/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2018 Emir Uz

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# textextract

textextract is a tiny library (about a hundred lines of Go) that identifies where the article content is in an HTML page (as opposed to navigation, headers, footers, ads, etc.), extracts it and returns it as a string.

It is a tree-search-and-score algorithm with a very simple scoring rule, and it is surprisingly effective.

## What it's for

Use it when you're doing semantic analysis on crawled pages and need the article content to feed into some other process, such as a semantic extractor or classifier. It preserves the rendering order of the text, but it does not preserve white space.

## How it works

1. It parses the HTML into a node tree using the golang.org/x/net/html package.

2. It then walks the tree depth-first and scores each node en route. The score of the parent node is pushed down as the basis for the score of each child node. The scoring formula is WORDCOUNT - WORDCOUNTINANCHOR^2, where WORDCOUNT is the number of words in the node that are not hyperlinked and WORDCOUNTINANCHOR is the number of words in the node that are hyperlinked. The anchor penalty is actually calculated as 1 + WORDCOUNTINANCHOR^2, because anchors often wrap things other than words, in which case WORDCOUNTINANCHOR^2 alone would be zero; the constant makes every anchor cost something. (See the sketch after this list.)

3. As it goes, it adds nodes that score at or below the minimum to a toDelete slice. When the recursion is finished, it deletes all nodes in the toDelete slice.

4. Finally, the filtered tree is walked again, depth-first, and all remaining text nodes are written to a string, one text node per line. Text inside the title tag is skipped.
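
For illustration, the per-text-node scoring rule can be written as a standalone function. This is only a sketch: in the library itself the logic is inlined in the tree walk, and the name `wordScore` is invented here.

    package main

    import "fmt"

    // wordScore is the per-text-node rule described above: plain words add
    // linearly, hyperlinked words subtract quadratically, plus a constant 1
    // so that even word-free anchors are penalised.
    func wordScore(words int, hyperlinked bool) int {
        if hyperlinked {
            return -(1 + words*words)
        }
        return words
    }

    func main() {
        fmt.Println(wordScore(8, false)) // 8: a plain eight-word sentence
        fmt.Println(wordScore(8, true))  // -65: the same words inside an anchor
        fmt.Println(wordScore(0, true))  // -1: an anchor wrapping no words
    }

A node full of plain prose therefore scores highly, while link-dense regions such as menus and footers quickly go negative.
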
## How to install it

    go get github.com/emiruz/textextract

## How to use it

    import "github.com/emiruz/textextract"

    func main() {
        textextract.MinScore = 5 // the default is 5.
        extractedText, err := textextract.ExtractFromHtml(yourUTF8HTMLString)
        // handle err, then use extractedText ...
    }

## License

MIT Licensed, do as you will with it.

## Bugs

Please submit them as issues on the repository.

## TODO

1. Add tests.

2. Add comments.
--------------------------------------------------------------------------------
/textextract.go:
--------------------------------------------------------------------------------
package textextract

import (
	"bytes"
	"errors"
	"fmt"
	"regexp"
	"strings"

	"golang.org/x/net/html"
)

// MinScore is the score at or below which a node and its subtree are discarded.
var MinScore = 5

// markupRe is compiled once at package load instead of on every call.
var markupRe = regexp.MustCompile(`<[^>]*>|\n|\t| +`)

// isInAnchor reports whether n has an <a> element among its ancestors.
func isInAnchor(n *html.Node) bool {
	if n.Parent == nil {
		return false
	}
	if n.Parent.Data == "a" {
		return true
	}
	return isInAnchor(n.Parent)
}

// normaliseText strips markup remnants, collapses white space into single
// spaces and trims the result.
func normaliseText(t string) string {
	s := markupRe.ReplaceAllString(t, " ")
	// A second pass collapses the space runs introduced by the first.
	s = markupRe.ReplaceAllString(s, " ")
	return strings.TrimSpace(s)
}

// filter scores every node depth-first and removes the subtrees that score
// at or below minScore.
func filter(doc *html.Node, minScore int) *html.Node {
	type NodePair struct {
		Parent *html.Node
		Child  *html.Node
	}
	toDelete := []NodePair{}

	var f func(n *html.Node, score int) int
	f = func(n *html.Node, score int) int {
		if n.Type == html.TextNode {
			// Fields returns 0 for empty text, unlike strings.Split.
			count := len(strings.Fields(normaliseText(n.Data)))
			switch {
			case n.Parent.Data == "script":
			case n.Parent.Data == "style":
			case n.Parent.Data == "link":
				// script, style and link content never counts as text.
			case isInAnchor(n):
				// Hyperlinked words are penalised quadratically, plus a
				// constant 1 so word-free anchors are penalised too.
				// (count*count, not count^2: ^ is XOR in Go.)
				score -= 1 + count*count
			default:
				score += count
			}
			return score
		}

		// The parent's score is pushed down as the basis for each child.
		ownScore := score
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			score += f(c, ownScore)
		}

		if score <= minScore && n.Data != "a" {
			toDelete = append(toDelete, NodePair{n.Parent, n})
		}
		return score
	}
	f(doc, 0)

	// Children are queued before their parents, so removal is safe.
	for _, x := range toDelete {
		if x.Parent != nil {
			x.Parent.RemoveChild(x.Child)
		}
	}
	return doc
}

// ExtractFromHtml parses a UTF-8 HTML string, filters out the low-scoring
// nodes and returns the remaining text, one text node per line.
func ExtractFromHtml(htmlUTF8Str string) (string, error) {
	doc, err := html.Parse(strings.NewReader(htmlUTF8Str))
	if err != nil {
		return "", errors.New("could not parse HTML string")
	}
	doc = filter(doc, MinScore)

	var buffer bytes.Buffer
	var f func(n *html.Node)
	f = func(n *html.Node) {
		d := normaliseText(n.Data)
		if n.Type == html.TextNode && d != "" {
			switch n.Parent.Data {
			case "title":
				// The page title is chrome, not article content; skip it.
			default:
				buffer.WriteString(fmt.Sprintf("\n%s", d))
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			f(c)
		}
	}
	f(doc)
	return buffer.String(), nil
}
--------------------------------------------------------------------------------
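
Not part of the original repository: a minimal test sketch for ExtractFromHtml, offered as a starting point for the "Add tests" TODO item. The sample HTML, the repeated filler text and the test name are invented for illustration.

    package textextract

    import (
        "strings"
        "testing"
    )

    // TestExtractFromHtml is an illustrative sketch: it checks that article
    // text survives extraction while link-heavy navigation is removed.
    func TestExtractFromHtml(t *testing.T) {
        page := `<html><body>
            <nav><a href="/">home</a> <a href="/about">about</a></nav>
            <article><p>` + strings.Repeat("article words ", 10) + `</p></article>
            </body></html>`
        out, err := ExtractFromHtml(page)
        if err != nil {
            t.Fatal(err)
        }
        if !strings.Contains(out, "article words") {
            t.Errorf("expected article text in output, got: %q", out)
        }
        if strings.Contains(out, "home") {
            t.Errorf("expected navigation links to be removed, got: %q", out)
        }
    }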