├── COPYING ├── README.md ├── doc.go ├── elcatalog.go ├── element.go ├── example └── extract.go ├── fetch.go ├── main.go ├── qa └── qa.go ├── text.go ├── util.go └── util_test.go /COPYING: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014, Alessandro Arzilli 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | * Redistributions of source code must retain the above copyright 7 | notice, this list of conditions and the following disclaimer. 8 | * Redistributions in binary form must reproduce the above copyright 9 | notice, this list of conditions and the following disclaimer in the 10 | documentation and/or other materials provided with the distribution. 11 | * Neither the name of the project nor the 12 | names of its contributors may be used to endorse or promote products 13 | derived from this software without specific prior written permission. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY 19 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Library that uses Readability-like heuristics to extract text from an HTML document. 2 | 3 | Example: 4 | ```go 5 | import "golang.org/x/net/html" 6 | … 7 | node, err := html.Parse(bytes.NewReader(raw_html)) 8 | if err != nil { 9 | log.Fatal("Parsing error: ", err) 10 | } 11 | title, text := sandblast.Extract(node) 12 | fmt.Printf("Title: %s\n%s", title, text) 13 | … 14 | ``` 15 | See also `example/extract.go`, a command line utility to extract text from a URL. 16 | -------------------------------------------------------------------------------- /doc.go: -------------------------------------------------------------------------------- 1 | //Library that uses Readability-like heuristics to extract text from an HTML document 2 | package sandblast 3 | -------------------------------------------------------------------------------- /elcatalog.go: -------------------------------------------------------------------------------- 1 | package sandblast 2 | 3 | import ( 4 | "golang.org/x/net/html" 5 | "strings" 6 | ) 7 | 8 | type nodeKind int 9 | 10 | const ( 11 | _K_SUPPRESSED = nodeKind(iota) 12 | _K_TODESTRUCTURE 13 | _K_CONTAINER 14 | _K_KOTCONTAINER 15 | _K_FORMATTING 16 | _K_INLINE 17 | ) 18 | 19 | var elements = map[string]nodeKind{ 20 | /* Suppressed */ 21 | "head": _K_SUPPRESSED, 22 | "base": _K_SUPPRESSED, "link": _K_SUPPRESSED, "meta": _K_SUPPRESSED, "title": _K_SUPPRESSED, 23 | "script": _K_SUPPRESSED, "noscript": _K_SUPPRESSED, "style": _K_SUPPRESSED, 24 | "input": _K_SUPPRESSED, "label": _K_SUPPRESSED, "textarea": _K_SUPPRESSED, "button": _K_SUPPRESSED, 25 | "isindex": _K_SUPPRESSED, 26 | "object": _K_SUPPRESSED, "applet": _K_SUPPRESSED, "img": _K_SUPPRESSED, "map": _K_SUPPRESSED, 27 | "address": _K_SUPPRESSED, 28 | "basefont": _K_SUPPRESSED, 29 | "colgroup": _K_SUPPRESSED, "col": _K_SUPPRESSED, "caption": _K_SUPPRESSED, 30 | "br": _K_SUPPRESSED, "hr": _K_SUPPRESSED, 31 | "canvas": _K_SUPPRESSED, 32 | "audio": _K_SUPPRESSED, "video": _K_SUPPRESSED, "source": _K_SUPPRESSED, "track": _K_SUPPRESSED, "embed": _K_SUPPRESSED, 33 | "datalist": _K_SUPPRESSED, "keygen": _K_SUPPRESSED, "output": _K_SUPPRESSED, 34 | "command": _K_SUPPRESSED, "progress": _K_SUPPRESSED, 35 | "ruby": _K_SUPPRESSED, "rt": _K_SUPPRESSED, "rp": _K_SUPPRESSED, 36 | 37 | /* To Destructure */ 38 | "html": _K_TODESTRUCTURE, 39 | "tbody": _K_TODESTRUCTURE, "thread": _K_TODESTRUCTURE, "tfoot": _K_TODESTRUCTURE, "tr": _K_TODESTRUCTURE, "th": _K_TODESTRUCTURE, 40 | "form": _K_TODESTRUCTURE, "fieldset": _K_TODESTRUCTURE, 41 | "optgroup": _K_TODESTRUCTURE, 42 | "iframe": _K_TODESTRUCTURE, 43 | "legend": _K_TODESTRUCTURE, "bdo": _K_TODESTRUCTURE, 44 | "abbr": _K_TODESTRUCTURE, "acronym": _K_TODESTRUCTURE, 45 | "figure": _K_TODESTRUCTURE, "figcaption": _K_TODESTRUCTURE, 46 | 47 | /* Container */ 48 | "div": _K_CONTAINER, "span": _K_CONTAINER, 49 | "select": _K_CONTAINER, "option": _K_CONTAINER, 50 | "table": _K_CONTAINER, "td": _K_CONTAINER, 51 | "dir": _K_CONTAINER, "dl": _K_CONTAINER, "dt": _K_CONTAINER, "dd": _K_CONTAINER, 52 | "menu": _K_CONTAINER, 53 | "ul": _K_CONTAINER, "ol": _K_CONTAINER, "li": _K_CONTAINER, 54 | "blockquote": _K_CONTAINER, "p": _K_CONTAINER, "cite": _K_CONTAINER, "pre": _K_CONTAINER, 55 | "h4": _K_CONTAINER, "h5": _K_CONTAINER, "h6": _K_CONTAINER, 56 | "header": _K_CONTAINER, "hgroup": _K_CONTAINER, "main": _K_CONTAINER, "article": _K_CONTAINER, "aside": _K_CONTAINER, "footer": _K_CONTAINER, "details": _K_CONTAINER, "summary": _K_CONTAINER, 57 | "nav": _K_CONTAINER, "section": _K_CONTAINER, 58 | "dialog": _K_CONTAINER, 59 | 60 | /* Keep Original Tag Container */ 61 | "h1": _K_KOTCONTAINER, "h2": _K_KOTCONTAINER, "h3": _K_KOTCONTAINER, 62 | 63 | /* Formatting */ 64 | "tt": _K_FORMATTING, 65 | "small": _K_FORMATTING, "big": _K_FORMATTING, 66 | "s": _K_FORMATTING, "strike": _K_FORMATTING, 67 | "center": _K_FORMATTING, 68 | "dfn": _K_FORMATTING, "del": _K_FORMATTING, 69 | "kbd": _K_FORMATTING, "samp": _K_FORMATTING, "var": _K_FORMATTING, "code": _K_FORMATTING, 70 | "q'": _K_FORMATTING, "ins": _K_FORMATTING, 71 | "sub": _K_FORMATTING, "sup": _K_FORMATTING, 72 | "font": _K_FORMATTING, 73 | "mark": _K_FORMATTING, "time": _K_FORMATTING, "bdi": _K_FORMATTING, "wbr": _K_FORMATTING, 74 | "meter": _K_FORMATTING, 75 | 76 | /* Inline */ 77 | "strong": _K_INLINE, "b": _K_INLINE, 78 | "i": _K_INLINE, "em": _K_INLINE, 79 | "u": _K_INLINE, 80 | "a": _K_INLINE, 81 | } 82 | 83 | func getNodeKind(node *html.Node) nodeKind { 84 | kind, ok := elements[strings.ToLower(node.Data)] 85 | if !ok { 86 | kind, _ = elements["div"] 87 | } 88 | return kind 89 | } 90 | -------------------------------------------------------------------------------- /element.go: -------------------------------------------------------------------------------- 1 | package sandblast 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "golang.org/x/net/html" 7 | henc "html" 8 | "io" 9 | "strings" 10 | ) 11 | 12 | type element struct { 13 | tag string 14 | childs []*element 15 | content string 16 | collapse bool 17 | originalTag string 18 | linkPart float32 19 | hrefs []string 20 | } 21 | 22 | const ( 23 | _LINK_START = "\x11" 24 | _LINK_END = "\x13" 25 | ) 26 | 27 | func (el *element) Clone() *element { 28 | r := &element{} 29 | r.tag = el.tag 30 | r.content = el.content 31 | r.collapse = el.collapse 32 | r.originalTag = el.originalTag 33 | r.linkPart = el.linkPart 34 | r.hrefs = make([]string, len(el.hrefs)) 35 | copy(r.hrefs, el.hrefs) 36 | if el.childs != nil { 37 | r.childs = make([]*element, len(el.childs)) 38 | for i := range r.childs { 39 | r.childs[i] = el.childs[i].Clone() 40 | } 41 | } else { 42 | r.childs = nil 43 | } 44 | return r 45 | } 46 | 47 | func newContentElement(tag, content string) *element { 48 | return &element{tag, nil, content, false, "", 0.0, nil} 49 | } 50 | 51 | func newChildElement(tag string, childs []*element) *element { 52 | return &element{tag, childs, "", false, "", 0.0, nil} 53 | } 54 | 55 | // Returns a representation of the element suitable for debugging the library 56 | func (e *element) DebugString() string { 57 | out := bytes.NewBuffer([]byte{}) 58 | e.debugStringEx(out, 0) 59 | return string(out.Bytes()) 60 | } 61 | 62 | func (e *element) debugStringEx(out *bytes.Buffer, depth int) { 63 | out.Write([]byte(makeIndent(depth))) 64 | 65 | fmt.Fprintf(out, "<%s", e.tag) 66 | if e.originalTag != "" { 67 | fmt.Fprintf(out, ":%s", e.originalTag) 68 | } 69 | if e.linkPart > 0.001 { 70 | fmt.Fprintf(out, ":%g", e.linkPart) 71 | } 72 | if len(e.hrefs) > 0 { 73 | fmt.Fprintf(out, ":%s", strings.Join(e.hrefs, ",")) 74 | } 75 | out.Write([]byte{'>'}) 76 | if e.childs == nil { 77 | var lctxt linkContext 78 | fmt.Fprintf(out, "[%s(%d)]\n", lctxt.convertLinks(e.content, true), len(e.content)) 79 | } else { 80 | fmt.Fprintf(out, "[%d]\n", len(e.childs)) 81 | sep := false 82 | for _, child := range e.childs { 83 | if child != nil { 84 | child.debugStringEx(out, depth+1) 85 | sep = false 86 | } else { 87 | if !sep { 88 | out.Write([]byte{'\n'}) 89 | sep = true 90 | } 91 | 92 | fmt.Fprintf(out, "%s%v\n", makeIndent(depth+1), child) 93 | } 94 | } 95 | } 96 | } 97 | 98 | func (e *element) String(flags Flags) string { 99 | if e == nil { 100 | return "" 101 | } 102 | out := bytes.NewBuffer([]byte{}) 103 | var lctxt linkContext 104 | e.stringEx(out, flags, &lctxt) 105 | if flags&KeepLinks != 0 { 106 | if len(lctxt.hrefs) > 0 { 107 | io.WriteString(out, "\n") 108 | } 109 | for i := range lctxt.hrefs { 110 | fmt.Fprintf(out, "\t[%d] %s\n", i, lctxt.hrefs[i]) 111 | } 112 | } 113 | if lctxt.cnt != len(lctxt.hrefs) { 114 | fmt.Fprintf(out, "LINK COUNT INCONSISTENCY (%d %d)\n", lctxt.cnt, len(lctxt.hrefs)) 115 | } 116 | return string(out.Bytes()) 117 | } 118 | 119 | func (e *element) stringEx(out *bytes.Buffer, flags Flags, lctxt *linkContext) { 120 | if e.childs == nil { 121 | if len(e.hrefs) > 0 { 122 | ctnt := lctxt.convertLinks(e.content, flags&KeepLinks != 0) 123 | io.WriteString(out, strings.TrimSpace(ctnt)) 124 | lctxt.push(e.hrefs) 125 | } else { 126 | io.WriteString(out, strings.TrimSpace(e.content)) 127 | } 128 | } else { 129 | out.Write([]byte{'\n'}) 130 | sep := true 131 | for _, child := range e.childs { 132 | if child != nil { 133 | child.stringEx(out, flags, lctxt) 134 | } else { 135 | if !sep { 136 | out.Write([]byte{'\n'}) 137 | sep = true 138 | } 139 | } 140 | } 141 | } 142 | 143 | out.Write([]byte{'\n'}) 144 | } 145 | 146 | type linkContext struct { 147 | cnt int 148 | hrefs []string 149 | } 150 | 151 | func (lctxt *linkContext) convertLinks(s string, keep bool) string { 152 | in := []byte(s) 153 | out := make([]byte, 0, len(in)) 154 | for _, ch := range in { 155 | switch ch { 156 | case _LINK_START[0]: 157 | // nothing 158 | case _LINK_END[0]: 159 | if keep { 160 | out = append(out, []byte(fmt.Sprintf(" [%d]", lctxt.cnt))...) 161 | } 162 | lctxt.cnt++ 163 | default: 164 | out = append(out, ch) 165 | } 166 | } 167 | return string(out) 168 | } 169 | 170 | func (lctxt *linkContext) push(hrefs []string) { 171 | lctxt.hrefs = append(lctxt.hrefs, hrefs...) 172 | } 173 | 174 | func (e *element) isHeader() bool { 175 | if e.tag != "~textdiv" && e.tag != "~text" { 176 | return false 177 | } 178 | return e.originalTag == "h" 179 | } 180 | 181 | func (e *element) isLinkList() bool { 182 | if e.childs == nil { 183 | return false 184 | } 185 | 186 | if len(e.childs) < 5 { 187 | return false 188 | } 189 | 190 | if e.tag == "select" { 191 | return true 192 | } 193 | 194 | nlinks := 0 195 | for _, child := range e.childs { 196 | if child.tag != "~text" && child.tag != "~textdiv" { 197 | return false 198 | } 199 | if child.linkPart > 0.70 { 200 | nlinks++ 201 | } 202 | } 203 | if nlinks < 2 { 204 | return false 205 | } 206 | return (nlinks >= len(e.childs)-2) || (nlinks > int(float32(len(e.childs))*0.75)) 207 | } 208 | 209 | func (e *element) isLinkBlob() bool { 210 | if e.tag != "~text" && e.tag != "~textdiv" { 211 | return false 212 | } 213 | return e.linkPart > 0.7 214 | } 215 | 216 | func (e *element) okText() bool { 217 | return e != nil && e.tag == "~textblock" && len(e.content) > 50 218 | } 219 | 220 | /* Fuses a text element to the last text element in childs. 221 | If this is not possible (for example because childs doesn't end with a text element) returns false 222 | */ 223 | func pushTextEx(childs []*element, ts string, hrefs []string, tsLinkPart float32) bool { 224 | if childs == nil || len(childs) == 0 { 225 | return false 226 | } 227 | last := childs[len(childs)-1] 228 | if last.tag != "~text" { 229 | return false 230 | } 231 | newLinkPart := float32(len(ts))*tsLinkPart + float32(len(last.content))*last.linkPart 232 | last.content += " " + ts 233 | last.linkPart = newLinkPart / float32(len(last.content)) 234 | last.hrefs = append(last.hrefs, hrefs...) 235 | return true 236 | } 237 | 238 | // Adds a new text element to childs 239 | func pushText(childs []*element, node *html.Node) []*element { 240 | 241 | ts := []rune(henc.UnescapeString(node.Data)) 242 | ts = collapseWhitespace(ts) 243 | ts = cleanAsciiArt(ts) 244 | ts = cleanControl(ts) 245 | 246 | if len(ts) <= 0 { 247 | return childs 248 | } 249 | 250 | added := pushTextEx(childs, string(ts), nil, 0.0) 251 | if !added { 252 | childs = append(childs, newContentElement("~text", string(ts))) 253 | } 254 | return childs 255 | } 256 | 257 | func pushElement(childs []*element, child *element) []*element { 258 | if !child.collapse { 259 | added := false 260 | if child.tag == "~text" { 261 | added = pushTextEx(childs, child.content, child.hrefs, child.linkPart) 262 | } 263 | if !added { 264 | childs = append(childs, child) 265 | } 266 | return childs 267 | } else { 268 | // collapsing 269 | for _, cc := range child.childs { 270 | added := false 271 | if cc.tag == "~text" { 272 | added = pushTextEx(childs, cc.content, cc.hrefs, cc.linkPart) 273 | } 274 | if !added { 275 | childs = append(childs, cc) 276 | } 277 | } 278 | return childs 279 | } 280 | } 281 | -------------------------------------------------------------------------------- /example/extract.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "os" 5 | "fmt" 6 | "bytes" 7 | "github.com/aarzilli/sandblast" 8 | "golang.org/x/net/html" 9 | "log" 10 | ) 11 | 12 | func usage() { 13 | fmt.Fprintf(os.Stderr, "Usage: extract [debug]\n") 14 | os.Exit(1) 15 | } 16 | 17 | func main() { 18 | if len(os.Args) < 2 { 19 | usage() 20 | } 21 | 22 | url := os.Args[1] 23 | isDebug := false 24 | if len(os.Args) >= 3 { 25 | if os.Args[2] == "debug" { 26 | isDebug = true 27 | } else { 28 | usage() 29 | } 30 | } 31 | 32 | rawhtml, _, _, err := sandblast.FetchURL(url) 33 | if err != nil { 34 | log.Fatalf("Could not fetch url: %s\n", url) 35 | } 36 | 37 | node, err := html.Parse(bytes.NewReader([]byte(rawhtml))) 38 | if err != nil { 39 | log.Fatal("Parsing error: ", err) 40 | } 41 | title, text, simplified, flattened, cleaned, err := sandblast.ExtractEx(node, sandblast.KeepLinks) 42 | if err != nil { 43 | log.Fatal("Extraction error: ", err) 44 | } 45 | 46 | fmt.Printf("TITLE: %s\n", title) 47 | if isDebug { 48 | fmt.Printf("SIMPLIFIED:\n%s\n", simplified.DebugString()) 49 | fmt.Printf("FLATTENED:\n%s\n", flattened.DebugString()) 50 | fmt.Printf("CLEANED:\n%s\n", cleaned.DebugString()) 51 | } 52 | fmt.Printf("TEXT:\n%s\n", text) 53 | } 54 | -------------------------------------------------------------------------------- /fetch.go: -------------------------------------------------------------------------------- 1 | package sandblast 2 | 3 | import ( 4 | "golang.org/x/net/html/charset" 5 | "golang.org/x/text/transform" 6 | "io" 7 | "io/ioutil" 8 | "net/http" 9 | ) 10 | 11 | // Returns the body of resp as a decoded string, detecting its encoding 12 | func DecodedBody(resp *http.Response) (content []byte, encoding string, err error) { 13 | defer resp.Body.Close() 14 | body, err := ioutil.ReadAll(resp.Body) 15 | if err != nil && err != io.EOF { 16 | content = body 17 | return 18 | } 19 | e, encoding, _ := charset.DetermineEncoding(body, resp.Header.Get("Content-Type")) 20 | t := e.NewDecoder() 21 | content = make([]byte, len(body)) 22 | start := 0 23 | for { 24 | var nDst, nSrc int 25 | nDst, nSrc, err = t.Transform(content[start:], body, true) 26 | body = body[nSrc:] 27 | start += nDst 28 | switch err { 29 | case transform.ErrShortDst: 30 | newContent := make([]byte, len(content)*2) 31 | copy(newContent, content) 32 | content = newContent 33 | case transform.ErrShortSrc: 34 | return 35 | default: 36 | content = content[:start] 37 | return 38 | } 39 | } 40 | return 41 | } 42 | 43 | func FetchURL(url string) (body []byte, status int, encoding string, err error) { 44 | resp, err := http.Get(url) 45 | if resp != nil { 46 | status = resp.StatusCode 47 | } 48 | if err != nil { 49 | return 50 | } 51 | body, encoding, err = DecodedBody(resp) 52 | return 53 | } 54 | -------------------------------------------------------------------------------- /main.go: -------------------------------------------------------------------------------- 1 | package sandblast 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "golang.org/x/net/html" 7 | "strings" 8 | ) 9 | 10 | func extractEx(node *html.Node, flags Flags) (title, text string, simplified, flattened, cleaned *element, err error) { 11 | root := findRoot(node) 12 | if root == nil { 13 | err = fmt.Errorf("Could not find root") 14 | return 15 | } 16 | 17 | title = getTitle(root) 18 | simplified, flattened, cleaned = extractTextEx(root, flags) 19 | if cleaned == nil { 20 | text = "" 21 | } else { 22 | text = cleaned.String(flags) 23 | } 24 | return 25 | } 26 | 27 | func ExtractEx(node *html.Node, flags Flags) (title, text string, simplified, flattened, cleaned *element, err error) { 28 | title, text, simplified, flattened, cleaned, err = extractEx(node, flags) 29 | return 30 | } 31 | 32 | func Extract(node *html.Node, flags Flags) (title, text string, err error) { 33 | title, text, _, _, _, err = extractEx(node, flags|isDestructive) 34 | return 35 | } 36 | 37 | func findRoot(node *html.Node) *html.Node { 38 | if node == nil { 39 | return nil 40 | } 41 | if node.Type == html.DocumentNode { 42 | return findRoot(node.FirstChild) 43 | } 44 | for node != nil { 45 | if (node.Type == html.ElementNode) && (strings.ToLower(node.Data) == "html") { 46 | return node 47 | } 48 | node = node.NextSibling 49 | } 50 | return nil 51 | } 52 | 53 | func getTitle(root *html.Node) string { 54 | head := findChild(root, "head") 55 | title := findChild(head, "title") 56 | if title == nil { 57 | return "" 58 | } 59 | return strings.TrimSpace(findContent(title.FirstChild)) 60 | 61 | } 62 | 63 | func findChild(root *html.Node, name string) *html.Node { 64 | if root == nil { 65 | return nil 66 | } 67 | name = strings.ToLower(name) 68 | child := root.FirstChild 69 | for child != nil { 70 | if (child.Type == html.ElementNode) && (strings.ToLower(child.Data) == name) { 71 | return child 72 | } 73 | child = child.NextSibling 74 | } 75 | return nil 76 | } 77 | 78 | func findContent(node *html.Node) string { 79 | if node == nil { 80 | return "" 81 | } 82 | out := bytes.NewBuffer([]byte{}) 83 | for node != nil { 84 | if node.Type == html.TextNode { 85 | out.Write([]byte(node.Data)) 86 | } 87 | node = node.NextSibling 88 | } 89 | return string(out.Bytes()) 90 | } 91 | -------------------------------------------------------------------------------- /qa/qa.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "archive/zip" 5 | "bytes" 6 | "fmt" 7 | "github.com/aarzilli/sandblast" 8 | "golang.org/x/net/html" 9 | "golang.org/x/net/html/charset" 10 | "golang.org/x/text/transform" 11 | "io" 12 | "io/ioutil" 13 | "os" 14 | "strings" 15 | "unicode" 16 | ) 17 | 18 | func usage() { 19 | fmt.Fprintf(os.Stderr, "./qa run \n") 20 | fmt.Fprintf(os.Stderr, "./qa rebuild \n") 21 | fmt.Fprintf(os.Stderr, "./qa one \n") 22 | os.Exit(1) 23 | } 24 | 25 | func must(err error) { 26 | if err != nil { 27 | panic(err) 28 | } 29 | } 30 | 31 | type test struct { 32 | name string 33 | input *zip.File 34 | target *zip.File 35 | } 36 | 37 | type Dataset struct { 38 | datazip *zip.ReadCloser 39 | index *zip.File 40 | tests []test 41 | } 42 | 43 | func (d *Dataset) Close() error { 44 | return d.datazip.Close() 45 | } 46 | 47 | func collapseWhitespace(in []rune) []rune { 48 | var b []rune = make([]rune, len(in)) 49 | d := 0 50 | spaceSeen := true 51 | for s := range in { 52 | if spaceSeen { 53 | if !unicode.IsSpace(in[s]) { 54 | spaceSeen = false 55 | b[d] = in[s] 56 | d++ 57 | } 58 | } else { 59 | if unicode.IsSpace(in[s]) { 60 | b[d] = ' ' 61 | d++ 62 | spaceSeen = true 63 | } else { 64 | b[d] = in[s] 65 | d++ 66 | } 67 | } 68 | } 69 | return b[:d] 70 | } 71 | 72 | func openDataset(datapath string) *Dataset { 73 | dataset, err := zip.OpenReader(datapath) 74 | must(err) 75 | 76 | ins := map[string]*zip.File{} 77 | outs := map[string]*zip.File{} 78 | var index *zip.File 79 | 80 | for _, file := range dataset.File { 81 | if file.Name == "index.txt" { 82 | index = file 83 | continue 84 | } 85 | v := strings.Split(file.Name, ".") 86 | if len(v) != 2 { 87 | panic(fmt.Errorf("wrong name in dataset: %s\n", file.Name)) 88 | } 89 | 90 | switch v[1] { 91 | case "html": 92 | ins[v[0]] = file 93 | case "target": 94 | outs[v[0]] = file 95 | default: 96 | panic(fmt.Errorf("wrong name in dataset: %s\n", file.Name)) 97 | } 98 | } 99 | 100 | tests := make([]test, 0, len(ins)) 101 | for k := range ins { 102 | in, inok := ins[k] 103 | out, outok := outs[k] 104 | if !inok || !outok { 105 | panic(fmt.Errorf("problem with dataset: %s", k)) 106 | } 107 | tests = append(tests, test{name: k, input: in, target: out}) 108 | } 109 | 110 | return &Dataset{index: index, datazip: dataset, tests: tests} 111 | } 112 | 113 | func qarun(datapath string) { 114 | dataset := openDataset(datapath) 115 | defer dataset.Close() 116 | 117 | os.Mkdir("work", 0770) 118 | 119 | count := 0 120 | for _, test := range dataset.tests { 121 | fmt.Printf("Processing %s\n", test.name) 122 | if !qaruntest(test, false) { 123 | count++ 124 | } 125 | if count > 10 { 126 | fmt.Printf("Too many differences\n") 127 | return 128 | } 129 | } 130 | fmt.Printf("All ok\n") 131 | } 132 | 133 | func qaone(datapath string, name string) { 134 | dataset := openDataset(datapath) 135 | defer dataset.Close() 136 | 137 | os.Mkdir("work", 0770) 138 | 139 | for _, test := range dataset.tests { 140 | if test.name == name { 141 | qaruntest(test, true) 142 | return 143 | } 144 | } 145 | } 146 | 147 | func extractTest(test test, writeextract bool) ([]byte, string) { 148 | in, err := test.input.Open() 149 | must(err) 150 | defer in.Close() 151 | 152 | body, err := ioutil.ReadAll(in) 153 | must(err) 154 | 155 | e, _, _ := charset.DetermineEncoding(body, "UTF-8") 156 | r := transform.NewReader(bytes.NewReader(body), e.NewDecoder()) 157 | node, err := html.Parse(r) 158 | must(err) 159 | 160 | _, output, simplified, flattened, cleaned, err := sandblast.ExtractEx(node, 0) 161 | must(err) 162 | 163 | if writeextract { 164 | fmt.Printf("SIMPLIFIED:\n%s\n", simplified.DebugString()) 165 | fmt.Printf("FLATTENED:\n%s\n", flattened.DebugString()) 166 | fmt.Printf("CLEANED:\n%s\n", cleaned.DebugString()) 167 | } 168 | 169 | return body, output 170 | } 171 | 172 | func qaruntest(test test, writein bool) bool { 173 | body, output := extractTest(test, writein) 174 | 175 | tgt, err := test.target.Open() 176 | must(err) 177 | defer tgt.Close() 178 | 179 | tgtbody, err := ioutil.ReadAll(tgt) 180 | must(err) 181 | target := strings.TrimSpace(string(tgtbody)) 182 | 183 | a := strings.TrimSpace(string(collapseWhitespace([]rune(target)))) 184 | b := strings.TrimSpace(string(collapseWhitespace([]rune(output)))) 185 | if a != b { 186 | fmt.Printf("%s output and target differ\n", test.name) 187 | //fmt.Printf("target: <%s>\noutput: <%s>\n", a, b) 188 | tgtout, err := os.Create(fmt.Sprintf("work/%s.target", test.name)) 189 | must(err) 190 | io.WriteString(tgtout, target) 191 | io.WriteString(tgtout, "\n") 192 | tgtout.Close() 193 | outout, err := os.Create(fmt.Sprintf("work/%s.out", test.name)) 194 | must(err) 195 | io.WriteString(outout, output) 196 | io.WriteString(outout, "\n") 197 | outout.Close() 198 | 199 | if writein { 200 | inout, err := os.Create(fmt.Sprintf("work/%s.html", test.name)) 201 | must(err) 202 | inout.Write(body) 203 | inout.Close() 204 | } 205 | 206 | return false 207 | } 208 | return true 209 | } 210 | 211 | func qarebuild(datapath, outpath string) { 212 | dataset := openDataset(datapath) 213 | defer dataset.Close() 214 | 215 | outw, err := os.Create(outpath) 216 | must(err) 217 | defer outw.Close() 218 | 219 | outzip := zip.NewWriter(outw) 220 | defer outzip.Close() 221 | 222 | copyFile(outzip, "index.txt", dataset.index) 223 | 224 | for _, test := range dataset.tests { 225 | fmt.Printf("processing %s\n", test.name) 226 | copyFile(outzip, fmt.Sprintf("%s.html", test.name), test.input) 227 | _, output := extractTest(test, false) 228 | w, err := outzip.Create(fmt.Sprintf("%s.target", test.name)) 229 | must(err) 230 | _, err = io.WriteString(w, output) 231 | must(err) 232 | } 233 | } 234 | 235 | func copyFile(outzip *zip.Writer, name string, in *zip.File) { 236 | w, err := outzip.Create(name) 237 | must(err) 238 | r, err := in.Open() 239 | must(err) 240 | defer r.Close() 241 | _, err = io.Copy(w, r) 242 | must(err) 243 | } 244 | 245 | func main() { 246 | if len(os.Args) < 1 { 247 | usage() 248 | } 249 | switch os.Args[1] { 250 | case "run": 251 | if len(os.Args) < 3 { 252 | usage() 253 | } 254 | qarun(os.Args[2]) 255 | case "one": 256 | qaone(os.Args[2], os.Args[3]) 257 | case "rebuild": 258 | if len(os.Args) < 4 { 259 | usage() 260 | } 261 | qarebuild(os.Args[2], os.Args[3]) 262 | case "help": 263 | usage() 264 | default: 265 | usage() 266 | } 267 | } 268 | -------------------------------------------------------------------------------- /text.go: -------------------------------------------------------------------------------- 1 | package sandblast 2 | 3 | import ( 4 | "bytes" 5 | "golang.org/x/net/html" 6 | "strings" 7 | ) 8 | 9 | const _MAX_PROCESSING_DEPTH = 100 10 | 11 | type Flags int 12 | 13 | const ( 14 | KeepMenus = Flags(1 << iota) // Not implemented 15 | KeepLinks // Keeps link destinations for links embedded inside text blocks 16 | KeepImages // Not implemented 17 | MarkTitles // Not implemented 18 | isDestructive // Intermediate values will be discarded (internal) 19 | ) 20 | 21 | func extractTextEx(root *html.Node, flags Flags) (simplified, flattened, cleaned *element) { 22 | simplified = simplify(root, 0) 23 | if simplified == nil { 24 | return nil, nil, nil 25 | } 26 | if flags&isDestructive != 0 { 27 | flattened = flatten(simplified) 28 | } else { 29 | x := simplified.Clone() 30 | //println("Flatten argument:", x.DebugString()) 31 | flattened = flatten(x) 32 | } 33 | if flags&isDestructive != 0 { 34 | cleaned = clean(flattened) 35 | } else { 36 | cleaned = clean(flattened.Clone()) 37 | } 38 | return 39 | } 40 | 41 | func simplify(node *html.Node, depth int) *element { 42 | if depth > _MAX_PROCESSING_DEPTH { 43 | return nil 44 | } 45 | 46 | switch node.Type { 47 | case html.ErrorNode: 48 | return nil 49 | case html.CommentNode: 50 | return nil 51 | case html.DoctypeNode: 52 | return nil 53 | case html.DocumentNode: 54 | return nil 55 | 56 | case html.TextNode: 57 | return newContentElement("~text", node.Data) 58 | 59 | case html.ElementNode: 60 | // rest 61 | } 62 | 63 | kind := getNodeKind(node) 64 | if kind == _K_SUPPRESSED { 65 | return nil 66 | } 67 | 68 | childs := []*element{} 69 | 70 | for childn := node.FirstChild; childn != nil; childn = childn.NextSibling { 71 | if childn.Type == html.TextNode { 72 | childs = pushText(childs, childn) 73 | } else { 74 | child := simplify(childn, depth+1) 75 | if child != nil { 76 | childs = pushElement(childs, child) 77 | } 78 | } 79 | } 80 | 81 | if len(childs) == 0 { 82 | return nil 83 | } 84 | 85 | kot := false 86 | switch kind { 87 | case _K_KOTCONTAINER: 88 | kot = true 89 | fallthrough 90 | case _K_CONTAINER: 91 | if len(childs) == 1 { 92 | if childs[0].tag == "~text" { 93 | if kot { 94 | childs[0].originalTag = "h" 95 | } 96 | childs[0].tag = "~textdiv" 97 | } 98 | return childs[0] 99 | } 100 | 101 | case _K_FORMATTING: 102 | if len(childs) == 1 { 103 | return childs[0] 104 | } 105 | 106 | case _K_INLINE: 107 | if len(childs) <= 0 { 108 | return nil 109 | } 110 | 111 | if len(childs) == 1 { 112 | if (childs[0].tag == "~text" || childs[0].tag == "~textdiv") && strings.ToLower(node.Data) == "a" { 113 | childs[0].hrefs = []string{getAttribute(node, "href")} 114 | childs[0].linkPart = 1.0 115 | childs[0].content = _LINK_START + childs[0].content + _LINK_END 116 | } 117 | return childs[0] 118 | } 119 | 120 | case _K_TODESTRUCTURE: 121 | if strings.ToLower(node.Data) == "tr" { 122 | linkPart := float32(0.0) 123 | hasComplexTds := false 124 | trText := bytes.NewBuffer([]byte{}) 125 | 126 | for _, child := range childs { 127 | if !(child.tag == "~textdiv") { 128 | hasComplexTds = true 129 | break 130 | } 131 | 132 | trText.Write([]byte(child.content)) 133 | trText.Write([]byte{' '}) 134 | linkPart += float32(len(child.content)) * child.linkPart 135 | } 136 | 137 | if !hasComplexTds { 138 | r := newContentElement("~textdiv", string(trText.Bytes())) 139 | r.linkPart = linkPart / float32(len(r.content)) 140 | for _, child := range childs { 141 | r.hrefs = append(r.hrefs, child.hrefs...) 142 | } 143 | return r 144 | } 145 | } 146 | 147 | r := newChildElement("~transient", childs) 148 | r.collapse = true 149 | 150 | return r 151 | } 152 | 153 | return newChildElement(strings.ToLower(node.Data), childs) 154 | } 155 | 156 | func flatten(e *element) *element { 157 | if e == nil { 158 | return e 159 | } 160 | 161 | if e.isHeader() { 162 | e.tag = "~header" 163 | return e 164 | } 165 | 166 | if e.isLinkList() { 167 | e.tag = "~linklist" 168 | return e 169 | } 170 | 171 | if e.isLinkBlob() { 172 | e.tag = "~linkblob" 173 | return e 174 | } 175 | 176 | if e.childs == nil { 177 | e.tag = "~textblock" 178 | return e 179 | } 180 | 181 | childs := make([]*element, 0, len(e.childs)) 182 | 183 | for i := range e.childs { 184 | fchild := flatten(e.childs[i]) 185 | if fchild.collapse { 186 | for _, subchild := range fchild.childs { 187 | childs = append(childs, subchild) 188 | } 189 | } else { 190 | childs = append(childs, fchild) 191 | } 192 | } 193 | 194 | e.tag = "~transient" 195 | e.childs = childs 196 | e.collapse = true 197 | return e 198 | } 199 | 200 | func clean(e *element) *element { 201 | if e == nil || e.childs == nil { 202 | return e 203 | } 204 | 205 | for i := range e.childs { 206 | if e.childs[i] == nil { 207 | continue 208 | } 209 | 210 | switch e.childs[i].tag { 211 | case "~linkblob": 212 | fallthrough 213 | case "~linklist": 214 | e.childs[i] = nil 215 | 216 | case "~textblock": 217 | if len(e.childs[i].content) <= 15 || strings.Index(e.childs[i].content, " ") < 0 { 218 | e.childs[i] = nil 219 | } 220 | } 221 | } 222 | 223 | for i := range e.childs { 224 | if e.childs[i] == nil { 225 | continue 226 | } 227 | 228 | var next *element 229 | if i+1 < len(e.childs) { 230 | next = e.childs[i+1] 231 | } 232 | 233 | var prev *element 234 | if i-1 >= 0 { 235 | prev = e.childs[i-1] 236 | } 237 | 238 | if e.tag == "~header" { 239 | if !next.okText() { 240 | e.childs[i] = nil 241 | } 242 | } else if !e.childs[i].okText() { 243 | if !next.okText() && !prev.okText() { 244 | e.childs[i] = nil 245 | } 246 | } 247 | } 248 | 249 | return e 250 | } 251 | 252 | func makeIndent(depth int) string { 253 | b := make([]byte, depth*3) 254 | for i := range b { 255 | b[i] = ' ' 256 | } 257 | return string(b) 258 | } 259 | -------------------------------------------------------------------------------- /util.go: -------------------------------------------------------------------------------- 1 | package sandblast 2 | 3 | import ( 4 | "golang.org/x/net/html" 5 | "strings" 6 | "unicode" 7 | ) 8 | 9 | func collapseWhitespace(in []rune) []rune { 10 | var b []rune = make([]rune, len(in)) 11 | d := 0 12 | spaceSeen := true 13 | for s := range in { 14 | if spaceSeen { 15 | if !unicode.IsSpace(in[s]) { 16 | spaceSeen = false 17 | b[d] = in[s] 18 | d++ 19 | } 20 | } else { 21 | if unicode.IsSpace(in[s]) { 22 | b[d] = ' ' 23 | d++ 24 | spaceSeen = true 25 | } else { 26 | b[d] = in[s] 27 | d++ 28 | } 29 | } 30 | } 31 | return b[:d] 32 | } 33 | 34 | type cleanAsciiArtStateFn func(int) cleanAsciiArtStateFn 35 | 36 | func cleanAsciiArt(in []rune) []rune { 37 | b := make([]rune, 0, len(in)) 38 | start := 0 39 | count := 0 40 | 41 | var baseSpace, baseNormal, maybeAsciiArt cleanAsciiArtStateFn 42 | 43 | isAsciiArt := func(r rune) bool { 44 | return !unicode.In(r, unicode.Ll, unicode.Lu, unicode.Lt, unicode.Lm, unicode.Lo, unicode.Nd, unicode.Nl, unicode.No) 45 | } 46 | 47 | baseSpace = func(s int) cleanAsciiArtStateFn { 48 | //println("baseSpace <", string(in[s]), ">", isAsciiArt(in[s])) 49 | if unicode.IsSpace(in[s]) { 50 | b = append(b, in[s]) 51 | return baseSpace 52 | } else if isAsciiArt(in[s]) { 53 | start = s 54 | count = 1 55 | return maybeAsciiArt 56 | } else { 57 | b = append(b, in[s]) 58 | return baseNormal 59 | } 60 | } 61 | 62 | baseNormal = func(s int) cleanAsciiArtStateFn { 63 | //println("baseNormal <", string(in[s]), ">",) 64 | b = append(b, in[s]) 65 | if unicode.IsSpace(in[s]) { 66 | return baseSpace 67 | } else { 68 | return baseNormal 69 | } 70 | } 71 | 72 | maybeAsciiArt = func(s int) cleanAsciiArtStateFn { 73 | //println("maybeAsciiArt <", string(in[s]), ">") 74 | if isAsciiArt(in[s]) && !unicode.IsSpace(in[s]) { 75 | count++ 76 | return maybeAsciiArt 77 | } else if unicode.IsSpace(in[s]) { 78 | //println("exiting", count) 79 | if count > 3 { 80 | b = append(b, in[s]) 81 | } else { 82 | b = append(b, in[start:s+1]...) 83 | } 84 | return baseSpace 85 | } else { 86 | //println("exiting (to normal)") 87 | b = append(b, in[start:s+1]...) 88 | return baseNormal 89 | } 90 | } 91 | 92 | state := baseSpace 93 | 94 | for s := range in { 95 | state = state(s) 96 | } 97 | 98 | return b 99 | } 100 | 101 | func cleanControl(in []rune) []rune { 102 | var b []rune = nil 103 | 104 | for s := range in { 105 | if unicode.IsControl(in[s]) && !unicode.IsSpace(in[s]) { 106 | if b == nil { 107 | b = make([]rune, 0, len(in)) 108 | b = append(b, in[:s]...) 109 | } 110 | } else { 111 | if b != nil { 112 | b = append(b, in[s]) 113 | } 114 | } 115 | s++ 116 | } 117 | 118 | if b == nil { 119 | return in 120 | } 121 | return b 122 | } 123 | 124 | func getAttribute(node *html.Node, name string) string { 125 | for i := range node.Attr { 126 | if strings.ToLower(node.Attr[i].Key) == "href" { 127 | return node.Attr[i].Val 128 | } 129 | } 130 | return "" 131 | } 132 | -------------------------------------------------------------------------------- /util_test.go: -------------------------------------------------------------------------------- 1 | package sandblast 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestCleanControl(t *testing.T) { 8 | tf := func(in, target string) { 9 | out := string(cleanControl([]rune(in))) 10 | if out != target { 11 | t.Errorf("Error cleaning control character on <%s>\n\tgot <%s>\n\texpected <%s>\n", in, out, target) 12 | } 13 | } 14 | tf("test\ntest\002 test", "test\ntest test") 15 | } 16 | 17 | func TestCollapseWhitespace(t *testing.T) { 18 | tf := func(in, target string) { 19 | out := string(collapseWhitespace([]rune(in))) 20 | if out != target { 21 | t.Errorf("Error collapsing whitespace on <%s>\n\tgot <%s>\n\texpected <%s>\n", in, out, target) 22 | } 23 | } 24 | tf(" \n\ttest\ntest\ttest ", "test test test ") 25 | } 26 | 27 | func TestCleanAsciiArt(t *testing.T) { 28 | tf := func(in, target string) { 29 | out := string(cleanAsciiArt([]rune(in))) 30 | if out != target { 31 | t.Errorf("Error cleaning ASCII art on <%s>\n\tgot <%s>\n\texpected <%s>\n", in, out, target) 32 | } 33 | } 34 | tf("test ===== test === test", "test test === test") 35 | tf("saw this exact same trick performed in a public bar OVER FORTY YEARS AGO. pretty good then; old hat now.", "saw this exact same trick performed in a public bar OVER FORTY YEARS AGO. pretty good then; old hat now.") 36 | } 37 | --------------------------------------------------------------------------------