├── README.md ├── articletext.go ├── exported.go ├── go.mod ├── go.sum ├── investigate.go ├── selectionpath.go └── textextract.go /README.md: -------------------------------------------------------------------------------- 1 | ## ArticleText 2 | 3 | Golang package with a function to extract useful text from an HTML document. 4 | 5 | The function analyses the HTML code and drops everything related to navigation, advertising, etc. 6 | It extracts only the useful content of a document: the text of its central element. 7 | 8 | ### Installation 9 | 10 | go get github.com/gelembjuk/articletext 11 | 12 | ### Manual 13 | 14 | There are 3 types of exported functions. 15 | 16 | 1. Functions to get the text from an HTML document, from 3 different types of sources 17 | 18 | #### GetArticleText(input io.Reader) 19 | 20 | #### GetArticleTextFromFile(filepath string) 21 | 22 | #### GetArticleTextFromUrl(url string) 23 | 24 | 2. Functions to return a path (signature) for a text location block. The path is a JQuery style selector - tags with classes. 25 | 26 | There are also 3 functions, one for each type of input source 27 | 28 | #### GetArticleSignature(input io.Reader) 29 | 30 | #### GetArticleSignatureFromFile(filepath string) 31 | 32 | #### GetArticleSignatureFromUrl(url string) 33 | 34 | The result of these functions is something like "body div div div.content div.article div.text". This path can then be used to get the text with one of the following functions 35 | 36 | 3. Functions to get the text from an HTML document using a path (signature) in JQuery style. 
A path can be get by using one of functions from blcok 2, or prepared manually 37 | 38 | #### GetArticleTextByPath(input io.Reader, path string) 39 | 40 | #### GetArticleTextFromFileByPath(filepath string, path string) 41 | 42 | #### GetArticleTextFromUrlByPath(url string, path string) 43 | 44 | ### Example 45 | 46 | ``` 47 | package main 48 | 49 | import ( 50 | "fmt" 51 | "os" 52 | "github.com/gelembjuk/articletext" 53 | ) 54 | 55 | func main() { 56 | 57 | url := os.Args[1] 58 | text, err := articletext.GetArticleTextFromUrl(url) 59 | 60 | fmt.Println(text) 61 | } 62 | ``` 63 | 64 | ### Author 65 | 66 | Roman Gelembjuk (@gelembjuk) 67 | 68 | -------------------------------------------------------------------------------- /articletext.go: -------------------------------------------------------------------------------- 1 | package articletext 2 | 3 | /* 4 | The package is used to extracts article text from a HTML page 5 | It drops all additional elements from a html page (navigation, advertizing etc) 6 | 7 | This file containes internal functions and a logic 8 | 9 | Author: Roman Gelembjuk 10 | */ 11 | 12 | import ( 13 | "sort" 14 | 15 | "github.com/PuerkitoBio/goquery" 16 | "github.com/jaytaylor/html2text" 17 | ) 18 | 19 | // liist of tags to ignore, as they dones't contain useful data 20 | var skiphtmltags []string = []string{"script", "style", "noscript", "head", 21 | "header", "footer", "nav"} 22 | 23 | func init() { 24 | // to make lookup faster 25 | sort.Strings(skiphtmltags) 26 | } 27 | 28 | // the function prepares a document for analysing 29 | // cleans a DOM object and starts analysing 30 | func processArticle(doc *goquery.Document, responsetype int) (string, error) { 31 | 32 | if doc == nil { 33 | return "", nil 34 | } 35 | 36 | // get clone of a selection. Clone is neede,d because we willdo some transformations 37 | 38 | docselection := doc.Selection.Clone() 39 | 40 | // preprocess. 
Remove all tags that are not useful and can make parsing wrong 41 | cleanDocument(docselection) 42 | 43 | // get a selection that contains a text of a page (only primary or article text) 44 | selection := getPrimarySelection(docselection) 45 | 46 | if responsetype == 2 { 47 | // return parent node path and attributes 48 | return getSelectionSignature(selection), nil 49 | } 50 | 51 | return getTextFromHtml(selection), nil 52 | } 53 | 54 | // clean HTML document. Removes all tags that are not useful 55 | func cleanDocument(s *goquery.Selection) *goquery.Selection { 56 | tagname := goquery.NodeName(s) 57 | 58 | if checkTagsToSkip(tagname) { 59 | s.Remove() 60 | return nil 61 | } 62 | // for each child node check if to remove or not 63 | s.Children().Each(func(i int, sec *goquery.Selection) { 64 | tagname := goquery.NodeName(sec) 65 | 66 | if checkTagsToSkip(tagname) { 67 | 68 | sec.Remove() 69 | 70 | return 71 | } 72 | // go deeper recursively 73 | cleanDocument(sec) 74 | }) 75 | 76 | return s 77 | } 78 | 79 | // convert HTML to text from a DOM node 80 | // we ignore errors in this function 81 | func getTextFromHtml(s *goquery.Selection) string { 82 | // gethtml from a node 83 | html, _ := s.Html() 84 | // convert to text 85 | text, err := html2text.FromString(html) 86 | 87 | if err != nil { 88 | return "" 89 | } 90 | 91 | return text 92 | } 93 | 94 | // check if aword (string) is in an array of tags 95 | // we have list of tags to ignore some not useful tags 96 | func checkTagsToSkip(tag string) bool { 97 | for _, v := range skiphtmltags { 98 | if v == tag { 99 | return true 100 | } 101 | } 102 | return false 103 | } 104 | -------------------------------------------------------------------------------- /exported.go: -------------------------------------------------------------------------------- 1 | package articletext 2 | 3 | /* 4 | The package is used extracts article text from a HTML page 5 | It drops all additional elements from a html page (navigation, 
advertizing etc) 6 | 7 | This file contains exported functiosn of a package. It is entry point of the package 8 | 9 | Author: Roman Gelembjuk 10 | */ 11 | 12 | import ( 13 | "io" 14 | "log" 15 | "os" 16 | 17 | "github.com/PuerkitoBio/goquery" 18 | ) 19 | 20 | // extracts useful text from a html file 21 | func GetArticleTextFromFile(filepath string) (string, error) { 22 | // create reader from file 23 | reader, err := os.Open(filepath) 24 | 25 | if err != nil { 26 | log.Fatal(err) 27 | return "", err 28 | } 29 | 30 | return GetArticleText(reader) 31 | } 32 | 33 | // extracts useful text from a html page presented by an url 34 | func GetArticleTextFromUrl(url string) (string, error) { 35 | doc, err := goquery.NewDocument(url) 36 | 37 | if err != nil { 38 | log.Fatal(err) 39 | return "", err 40 | } 41 | 42 | return processArticle(doc, 1) 43 | } 44 | 45 | // extracts useful text from a html document presented as a Reader object 46 | func GetArticleText(input io.Reader) (string, error) { 47 | 48 | doc, err := goquery.NewDocumentFromReader(input) 49 | 50 | if err != nil { 51 | log.Fatal(err) 52 | return "", err 53 | } 54 | 55 | return processArticle(doc, 1) 56 | } 57 | 58 | // extracts useful text from a html file 59 | // returns a DOM signature 60 | func GetArticleSignatureFromFile(filepath string) (string, error) { 61 | // create reader from file 62 | reader, err := os.Open(filepath) 63 | 64 | if err != nil { 65 | log.Fatal(err) 66 | return "", err 67 | } 68 | 69 | return GetArticleSignature(reader) 70 | } 71 | 72 | // extracts useful text from a html page presented by an url 73 | func GetArticleSignatureFromUrl(url string) (string, error) { 74 | doc, err := goquery.NewDocument(url) 75 | 76 | if err != nil { 77 | log.Fatal(err) 78 | return "", err 79 | } 80 | 81 | return processArticle(doc, 2) 82 | } 83 | 84 | // extracts useful text from a html document presented as a Reader object 85 | func GetArticleSignature(input io.Reader) (string, error) { 86 | 87 | doc, err := 
goquery.NewDocumentFromReader(input) 88 | 89 | if err != nil { 90 | log.Fatal(err) 91 | return "", err 92 | } 93 | 94 | return processArticle(doc, 2) 95 | } 96 | 97 | // extracts useful text from a html file 98 | func GetArticleTextFromFileByPath(filepath string, path string) (string, error) { 99 | // create reader from file 100 | reader, err := os.Open(filepath) 101 | 102 | if err != nil { 103 | log.Fatal(err) 104 | return "", err 105 | } 106 | 107 | return GetArticleTextByPath(reader, path) 108 | } 109 | 110 | // extracts useful text from a html page presented by an url 111 | func GetArticleTextFromUrlByPath(url string, path string) (string, error) { 112 | doc, err := goquery.NewDocument(url) 113 | 114 | if err != nil { 115 | log.Fatal(err) 116 | return "", err 117 | } 118 | 119 | return getTextByPathFromDocument(doc, path) 120 | } 121 | 122 | // extracts useful text from a html document presented as a Reader object 123 | func GetArticleTextByPath(input io.Reader, path string) (string, error) { 124 | 125 | doc, err := goquery.NewDocumentFromReader(input) 126 | 127 | if err != nil { 128 | log.Fatal(err) 129 | return "", err 130 | } 131 | 132 | return getTextByPathFromDocument(doc, path) 133 | } 134 | 135 | // the functions finds a path (selector, signature) for each url and returns one that was found most often 136 | func GetOptimalArticleSignatureByUrls(urls []string) (string, error) { 137 | 138 | return getOptimalArticleSignatureByUrls(urls) 139 | } 140 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/gelembjuk/articletext 2 | 3 | go 1.17 4 | 5 | require ( 6 | github.com/PuerkitoBio/goquery v1.8.1 7 | github.com/jaytaylor/html2text v0.0.0-20230321000545-74c2419ad056 8 | gopkg.in/neurosnap/sentences.v1 v1.0.7 9 | ) 10 | 11 | require ( 12 | github.com/andybalholm/cascadia v1.3.1 // indirect 13 | 
github.com/mattn/go-runewidth v0.0.9 // indirect 14 | github.com/neurosnap/sentences v1.1.2 // indirect 15 | github.com/olekukonko/tablewriter v0.0.5 // indirect 16 | github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf // indirect 17 | golang.org/x/net v0.7.0 // indirect 18 | ) 19 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/PuerkitoBio/goquery v1.8.1 h1:uQxhNlArOIdbrH1tr0UXwdVFgDcZDrZVdcpygAcwmWM= 2 | github.com/PuerkitoBio/goquery v1.8.1/go.mod h1:Q8ICL1kNUJ2sXGoAhPGUdYDJvgQgHzJsnnd3H7Ho5jQ= 3 | github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x004T2c= 4 | github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA= 5 | github.com/jaytaylor/html2text v0.0.0-20230321000545-74c2419ad056 h1:iCHtR9CQyktQ5+f3dMVZfwD2KWJUgm7M0gdL9NGr8KA= 6 | github.com/jaytaylor/html2text v0.0.0-20230321000545-74c2419ad056/go.mod h1:CVKlgaMiht+LXvHG173ujK6JUhZXKb2u/BQtjPDIvyk= 7 | github.com/mattn/go-runewidth v0.0.9 h1:Lm995f3rfxdpd6TSmuVCHVb/QhupuXlYr8sCI/QdE+0= 8 | github.com/mattn/go-runewidth v0.0.9/go.mod h1:H031xJmbD/WCDINGzjvQ9THkh0rPKHF+m2gUSrubnMI= 9 | github.com/neurosnap/sentences v1.1.2 h1:iphYOzx/XckXeBiLIUBkPu2EKMJ+6jDbz/sLJZ7ZoUw= 10 | github.com/neurosnap/sentences v1.1.2/go.mod h1:/pwU4E9XNL21ygMIkOIllv/SMy2ujHwpf8GQPu1YPbQ= 11 | github.com/olekukonko/tablewriter v0.0.5 h1:P2Ga83D34wi1o9J6Wh1mRuqd4mF/x/lgBS7N7AbDhec= 12 | github.com/olekukonko/tablewriter v0.0.5/go.mod h1:hPp6KlRPjbx+hW8ykQs1w3UBbZlj6HuIJcUGPhkA7kY= 13 | github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf h1:pvbZ0lM0XWPBqUKqFU8cmavspvIl9nulOYwdy6IFRRo= 14 | github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf/go.mod h1:RJID2RhlZKId02nZ62WenDCkgHFerpIOmW0iT7GKmXM= 15 | github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= 16 | golang.org/x/crypto 
v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= 17 | golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= 18 | golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= 19 | golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= 20 | golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= 21 | golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= 22 | golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= 23 | golang.org/x/net v0.7.0 h1:rJrUqqhjsgNp7KqAIc25s9pZnjU7TUcSY7HcVZjdn1g= 24 | golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= 25 | golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 26 | golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 27 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 28 | golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 29 | golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 30 | golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 31 | golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 32 | golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 33 | golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 34 | golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod 
h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= 35 | golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= 36 | golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= 37 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= 38 | golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= 39 | golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= 40 | golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= 41 | golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= 42 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 43 | golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= 44 | golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= 45 | golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 46 | gopkg.in/neurosnap/sentences.v1 v1.0.7 h1:gpTUYnqthem4+o8kyTLiYIB05W+IvdQFYR29erfe8uU= 47 | gopkg.in/neurosnap/sentences.v1 v1.0.7/go.mod h1:YlK+SN+fLQZj+kY3r8DkGDhDr91+S3JmTb5LSxFRQo0= 48 | -------------------------------------------------------------------------------- /investigate.go: -------------------------------------------------------------------------------- 1 | package articletext 2 | 3 | /* 4 | This file contains a function to investigate a list of urls and chooose optimal 5 | path (selector) to use later for quick extracting a text from HTML document 6 | 7 | Author: Roman Gelembjuk 8 | */ 9 | 10 | import ( 11 | "errors" 12 | ) 13 | 14 | // the functions finds a path (selector, signature) for each url and returns one that was found most often 15 | func getOptimalArticleSignatureByUrls(urls []string) (string, error) { 16 | 17 | if len(urls) < 1 { 18 | return "", 
errors.New("No urls provided") 19 | } 20 | 21 | var paths map[string]int 22 | paths = make(map[string]int) 23 | 24 | for _, url := range urls { 25 | 26 | path, err := GetArticleSignatureFromUrl(url) 27 | 28 | if err != nil { 29 | return "", err 30 | } 31 | 32 | if count, ok := paths[path]; ok { 33 | paths[path] = count + 1 34 | } else { 35 | paths[path] = 1 36 | } 37 | } 38 | 39 | // find what path has maximum of occurences 40 | maxpath := "" 41 | maxval := 0 42 | 43 | for k, v := range paths { 44 | if v > maxval { 45 | maxval = v 46 | maxpath = k 47 | } 48 | } 49 | 50 | return maxpath, nil 51 | } 52 | -------------------------------------------------------------------------------- /selectionpath.go: -------------------------------------------------------------------------------- 1 | package articletext 2 | 3 | import ( 4 | "strings" 5 | 6 | "github.com/PuerkitoBio/goquery" 7 | ) 8 | 9 | /* 10 | Extract text by DOM path, aka jquery style 11 | */ 12 | func getTextByPathFromDocument(doc *goquery.Document, path string) (string, error) { 13 | sel := doc.Find(path) 14 | 15 | if sel != nil { 16 | 17 | return getTextFromHtml(sel), nil 18 | } 19 | 20 | return "nothing", nil 21 | } 22 | 23 | // this function returns some specific signature of a selection 24 | // so it can be easy found to get data quickly next time 25 | func getSelectionSignature(s *goquery.Selection) string { 26 | var signature string 27 | 28 | tag, _ := goquery.OuterHtml(s) 29 | 30 | pos := strings.Index(tag, ">") 31 | 32 | if pos > -1 { 33 | tag = tag[1:pos] 34 | } else { 35 | return "" 36 | } 37 | 38 | signature = convertTagToJqueryFormat(tag, s) 39 | 40 | s.Parents().Each(func(i int, sec *goquery.Selection) { 41 | ohtml, _ := goquery.OuterHtml(sec) 42 | 43 | pos := strings.Index(ohtml, ">") 44 | 45 | if pos > -1 { 46 | ohtml = ohtml[1:pos] 47 | } 48 | 49 | tag := convertTagToJqueryFormat(ohtml, sec) 50 | 51 | signature = tag + " " + signature 52 | }) 53 | 54 | return signature 55 | } 56 | 57 | func 
convertTagToJqueryFormat(tag string, s *goquery.Selection) string { 58 | tagitself := tag 59 | 60 | pos := strings.Index(tag, " ") 61 | 62 | if pos > -1 { 63 | tagitself = tag[0:pos] 64 | } else { 65 | 66 | return tag 67 | } 68 | 69 | class, found := s.Attr("class") 70 | 71 | if found && class != "" { 72 | pos := strings.Index(class, " ") 73 | // leave only a first class from a list 74 | if pos > -1 { 75 | class = class[0:pos] 76 | } 77 | 78 | tagitself = tagitself + "." + class 79 | } 80 | 81 | return tagitself 82 | } 83 | -------------------------------------------------------------------------------- /textextract.go: -------------------------------------------------------------------------------- 1 | package articletext 2 | 3 | /* 4 | The file contains a logic of a package to find a DOM node containing majority of a text 5 | in HTML document 6 | 7 | Author: Roman Gelembjuk 8 | */ 9 | 10 | import ( 11 | "math" 12 | "regexp" 13 | "strconv" 14 | "strings" 15 | "unicode/utf8" 16 | 17 | "github.com/PuerkitoBio/goquery" 18 | "gopkg.in/neurosnap/sentences.v1" 19 | "gopkg.in/neurosnap/sentences.v1/data" 20 | ) 21 | 22 | type TextDescription struct { 23 | CountSentences int 24 | AverageWords int 25 | CountLongSentences int 26 | CountGoodSentences int 27 | CountCorrectSentences int 28 | } 29 | 30 | var tokenizer *sentences.DefaultSentenceTokenizer 31 | 32 | func init() { 33 | // prepare tokenizer 34 | b, _ := data.Asset("data/english.json") 35 | 36 | // load the training data 37 | training, _ := sentences.LoadTraining(b) 38 | 39 | // create the default sentence tokenizer 40 | tokenizer = sentences.NewSentenceTokenizer(training) 41 | } 42 | 43 | func getPrimarySelection(s *goquery.Selection) *goquery.Selection { 44 | 45 | // prepare a selection for search. 
Add some descriptions for DOM nodes 46 | // to find most correct node with a text 47 | describeDocumentNode(s) 48 | 49 | // now find a node with a text and return it 50 | return findSelectionWithPrimaryText(s) 51 | } 52 | 53 | /* 54 | * This is the core function. It checks a selection object and finds if this is a text node 55 | * or it is needed to go deeper , inside a node that has most of text 56 | */ 57 | func findSelectionWithPrimaryText(s *goquery.Selection) *goquery.Selection { 58 | 59 | // if no children then return a text from this node 60 | if s.Children().Length() == 0 { 61 | return s 62 | } 63 | 64 | // variable to find a node with longest text inside it 65 | sort_by_count_sentences := 0 66 | // a node with longest text inside it 67 | var sort_by_text_node *goquery.Selection = nil 68 | 69 | // keep count of nodes containing more 2 sentences 70 | count_of_nodes_with_sentences := 0 71 | 72 | max_count_of_correct_sentences := 0 73 | 74 | // calcuate count of real symbols 75 | node_full_text_len := utf8.RuneCountInString(s.Text()) 76 | 77 | top_total_count_of_correct_sentences := getNumbericAttribute(s, "totalcountofcorrectsentences") 78 | 79 | // all subnodes lengths 80 | tlengths := []int{} 81 | densityes := []int{} 82 | 83 | s.Children().Each(func(i int, sec *goquery.Selection) { 84 | totalcountofcorrectsentences := getNumbericAttribute(sec, "totalcountofcorrectsentences") 85 | 86 | if totalcountofcorrectsentences > 1 { 87 | count_of_nodes_with_sentences++ 88 | 89 | if totalcountofcorrectsentences > max_count_of_correct_sentences { 90 | max_count_of_correct_sentences = totalcountofcorrectsentences 91 | } 92 | } 93 | 94 | // node text length 95 | tlen := utf8.RuneCountInString(sec.Text()) 96 | 97 | html, _ := sec.Html() 98 | hlen := utf8.RuneCountInString(html) 99 | 100 | if tlen == 0 { 101 | // process next subnode 102 | return 103 | } 104 | 105 | tlengths = append(tlengths, tlen) 106 | 107 | density := (hlen / tlen) 108 | 109 | densityes = 
append(densityes, density) 110 | 111 | // check if this block is better then previous 112 | // choose better block only if previous is empty or 113 | // has less then 10 real sentences 114 | if totalcountofcorrectsentences > sort_by_count_sentences && sort_by_count_sentences < 10 { 115 | 116 | sort_by_count_sentences = totalcountofcorrectsentences 117 | sort_by_text_node = sec 118 | } 119 | 120 | }) 121 | 122 | // if any nide with a text was found 123 | if sort_by_count_sentences > 0 { 124 | // calculate mean deviation 125 | lvar := getMeanDeviation(tlengths) 126 | 127 | // get relative value of a mean deviation agains full text length in a node 128 | lvarproc := (100 * lvar) / float64(node_full_text_len) 129 | 130 | // during tests we found that if this value is less 5 131 | // the a node is what we are looking for 132 | // it is the node with "main" text of a page 133 | if lvarproc < 15 && len(tlengths) > 3 || 134 | (count_of_nodes_with_sentences > 2 && 135 | float32(max_count_of_correct_sentences) < float32(top_total_count_of_correct_sentences)*0.8) { 136 | 137 | // we found that a text is equally distributed between subnodes 138 | // no need to go deeper 139 | 140 | return s 141 | } 142 | // go deeper inside a node with most of text 143 | 144 | return findSelectionWithPrimaryText(sort_by_text_node) 145 | } 146 | // no subnodes found. 
return a node itself 147 | return s 148 | } 149 | 150 | // describe a text inside a node and add description as pseudo attributes 151 | func describeDocumentNode(s *goquery.Selection) *goquery.Selection { 152 | var totalcountofgoodsentences int 153 | var totalcountofcorrectsentences int 154 | var maxcountofflatsentences int 155 | 156 | countchildren := s.Children().Length() 157 | 158 | var sd TextDescription 159 | 160 | if countchildren > 0 { 161 | // for each child node check if to remove or not 162 | s.Children().Each(func(i int, sec *goquery.Selection) { 163 | 164 | // go deeper recursively 165 | describeDocumentNode(sec) 166 | 167 | // aggregate data to set to a node 168 | 169 | totalcountofgoodsentences += getNumbericAttribute(sec, "totalcountofgoodsentences") 170 | totalcountofcorrectsentences += getNumbericAttribute(sec, "totalcountofcorrectsentences") 171 | 172 | countsentences := getNumbericAttribute(sec, "maxcountofflatsentences") 173 | 174 | if countsentences > maxcountofflatsentences { 175 | maxcountofflatsentences = countsentences 176 | } 177 | 178 | }) 179 | 180 | // describe sentences in this html tag only, drop child nodes 181 | secclone := getSelectionWihoutChildren(s) 182 | 183 | sd = describeSentences(secclone) 184 | 185 | totalcountofgoodsentences += sd.CountGoodSentences 186 | totalcountofcorrectsentences += sd.CountCorrectSentences 187 | 188 | if sd.CountGoodSentences > maxcountofflatsentences { 189 | maxcountofflatsentences = sd.CountGoodSentences 190 | } 191 | 192 | } else { 193 | // no child nodes 194 | //fmt.Println(s.Text()) 195 | 196 | sd = describeSentences(s) 197 | totalcountofgoodsentences = sd.CountGoodSentences 198 | maxcountofflatsentences = sd.CountGoodSentences 199 | totalcountofcorrectsentences = sd.CountCorrectSentences 200 | } 201 | //fmt.Printf("set totalcountofgoodsentences ") 202 | // set attributes for the node 203 | s.SetAttr("countsentences", strconv.Itoa(sd.CountSentences)) 204 | s.SetAttr("averagewords", 
strconv.Itoa(sd.AverageWords)) 205 | s.SetAttr("countgoodsentences", strconv.Itoa(sd.CountGoodSentences)) 206 | s.SetAttr("countlongsentences", strconv.Itoa(sd.CountLongSentences)) 207 | s.SetAttr("totalcountofgoodsentences", strconv.Itoa(totalcountofgoodsentences)) 208 | s.SetAttr("totalcountofcorrectsentences", strconv.Itoa(totalcountofcorrectsentences)) 209 | s.SetAttr("maxcountofflatsentences", strconv.Itoa(maxcountofflatsentences)) 210 | 211 | return s 212 | } 213 | 214 | /* 215 | * 216 | */ 217 | func describeSentences(s *goquery.Selection) TextDescription { 218 | var d TextDescription 219 | 220 | var text string 221 | // get text of this node and then split for sentences 222 | if s.Children().Length() > 0 { 223 | text = getTextFromHtml(s) 224 | } else { 225 | text = s.Text() 226 | } 227 | 228 | sentences := tokenizer.Tokenize(text) 229 | 230 | d.CountSentences = len(sentences) 231 | //fmt.Println("==============================================") 232 | for _, s := range sentences { 233 | sentence := s.Text 234 | 235 | if len(sentence) == 0 { 236 | continue 237 | } 238 | 239 | c := len(get_words_from(sentence)) 240 | //fmt.Println(sentence) 241 | 242 | d.AverageWords += c 243 | 244 | if c > 3 { 245 | // presume normal sentence usually has more 3 words 246 | d.CountLongSentences++ 247 | 248 | if c < 25 { 249 | // but a sentence should not have nore 25 words. 
We will not 250 | // consider such sentence as a good one 251 | d.CountGoodSentences++ 252 | 253 | } 254 | lastsymbol := sentence[len(sentence)-1:] 255 | 256 | if strings.ContainsAny(lastsymbol, ".?!") { 257 | d.CountCorrectSentences++ 258 | } 259 | } 260 | 261 | } 262 | 263 | if d.CountSentences > 0 { 264 | d.AverageWords = int(d.AverageWords / d.CountSentences) 265 | } 266 | 267 | return d 268 | } 269 | 270 | func get_words_from(text string) []string { 271 | words := regexp.MustCompile("[^\\s]+") 272 | return words.FindAllString(text, -1) 273 | } 274 | 275 | // The function to calculate a Mean Deviation for a list of integer values 276 | func getMeanDeviation(list []int) float64 { 277 | 278 | if len(list) < 1 { 279 | return 0.0 280 | } 281 | 282 | sum := 0 283 | 284 | for i := range list { 285 | sum += list[i] 286 | } 287 | 288 | // calculate arithmetic mean 289 | avg := float64(sum / len(list)) 290 | 291 | number1 := 0.0 292 | 293 | for i := range list { 294 | number1 += math.Abs(float64(list[i]) - avg) 295 | } 296 | // calculate mean deviation 297 | meandeviation := number1 / float64(len(list)) 298 | 299 | return meandeviation 300 | } 301 | 302 | // this function returns text in a selection with ignoring child nodes 303 | // for "xxx YYY" result wil be "xxx " 304 | 305 | func getSelectionWihoutChildren(s *goquery.Selection) *goquery.Selection { 306 | clone := s.Clone() 307 | 308 | // remove all child nodes in this selection 309 | clone.Children().Each(func(i int, sec *goquery.Selection) { 310 | sec.Remove() 311 | }) 312 | 313 | return clone 314 | } 315 | 316 | func getNumbericAttribute(s *goquery.Selection, attr string) int { 317 | a, f := s.Attr(attr) 318 | 319 | if f { 320 | ai, _ := strconv.Atoi(a) 321 | return ai 322 | } 323 | return 0 324 | } 325 | --------------------------------------------------------------------------------