├── .gitignore ├── LICENSE-domdistiller.txt ├── internal ├── re2go │ ├── base.go │ ├── base.re │ ├── word-counter.re │ ├── document-title.re │ ├── document-title.go │ ├── dom-converter.re │ ├── domutil.re │ └── terminating-blocks.re ├── pagination │ ├── info │ │ ├── utils.go │ │ ├── constant.go │ │ ├── page-info.go │ │ └── linear-formula.go │ ├── parser │ │ ├── constant.go │ │ └── param-detector.go │ ├── pattern │ │ ├── utils.go │ │ ├── page-pattern.go │ │ └── constant.go │ └── constant.go ├── tableclass │ ├── table-type.go │ ├── constant.go │ └── type-reason.go ├── converter │ └── utils.go ├── markup │ ├── schemaorg │ │ ├── thing-item-unsupported.go │ │ ├── thing-item-org.go │ │ ├── thing-item-person.go │ │ ├── thing-item-image.go │ │ └── constant.go │ ├── opengraph │ │ ├── prefixes.go │ │ └── constant.go │ └── accessor.go ├── filter │ ├── heuristic │ │ ├── constant_test.go │ │ ├── list-at-end.go │ │ ├── heading-fusion.go │ │ ├── expand-title.go │ │ └── large-block-around-level.go │ ├── docfilter │ │ ├── scorer │ │ │ ├── image.go │ │ │ ├── image-ratio.go │ │ │ ├── image-area.go │ │ │ ├── image-has-figure_test.go │ │ │ ├── image-has-figure.go │ │ │ ├── image-dom-distance_test.go │ │ │ └── image-dom-distance.go │ │ ├── relevant-elements.go │ │ └── nested-element.go │ ├── filter.go │ ├── simple │ │ ├── label-to-boilerplate.go │ │ └── boilerplate-block.go │ └── english │ │ ├── terminating-blocks.go │ │ ├── terminating-blocks_test.go │ │ └── num-words.go ├── logutil │ └── logger.go ├── testutil │ ├── text-document.go │ ├── text-block-builder.go │ ├── page-param-content-info.go │ ├── html_test.go │ ├── text-document-builder.go │ ├── fake-document-builder.go │ ├── text-builder.go │ └── document-builder.go ├── extractor │ └── embed │ │ ├── embed.go │ │ ├── constant.go │ │ └── embed-vimeo_test.go ├── webdoc │ ├── element.go │ ├── constant.go │ ├── figure.go │ ├── tag.go │ ├── tag_test.go │ ├── table.go │ ├── text-document_test.go │ ├── video.go │ ├── embed.go │ ├── 
text-document.go │ ├── table_test.go │ ├── image.go │ ├── element-action.go │ └── image_test.go ├── domutil │ ├── walker.go │ ├── tree-clone.go │ └── tree-clone_test.go └── label │ └── label.go ├── Makefile ├── example ├── from-file.go └── from-url.go ├── LICENSE-boilerpipe.txt ├── NOTICE-boilerpipe.txt ├── go.mod ├── LICENSE ├── IMPROVEMENTS.md ├── data ├── timing-info.go └── data.go └── logger.go /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/ -------------------------------------------------------------------------------- /LICENSE-domdistiller.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /internal/re2go/base.go: -------------------------------------------------------------------------------- 1 | // Code generated by re2c 3.1, DO NOT EDIT. 2 | package re2go 3 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | generate: 2 | @for name in internal/re2go/*.re; do \ 3 | RE_IN=$$name; \ 4 | RE_OUT=$$(echo $$name | sed 's/\.re/.go/'); \ 5 | re2go -W -F --input-encoding utf8 --utf8 --no-generation-date -i $$RE_IN -o $$RE_OUT; \ 6 | gofmt -w $$RE_OUT; \ 7 | done 8 | 9 | test: generate 10 | go test -timeout 30s ./... 
-------------------------------------------------------------------------------- /example/from-file.go: -------------------------------------------------------------------------------- 1 | // +build ignore 2 | 3 | package main 4 | 5 | import ( 6 | "fmt" 7 | 8 | "github.com/go-shiori/dom" 9 | distiller "github.com/markusmobius/go-domdistiller" 10 | ) 11 | 12 | func main() { 13 | result, err := distiller.ApplyForFile("example/sample.html", nil) 14 | if err != nil { 15 | panic(err) 16 | } 17 | 18 | rawHTML := dom.OuterHTML(result.Node) 19 | fmt.Println(rawHTML) 20 | } 21 | -------------------------------------------------------------------------------- /example/from-url.go: -------------------------------------------------------------------------------- 1 | // +build ignore 2 | 3 | package main 4 | 5 | import ( 6 | "fmt" 7 | "time" 8 | 9 | "github.com/go-shiori/dom" 10 | distiller "github.com/markusmobius/go-domdistiller" 11 | ) 12 | 13 | func main() { 14 | url := "https://arstechnica.com/gadgets/2020/10/iphone-12-and-12-pro-double-review-playing-apples-greatest-hits/" 15 | 16 | // Start distiller 17 | result, err := distiller.ApplyForURL(url, time.Minute, nil) 18 | if err != nil { 19 | panic(err) 20 | } 21 | 22 | rawHTML := dom.OuterHTML(result.Node) 23 | fmt.Println(rawHTML) 24 | } 25 | -------------------------------------------------------------------------------- /internal/re2go/base.re: -------------------------------------------------------------------------------- 1 | package re2go 2 | 3 | /*!rules:re2c:base_template 4 | re2c:eof = 0; 5 | re2c:yyfill:enable = 0; 6 | re2c:posix-captures = 0; 7 | re2c:case-insensitive = 0; 8 | 9 | re2c:define:YYCTYPE = byte; 10 | re2c:define:YYPEEK = "input[cursor]"; 11 | re2c:define:YYSKIP = "cursor++"; 12 | re2c:define:YYBACKUP = "marker = cursor"; 13 | re2c:define:YYRESTORE = "cursor = marker"; 14 | re2c:define:YYLESSTHAN = "limit <= cursor"; 15 | re2c:define:YYSTAGP = "@@{tag} = cursor"; 16 | re2c:define:YYSTAGN = "@@{tag} = 
-1"; 17 | re2c:define:YYSHIFTSTAG = "@@{tag} += @@{shift}"; 18 | */ 19 | -------------------------------------------------------------------------------- /LICENSE-boilerpipe.txt: -------------------------------------------------------------------------------- 1 | boilerpipe 2 | 3 | Copyright (c) 2009-2011 Christian Kohlschütter 4 | 5 | The author licenses this file to You under the Apache License, Version 2.0 6 | (the "License"); you may not use this file except in compliance with 7 | the License. You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. -------------------------------------------------------------------------------- /NOTICE-boilerpipe.txt: -------------------------------------------------------------------------------- 1 | boilerpipe 2 | 3 | Copyright (c) 2009-2011 Christian Kohlschütter 4 | 5 | The author licenses this file to You under the Apache License, Version 2.0 6 | (the "License"); you may not use this file except in compliance with 7 | the License. You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 
16 | 17 | 18 | This software contains the following parts which are also provided 19 | under the Apache License 2.0 (http://apache.org/licenses/LICENSE-2.0.txt): 20 | 21 | - NekoHTML 22 | - Xerces -------------------------------------------------------------------------------- /internal/re2go/word-counter.re: -------------------------------------------------------------------------------- 1 | /*!include:re2c "base.re" */ 2 | 3 | // Original pattern: [\x{3040}-\x{A4CF}] 4 | func UseFullWordCounter(input string) bool { 5 | var cursor, marker int 6 | input += string(rune(0)) // add terminating null 7 | limit := len(input) - 1 // limit points at the terminating null 8 | _ = marker 9 | 10 | for { /*!use:re2c:base_template 11 | re2c:case-insensitive = 1; 12 | 13 | [\u3040-\uA4CF] { return true } 14 | * { continue } 15 | $ { return false } 16 | */ 17 | } 18 | } 19 | 20 | // Original pattern: [\x{AC00}-\x{D7AF}] 21 | func UseLetterWordCounter(input string) bool { 22 | var cursor, marker int 23 | input += string(rune(0)) // add terminating null 24 | limit := len(input) - 1 // limit points at the terminating null 25 | _ = marker 26 | 27 | for { /*!use:re2c:base_template 28 | re2c:case-insensitive = 1; 29 | 30 | [\uAC00-\uD7AF] { return true } 31 | * { continue } 32 | $ { return false } 33 | */ 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/markusmobius/go-domdistiller 2 | 3 | go 1.20 4 | 5 | require ( 6 | github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c 7 | github.com/rs/zerolog v1.33.0 8 | github.com/stretchr/testify v1.7.0 9 | github.com/yosssi/gohtml v0.0.0-20201013000340-ee4748c638f4 10 | golang.org/x/net v0.29.0 11 | golang.org/x/text v0.18.0 12 | ) 13 | 14 | require ( 15 | github.com/andybalholm/cascadia v1.3.2 // indirect 16 | github.com/davecgh/go-spew v1.1.1 // indirect 17 | 
github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f // indirect 18 | github.com/kr/text v0.2.0 // indirect 19 | github.com/mattn/go-colorable v0.1.13 // indirect 20 | github.com/mattn/go-isatty v0.0.20 // indirect 21 | github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e // indirect 22 | github.com/pmezard/go-difflib v1.0.0 // indirect 23 | golang.org/x/sys v0.25.0 // indirect 24 | gopkg.in/check.v1 v1.0.0-20200902074654-038fdea0a05b // indirect 25 | gopkg.in/yaml.v3 v3.0.0-20200615113413-eeeca48fe776 // indirect 26 | ) 27 | -------------------------------------------------------------------------------- /internal/re2go/document-title.re: -------------------------------------------------------------------------------- 1 | /*!include:re2c "base.re" */ 2 | 3 | import "strings" 4 | 5 | // Original pattern: (?i)[\?\!\.\-\:]+ 6 | func RemoveDtmCharacters(input string) string { 7 | var cursor, marker int 8 | input += string(rune(0)) // add terminating null 9 | limit := len(input) - 1 // limit points at the terminating null 10 | _ = marker 11 | 12 | // Variable for capturing parentheses (twice the number of groups). 13 | /*!maxnmatch:re2c*/ 14 | yypmatch := make([]int, YYMAXNMATCH*2) 15 | var yynmatch int 16 | _ = yynmatch 17 | 18 | // Autogenerated tag variables used by the lexer to track tag values. 
19 | /*!stags:re2c format = 'var @@ int; _ = @@\n'; */ 20 | 21 | var start int 22 | var sb strings.Builder 23 | for { /*!use:re2c:base_template 24 | re2c:posix-captures = 1; 25 | 26 | [?!.\-:]+ { 27 | sb.WriteString(input[start:yypmatch[0]]) 28 | start = yypmatch[1] 29 | continue 30 | } 31 | 32 | $ { 33 | sb.WriteString(input[start:limit]) 34 | return sb.String() 35 | } 36 | 37 | * { continue } 38 | */ 39 | } 40 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Markus Mobius 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /internal/pagination/info/utils.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Markus Mobius 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy 4 | // of this software and associated documentation files (the "Software"), to deal 5 | // in the Software without restriction, including without limitation the rights 6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | // copies of the Software, and to permit persons to whom the Software is 8 | // furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in all 11 | // copies or substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | // SOFTWARE. 
20 | 21 | package info 22 | 23 | func maxInt(a, b int) int { 24 | if a > b { 25 | return a 26 | } 27 | return b 28 | } 29 | -------------------------------------------------------------------------------- /internal/pagination/parser/constant.go: -------------------------------------------------------------------------------- 1 | // ORIGINAL: java/PageParameterDetector.java 2 | 3 | // Copyright (c) 2020 Markus Mobius 4 | // 5 | // Permission is hereby granted, free of charge, to any person obtaining a copy 6 | // of this software and associated documentation files (the "Software"), to deal 7 | // in the Software without restriction, including without limitation the rights 8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | // copies of the Software, and to permit persons to whom the Software is 10 | // furnished to do so, subject to the following conditions: 11 | // 12 | // The above copyright notice and this permission notice shall be included in all 13 | // copies or substantial portions of the Software. 14 | // 15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | // SOFTWARE. 22 | 23 | // Copyright 2015 The Chromium Authors. All rights reserved. 24 | // Use of this source code is governed by a BSD-style license that can be 25 | // found in the LICENSE file. 
26 | 27 | package parser 28 | 29 | const ( 30 | MaxPagingDocs = 100 31 | ) 32 | -------------------------------------------------------------------------------- /internal/tableclass/table-type.go: -------------------------------------------------------------------------------- 1 | // ORIGINAL: java/TableClassifier.java 2 | 3 | // Copyright (c) 2020 Markus Mobius 4 | // 5 | // Permission is hereby granted, free of charge, to any person obtaining a copy 6 | // of this software and associated documentation files (the "Software"), to deal 7 | // in the Software without restriction, including without limitation the rights 8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | // copies of the Software, and to permit persons to whom the Software is 10 | // furnished to do so, subject to the following conditions: 11 | // 12 | // The above copyright notice and this permission notice shall be included in all 13 | // copies or substantial portions of the Software. 14 | // 15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | // SOFTWARE. 22 | 23 | // Copyright 2015 The Chromium Authors. All rights reserved. 24 | // Use of this source code is governed by a BSD-style license that can be 25 | // found in the LICENSE file. 
26 | 27 | package tableclass 28 | 29 | type Type uint 30 | 31 | const ( 32 | Data Type = iota 33 | Layout 34 | ) 35 | 36 | func (t Type) String() string { 37 | switch t { 38 | case Data: 39 | return "Data" 40 | case Layout: 41 | return "Layout" 42 | } 43 | return "" 44 | } 45 | -------------------------------------------------------------------------------- /internal/pagination/pattern/utils.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Markus Mobius 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy 4 | // of this software and associated documentation files (the "Software"), to deal 5 | // in the Software without restriction, including without limitation the rights 6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | // copies of the Software, and to permit persons to whom the Software is 8 | // furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in all 11 | // copies or substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | // SOFTWARE. 20 | 21 | package pattern 22 | 23 | import ( 24 | nurl "net/url" 25 | ) 26 | 27 | // replaceUrlQueryValue replaces query value of the specified URL. The original URL 28 | // is preserved and not changed. Returns the mutated URL after its query changed. 
29 | func replaceUrlQueryValue(url *nurl.URL, queryName string, queryValue string) *nurl.URL { 30 | clonedURL := *url 31 | queries := clonedURL.Query() 32 | queries.Set(queryName, PageParamPlaceholder) 33 | clonedURL.RawQuery = queries.Encode() 34 | return &clonedURL 35 | } 36 | -------------------------------------------------------------------------------- /internal/converter/utils.go: -------------------------------------------------------------------------------- 1 | package converter 2 | 3 | import ( 4 | "strings" 5 | 6 | "github.com/go-shiori/dom" 7 | "github.com/markusmobius/go-domdistiller/internal/re2go" 8 | "github.com/markusmobius/go-domdistiller/internal/stringutil" 9 | "golang.org/x/net/html" 10 | ) 11 | 12 | var ( 13 | unlikelyRoles = map[string]struct{}{ 14 | "menu": {}, 15 | "menubar": {}, 16 | "complementary": {}, 17 | "navigation": {}, 18 | "alert": {}, 19 | "alertdialog": {}, 20 | "dialog": {}, 21 | } 22 | ) 23 | 24 | // isElementWithoutContent determines if node is empty 25 | // or only filled with
<br> and <hr>
. 26 | func isElementWithoutContent(node *html.Node) bool { 27 | brs := dom.GetElementsByTagName(node, "br") 28 | hrs := dom.GetElementsByTagName(node, "hr") 29 | childs := dom.Children(node) 30 | 31 | return node.Type == html.ElementNode && 32 | strings.TrimSpace(dom.TextContent(node)) == "" && 33 | (len(childs) == 0 || len(childs) == len(brs)+len(hrs)) 34 | } 35 | 36 | func isByline(node *html.Node, matchString string) bool { 37 | rel := dom.GetAttribute(node, "rel") 38 | itemprop := dom.GetAttribute(node, "itemprop") 39 | nodeText := dom.TextContent(node) 40 | if (rel == "author" || strings.Contains(itemprop, "author") || re2go.IsByline(matchString)) && 41 | isValidByline(nodeText) { 42 | return true 43 | } 44 | 45 | return false 46 | } 47 | 48 | // isValidByline checks whether the input string could be a byline. 49 | // This verifies that the input is a string, and that the length 50 | // is less than 100 chars. 51 | func isValidByline(byline string) bool { 52 | byline = strings.TrimSpace(byline) 53 | nChar := stringutil.CharCount(byline) 54 | return nChar > 0 && nChar < 100 55 | } 56 | -------------------------------------------------------------------------------- /internal/markup/schemaorg/thing-item-unsupported.go: -------------------------------------------------------------------------------- 1 | // ORIGINAL: java/SchemaOrgParser.java 2 | 3 | // Copyright (c) 2020 Markus Mobius 4 | // 5 | // Permission is hereby granted, free of charge, to any person obtaining a copy 6 | // of this software and associated documentation files (the "Software"), to deal 7 | // in the Software without restriction, including without limitation the rights 8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | // copies of the Software, and to permit persons to whom the Software is 10 | // furnished to do so, subject to the following conditions: 11 | // 12 | // The above copyright notice and this permission notice shall be included in all 13 | // 
copies or substantial portions of the Software. 14 | // 15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | // SOFTWARE. 22 | 23 | // Copyright 2014 The Chromium Authors. All rights reserved. 24 | // Use of this source code is governed by a BSD-style license that can be 25 | // found in the LICENSE file. 26 | 27 | package schemaorg 28 | 29 | import ( 30 | "golang.org/x/net/html" 31 | ) 32 | 33 | type UnsupportedItem struct { 34 | BaseThingItem 35 | } 36 | 37 | func NewUnsupportedItem(element *html.Node) *UnsupportedItem { 38 | item := &UnsupportedItem{} 39 | item.init(Unsupported, element) 40 | return item 41 | } 42 | -------------------------------------------------------------------------------- /internal/filter/heuristic/constant_test.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Markus Mobius 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy 4 | // of this software and associated documentation files (the "Software"), to deal 5 | // in the Software without restriction, including without limitation the rights 6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | // copies of the Software, and to permit persons to whom the Software is 8 | // furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in all 11 | // copies or substantial portions of the Software. 
12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | // SOFTWARE. 20 | 21 | package heuristic_test 22 | 23 | const ( 24 | titleText = "I am the document title" 25 | 26 | contentText = "Lorem Ipsum Lorem Ipsum Lorem Ipsum." 27 | 28 | longText = "Lorem Ipsum Lorem Ipsum Lorem Ipsum. " + 29 | "Lorem Ipsum Lorem Ipsum Lorem Ipsum. " + 30 | "Lorem Ipsum Lorem Ipsum Lorem Ipsum." 31 | 32 | longLeadingText = "" + 33 | "Leading text that's used to start a document but just to offset a " + 34 | "few text blocks. This will allow testing in-page merges." 35 | 36 | shortText = "I might be a header." 37 | 38 | headingText = "Heading" 39 | ) 40 | -------------------------------------------------------------------------------- /internal/re2go/document-title.go: -------------------------------------------------------------------------------- 1 | // Code generated by re2c 3.1, DO NOT EDIT. 2 | package re2go 3 | 4 | import "strings" 5 | 6 | // Original pattern: (?i)[\?\!\.\-\:]+ 7 | func RemoveDtmCharacters(input string) string { 8 | var cursor, marker int 9 | input += string(rune(0)) // add terminating null 10 | limit := len(input) - 1 // limit points at the terminating null 11 | _ = marker 12 | 13 | // Variable for capturing parentheses (twice the number of groups). 14 | var YYMAXNMATCH int = 1 15 | 16 | yypmatch := make([]int, YYMAXNMATCH*2) 17 | var yynmatch int 18 | _ = yynmatch 19 | 20 | // Autogenerated tag variables used by the lexer to track tag values. 
21 | var yyt1 int 22 | _ = yyt1 23 | 24 | var start int 25 | var sb strings.Builder 26 | for { 27 | { 28 | var yych byte 29 | yych = input[cursor] 30 | switch yych { 31 | case '!': 32 | fallthrough 33 | case '-', '.': 34 | fallthrough 35 | case ':': 36 | fallthrough 37 | case '?': 38 | yyt1 = cursor 39 | goto yy2 40 | default: 41 | if limit <= cursor { 42 | goto yy4 43 | } 44 | goto yy1 45 | } 46 | yy1: 47 | cursor++ 48 | { 49 | continue 50 | } 51 | yy2: 52 | cursor++ 53 | yych = input[cursor] 54 | switch yych { 55 | case '!': 56 | fallthrough 57 | case '-', '.': 58 | fallthrough 59 | case ':': 60 | fallthrough 61 | case '?': 62 | goto yy2 63 | default: 64 | goto yy3 65 | } 66 | yy3: 67 | yynmatch = 1 68 | yypmatch[0] = yyt1 69 | yypmatch[1] = cursor 70 | { 71 | sb.WriteString(input[start:yypmatch[0]]) 72 | start = yypmatch[1] 73 | continue 74 | } 75 | yy4: 76 | { 77 | sb.WriteString(input[start:limit]) 78 | return sb.String() 79 | } 80 | } 81 | 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /internal/pagination/info/constant.go: -------------------------------------------------------------------------------- 1 | // ORIGINAL: java/PageParamInfo.java 2 | 3 | // Copyright (c) 2020 Markus Mobius 4 | // 5 | // Permission is hereby granted, free of charge, to any person obtaining a copy 6 | // of this software and associated documentation files (the "Software"), to deal 7 | // in the Software without restriction, including without limitation the rights 8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | // copies of the Software, and to permit persons to whom the Software is 10 | // furnished to do so, subject to the following conditions: 11 | // 12 | // The above copyright notice and this permission notice shall be included in all 13 | // copies or substantial portions of the Software. 
14 | // 15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | // SOFTWARE. 22 | 23 | // Copyright 2015 The Chromium Authors. All rights reserved. 24 | // Use of this source code is governed by a BSD-style license that can be 25 | // found in the LICENSE file. 26 | 27 | package info 28 | 29 | const ( 30 | minLinksToJustifyLinearMap = 2 31 | ) 32 | 33 | // ParamType is types of page parameter values in paging URLs. 34 | type ParamType uint 35 | 36 | const ( 37 | Unset ParamType = iota // Initialized type to indicate empty PageParamInfo. 38 | PageNumber // Value is a page number. 39 | Unknown // None of the above. 40 | ) 41 | -------------------------------------------------------------------------------- /internal/logutil/logger.go: -------------------------------------------------------------------------------- 1 | // ORIGINAL: java/LogUtil.java 2 | 3 | // Copyright (c) 2020 Markus Mobius 4 | // 5 | // Permission is hereby granted, free of charge, to any person obtaining a copy 6 | // of this software and associated documentation files (the "Software"), to deal 7 | // in the Software without restriction, including without limitation the rights 8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | // copies of the Software, and to permit persons to whom the Software is 10 | // furnished to do so, subject to the following conditions: 11 | // 12 | // The above copyright notice and this permission notice shall be included in all 13 | // copies or substantial portions of the Software. 
14 | // 15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | // SOFTWARE. 22 | 23 | // Copyright 2014 The Chromium Authors. All rights reserved. 24 | // Use of this source code is governed by a BSD-style license that can be 25 | // found in the LICENSE file. 26 | 27 | package logutil 28 | 29 | // Logger is the base interface for logging process of distiller. 30 | type Logger interface { 31 | InternallyNil() bool 32 | IsLogExtraction() bool 33 | IsLogVisibility() bool 34 | IsLogPagination() bool 35 | IsLogTiming() bool 36 | 37 | PrintExtractionInfo(args ...interface{}) 38 | PrintVisibilityInfo(args ...interface{}) 39 | PrintPaginationInfo(args ...interface{}) 40 | PrintTimingInfo(args ...interface{}) 41 | } 42 | -------------------------------------------------------------------------------- /internal/testutil/text-document.go: -------------------------------------------------------------------------------- 1 | // ORIGINAL: javatest/document/TextDocumentTestUtil.java 2 | 3 | // Copyright (c) 2020 Markus Mobius 4 | // 5 | // Permission is hereby granted, free of charge, to any person obtaining a copy 6 | // of this software and associated documentation files (the "Software"), to deal 7 | // in the Software without restriction, including without limitation the rights 8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | // copies of the Software, and to permit persons to whom the Software is 10 | // furnished to do so, subject to the following conditions: 11 | // 12 | // The above copyright 
notice and this permission notice shall be included in all 13 | // copies or substantial portions of the Software. 14 | // 15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | // SOFTWARE. 22 | 23 | // Copyright 2015 The Chromium Authors. All rights reserved. 24 | // Use of this source code is governed by a BSD-style license that can be 25 | // found in the LICENSE file. 26 | 27 | package testutil 28 | 29 | import ( 30 | "strings" 31 | 32 | "github.com/markusmobius/go-domdistiller/internal/webdoc" 33 | ) 34 | 35 | func GetContentFromTextDocument(doc *webdoc.TextDocument) string { 36 | var buffer strings.Builder 37 | for _, tb := range doc.TextBlocks { 38 | if tb.IsContent() { 39 | buffer.WriteString(tb.Text) 40 | buffer.WriteString("\n") 41 | } 42 | } 43 | return buffer.String() 44 | } 45 | -------------------------------------------------------------------------------- /IMPROVEMENTS.md: -------------------------------------------------------------------------------- 1 | # Improvements 2 | 3 | After using both Readability.js and DOM Distiller, we found that there are several improvements that can be implemented into this port. Besides that, from our experiments we also found some possible bugs that we decided to fix. 4 | 5 | These so-called improvements are listed here as historical documentation and to explain the difference between the main branch and stable branch. 6 | 7 | ## From Readability 8 | 9 | - Implement function to check if a HTML element is probably visible or not. 
This is especially useful since one of DOM Distiller's strategies is to exclude invisible elements by computing the stylesheets (which is impossible to do in Go). 10 | - Exclude form and input elements, since in distilled mode we only want to read. 11 | - Skip byline, empty div and unlikely elements by checking their class name, id and role attributes. 12 | - Convert anchors with a JavaScript URL into ordinary text nodes. 13 | - Convert font elements to span elements. This is done because the font element is usually only used for styling, so Readability.js decided to convert it. 14 | - Exclude identification and presentational attributes (e.g. `id`, `class` and `style`) from each element. 15 | 16 | ## From our own experiments 17 | 18 | - Make sure a figure's caption doesn't contain noscript elements. This is done because noscript in Go is a bit weird: sometimes it is detected as an HTML element while other times it is detected as plain text, so we need additional checks to clean it. 19 | - Mark large blocks around the main content's tag level as content as well. The original DOM Distiller looks for the most likely main content, then marks text blocks that exist in the same tag level as the main content as content as well. Unfortunately, we found out that on some sites parts of the article are omitted by DOM Distiller. To fix this, we decided to make the filter more tolerant by checking text blocks in lower and upper tag levels as well.
-------------------------------------------------------------------------------- /internal/extractor/embed/embed.go: -------------------------------------------------------------------------------- 1 | // ORIGINAL: java/extractors/embeds/EmbedExtractor.java 2 | 3 | // Copyright (c) 2020 Markus Mobius 4 | // 5 | // Permission is hereby granted, free of charge, to any person obtaining a copy 6 | // of this software and associated documentation files (the "Software"), to deal 7 | // in the Software without restriction, including without limitation the rights 8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | // copies of the Software, and to permit persons to whom the Software is 10 | // furnished to do so, subject to the following conditions: 11 | // 12 | // The above copyright notice and this permission notice shall be included in all 13 | // copies or substantial portions of the Software. 14 | // 15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | // SOFTWARE. 22 | 23 | // Copyright 2015 The Chromium Authors. All rights reserved. 24 | // Use of this source code is governed by a BSD-style license that can be 25 | // found in the LICENSE file. 26 | 27 | package embed 28 | 29 | import ( 30 | "github.com/markusmobius/go-domdistiller/internal/webdoc" 31 | "golang.org/x/net/html" 32 | ) 33 | 34 | // EmbedExtractor is the interface for extracting embedded nodes into webdoc.Element.
type EmbedExtractor interface {
	// RelevantTagNames returns the set of HTML tag names that are relevant
	// to this extractor.
	RelevantTagNames() []string

	// Extract detects whether node should be extracted as an embedded
	// element. It returns the extracted webdoc.Element, or nil when node
	// should not be extracted.
	Extract(node *html.Node) webdoc.Element
}
// PageInfo stores potential pagination info:
//   - PageNumber: the page number, represented as the original plain text
//     in the document URL.
//   - URL: the href, if the info was extracted from an anchor.
type PageInfo struct {
	PageNumber int
	URL        string
}

// String renders the page info as "pg<PageNumber>: <URL>".
func (pi *PageInfo) String() string {
	return "pg" + fmt.Sprint(pi.PageNumber) + ": " + pi.URL
}

// PageInfoGroup is a list of PageInfo entries together with a delta sign.
//
// NOTE(review): DeltaSign presumably records whether the page numbers in
// List are increasing or decreasing — confirm against the code that fills it.
type PageInfoGroup struct {
	List      []*PageInfo
	DeltaSign int
}
// LinearFormula stores the coefficient and delta values of the linear
// formula:
//
//	pageParamValue = coefficient * pageNum + delta
type LinearFormula struct {
	Coefficient int
	Delta       int
}

// NewLinearFormula returns a new LinearFormula holding the given
// coefficient and delta.
func NewLinearFormula(coefficient, delta int) *LinearFormula {
	formula := new(LinearFormula)
	formula.Coefficient = coefficient
	formula.Delta = delta
	return formula
}

// String renders the formula as "coefficient=<c>, delta=<d>".
func (lf *LinearFormula) String() string {
	return "coefficient=" + fmt.Sprint(lf.Coefficient) +
		", delta=" + fmt.Sprint(lf.Delta)
}
// ImageScorer represents a single heuristic used in image extraction.
// The provided image is given a score based on the heuristic and a max score.
type ImageScorer interface {
	// GetImageScore gives the image element e a score, based on the
	// heuristic implemented by this ImageScorer and on what the max score
	// is set to.
	GetImageScore(e *html.Node) int

	// GetMaxScore returns the maximum possible score that this ImageScorer
	// can return.
	GetMaxScore() int
}
// TimingEntry is a single named duration.
type TimingEntry struct {
	Name string
	Time time.Duration
}

// TimingInfo collects the durations of the distillation stages.
type TimingInfo struct {
	MarkupParsingTime        time.Duration
	DocumentConstructionTime time.Duration
	ArticleProcessingTime    time.Duration
	FormattingTime           time.Duration
	TotalTime                time.Duration

	// OtherTimes is a place to hold arbitrary breakdowns of time. The perf
	// scoring/server should display these entries with appropriate names.
	OtherTimes []TimingEntry
}

// AddEntry appends an entry called name to OtherTimes whose duration is the
// time elapsed since start. Calling it on a nil *TimingInfo is a no-op.
func (ti *TimingInfo) AddEntry(start time.Time, name string) {
	if ti == nil {
		return
	}

	elapsed := time.Since(start)
	ti.OtherTimes = append(ti.OtherTimes, TimingEntry{Name: name, Time: elapsed})
}
// Element is some logical part of a web document (text block, image, video,
// table, etc.)
type Element interface {
	// ElementType returns a string naming the kind of this Element.
	ElementType() string
	// GenerateOutput generates HTML output for this Element.
	GenerateOutput(textOnly bool) string
	// IsContent reports the element's content flag.
	IsContent() bool
	// SetIsContent sets the element's content flag.
	SetIsContent(bool)
	// String returns a textual representation of this Element.
	String() string
}

// BaseElement is the base of any other element. It stores the content flag
// and supplies the IsContent and SetIsContent methods.
type BaseElement struct {
	isContent bool
}

// IsContent returns the stored content flag.
func (be *BaseElement) IsContent() bool {
	flag := be.isContent
	return flag
}

// SetIsContent stores isContent as the content flag.
func (be *BaseElement) SetIsContent(isContent bool) {
	be.isContent = isContent
}
// OrganizationItem is the thing item created for the Organization type.
// It embeds BaseThingItem.
type OrganizationItem struct {
	BaseThingItem
}

// NewOrganizationItem creates an OrganizationItem for element: it
// initialises the embedded base item with Organization and element, then
// registers LegalNameProp as a string property name.
func NewOrganizationItem(element *html.Node) *OrganizationItem {
	item := &OrganizationItem{}
	item.init(Organization, element)
	item.addStringPropertyName(LegalNameProp)
	return item
}

// getName returns the string property NameProp, falling back to
// LegalNameProp when NameProp is empty.
func (oi *OrganizationItem) getName() string {
	// Returns either the value of NameProp, or LegalNameProp.
	if name := oi.getStringProperty(NameProp); name != "" {
		return name
	}

	return oi.getStringProperty(LegalNameProp)
}
--------------------------------------------------------------------------------
/internal/re2go/dom-converter.re:
--------------------------------------------------------------------------------
/*!include:re2c "base.re" */

// IsUnlikelyCandidates is the re2c replacement for the regular expression
// quoted below: the rules return true as soon as one of the listed words is
// matched (case-insensitively), skip any other input, and return false at
// the end of input.
//
// NOTE(review): the scanner details come from base_template in base.re,
// which is not visible here. re2c string literals are normally quoted, but
// the alternatives appear unquoted in this listing — confirm against the
// real .re file.
//
// Original pattern: (?i)-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote
func IsUnlikelyCandidates(input string) bool {
	var cursor, marker int
	input += string(rune(0)) // add terminating null
	limit := len(input) - 1  // limit points at the terminating null
	_ = marker

	for { /*!use:re2c:base_template
		re2c:case-insensitive = 1;

		unlikely = -ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote;

		{unlikely} { return true }
		* { continue }
		$ { return false }
		*/
	}
}

// MaybeItsACandidate is the re2c replacement for the regular expression
// quoted below: it returns true as soon as one of the listed words is
// matched (case-insensitively) and false when the end of input is reached
// without a match.
//
// Original pattern: (?i)and|article|body|column|content|main|shadow
func MaybeItsACandidate(input string) bool {
	var cursor, marker int
	input += string(rune(0)) // add terminating null
	limit := len(input) - 1  // limit points at the terminating null
	_ = marker

	for { /*!use:re2c:base_template
		re2c:case-insensitive = 1;

		maybe = and|article|body|column|content|main|shadow;

		{maybe} { return true }
		* { continue }
		$ { return false }
		*/
	}
}

// IsByline is the re2c replacement for the regular expression quoted below:
// it returns true as soon as one of the listed words is matched
// (case-insensitively) and false when the end of input is reached without a
// match.
//
// Original pattern: (?i)byline|author|dateline|writtenby|p-author
func IsByline(input string) bool {
	var cursor, marker int
	input += string(rune(0)) // add terminating null
	limit := len(input) - 1  // limit points at the terminating null
	_ = marker

	for { /*!use:re2c:base_template
		re2c:case-insensitive = 1;

		byline = byline|author|dateline|writtenby|p-author;

		{byline} { return true }
		* { continue }
		$ { return false }
		*/
	}
}
copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | // copies of the Software, and to permit persons to whom the Software is 10 | // furnished to do so, subject to the following conditions: 11 | // 12 | // The above copyright notice and this permission notice shall be included in all 13 | // copies or substantial portions of the Software. 14 | // 15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | // SOFTWARE. 22 | 23 | // Copyright 2015 The Chromium Authors. All rights reserved. 24 | // Use of this source code is governed by a BSD-style license that can be 25 | // found in the LICENSE file. 
26 | 27 | package docfilter 28 | 29 | import "github.com/markusmobius/go-domdistiller/internal/webdoc" 30 | 31 | type RelevantElements struct{} 32 | 33 | func NewRelevantElements() *RelevantElements { 34 | return &RelevantElements{} 35 | } 36 | 37 | func (f *RelevantElements) Process(doc *webdoc.Document) bool { 38 | changes := false 39 | inContent := false 40 | 41 | for _, e := range doc.Elements { 42 | if e.IsContent() { 43 | inContent = true 44 | } else if _, isText := e.(*webdoc.Text); isText { 45 | inContent = false 46 | } else { 47 | if inContent { 48 | e.SetIsContent(true) 49 | changes = true 50 | } 51 | } 52 | } 53 | 54 | return changes 55 | } 56 | -------------------------------------------------------------------------------- /data/data.go: -------------------------------------------------------------------------------- 1 | // ORIGINAL: Protobuf model in proto/dom_distiller.proto 2 | 3 | // Copyright (c) 2020 Markus Mobius 4 | // 5 | // Permission is hereby granted, free of charge, to any person obtaining a copy 6 | // of this software and associated documentation files (the "Software"), to deal 7 | // in the Software without restriction, including without limitation the rights 8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | // copies of the Software, and to permit persons to whom the Software is 10 | // furnished to do so, subject to the following conditions: 11 | // 12 | // The above copyright notice and this permission notice shall be included in all 13 | // copies or substantial portions of the Software. 14 | // 15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
// PaginationInfo holds the next-page and previous-page values of a document.
// NOTE(review): presumably both are URLs — confirm against the code that
// fills them.
type PaginationInfo struct {
	NextPage string
	PrevPage string
}

// MarkupArticle contains the properties of an article document.
type MarkupArticle struct {
	PublishedTime  string
	ModifiedTime   string
	ExpirationTime string
	Section        string
	Authors        []string
}

// MarkupImage contains the properties of an image in the document.
type MarkupImage struct {
	Root      string
	URL       string
	SecureURL string
	Type      string
	Caption   string
	Width     int
	Height    int
}

// MarkupInfo groups the markup properties of a document: its descriptive
// string fields, its article properties and its images.
type MarkupInfo struct {
	Title       string
	Type        string
	URL         string
	Description string
	Publisher   string
	Copyright   string
	Author      string
	Article     MarkupArticle
	Images      []MarkupImage
}
// TagType tells whether a tag is a start tag or an end tag.
type TagType uint

const (
	// TagStart marks a start tag.
	TagStart TagType = iota
	// TagEnd marks an end tag.
	TagEnd
)

// lazyImageAttrs pairs an attribute name with another attribute name.
// NOTE(review): presumably it maps lazy-loading attributes to the regular
// image attribute they stand in for — confirm at the usage site.
var lazyImageAttrs = map[string]string{
	"data-srcset": "srcset",
}

// CanBeNested reports whether tagName is one of "ul", "ol", "li",
// "blockquote" or "pre". The comparison is exact (case-sensitive).
func CanBeNested(tagName string) bool {
	return tagName == "ul" ||
		tagName == "ol" ||
		tagName == "li" ||
		tagName == "blockquote" ||
		tagName == "pre"
}

// All inline elements except for impossible tags: br, object, and script.
// Please refer to DomConverter.visitElement() for skipped tags.
// Reference: https://developer.mozilla.org/en-US/docs/HTML/Inline_elements
//
// NOTE(review): the set is declared empty here; whether that is intended or
// whether it is populated elsewhere cannot be determined from this
// declaration alone.
var inlineTagNames = map[string]struct{}{}
32 | // Unfortunately to do that we need to compute CSS which is impossible 33 | // in Go, so this scorer do nothing. NEED-COMPUTE-CSS. 34 | type ImageRatioScorer struct { 35 | maxScore int 36 | } 37 | 38 | // NewImageRatioScorer returns and initiates the ImageRatioScorer. 39 | func NewImageRatioScorer(maxScore int) *ImageRatioScorer { 40 | return &ImageRatioScorer{ 41 | maxScore: maxScore, 42 | } 43 | } 44 | 45 | func (s *ImageRatioScorer) GetImageScore(_ *html.Node) int { 46 | return 0 47 | } 48 | 49 | func (s *ImageRatioScorer) GetMaxScore() int { 50 | return s.maxScore 51 | } 52 | -------------------------------------------------------------------------------- /internal/testutil/text-block-builder.go: -------------------------------------------------------------------------------- 1 | // ORIGINAL: javatest/TestTextBlockBuilder.java 2 | 3 | // Copyright (c) 2020 Markus Mobius 4 | // 5 | // Permission is hereby granted, free of charge, to any person obtaining a copy 6 | // of this software and associated documentation files (the "Software"), to deal 7 | // in the Software without restriction, including without limitation the rights 8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | // copies of the Software, and to permit persons to whom the Software is 10 | // furnished to do so, subject to the following conditions: 11 | // 12 | // The above copyright notice and this permission notice shall be included in all 13 | // copies or substantial portions of the Software. 14 | // 15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | // SOFTWARE. 22 | 23 | // Copyright 2014 The Chromium Authors. All rights reserved. 24 | // Use of this source code is governed by a BSD-style license that can be 25 | // found in the LICENSE file. 26 | 27 | package testutil 28 | 29 | import ( 30 | "github.com/markusmobius/go-domdistiller/internal/stringutil" 31 | "github.com/markusmobius/go-domdistiller/internal/webdoc" 32 | ) 33 | 34 | type TextBlockBuilder struct { 35 | textBuilder *TextBuilder 36 | } 37 | 38 | func NewTextBlockBuilder(wc stringutil.WordCounter) *TextBlockBuilder { 39 | return &TextBlockBuilder{ 40 | textBuilder: NewTextBuilder(wc), 41 | } 42 | } 43 | 44 | func (tbb *TextBlockBuilder) CreateForText(text string) *webdoc.TextBlock { 45 | wt := tbb.textBuilder.CreateForText(text) 46 | return webdoc.NewTextBlock(wt) 47 | } 48 | 49 | func (tbb *TextBlockBuilder) CreateForAnchorText(text string) *webdoc.TextBlock { 50 | wt := tbb.textBuilder.CreateForAnchorText(text) 51 | return webdoc.NewTextBlock(wt) 52 | } 53 | -------------------------------------------------------------------------------- /internal/webdoc/figure.go: -------------------------------------------------------------------------------- 1 | // ORIGINAL: java/webdocument/WebImage.java 2 | 3 | // Copyright (c) 2020 Markus Mobius 4 | // 5 | // Permission is hereby granted, free of charge, to any person obtaining a copy 6 | // of this software and associated documentation files (the "Software"), to deal 7 | // in the Software without restriction, including without limitation the rights 8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | // copies of the Software, and to permit persons to whom the Software is 10 | // furnished to do so, 
subject to the following conditions: 11 | // 12 | // The above copyright notice and this permission notice shall be included in all 13 | // copies or substantial portions of the Software. 14 | // 15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | // SOFTWARE. 22 | 23 | // Copyright 2015 The Chromium Authors. All rights reserved. 24 | // Use of this source code is governed by a BSD-style license that can be 25 | // found in the LICENSE file. 26 | 27 | package webdoc 28 | 29 | import ( 30 | "github.com/go-shiori/dom" 31 | "github.com/markusmobius/go-domdistiller/internal/domutil" 32 | "golang.org/x/net/html" 33 | ) 34 | 35 | type Figure struct { 36 | Image 37 | Caption *html.Node 38 | } 39 | 40 | func (f *Figure) ElementType() string { 41 | return "figure" 42 | } 43 | 44 | func (f *Figure) GenerateOutput(textOnly bool) string { 45 | figCaption := domutil.CloneAndProcessTree(f.Caption, f.PageURL) 46 | if textOnly { 47 | return domutil.InnerText(figCaption) 48 | } 49 | 50 | figure := dom.CreateElement("figure") 51 | dom.AppendChild(figure, f.getProcessedNode()) 52 | if dom.InnerHTML(f.Caption) != "" { 53 | dom.AppendChild(figure, figCaption) 54 | } 55 | 56 | domutil.StripAttributes(figure) 57 | return dom.OuterHTML(figure) 58 | } 59 | -------------------------------------------------------------------------------- /internal/webdoc/tag.go: -------------------------------------------------------------------------------- 1 | // ORIGINAL: java/webdocument/WebTag.java 2 | 3 | // Copyright (c) 2020 Markus Mobius 4 | // 5 | // 
Permission is hereby granted, free of charge, to any person obtaining a copy 6 | // of this software and associated documentation files (the "Software"), to deal 7 | // in the Software without restriction, including without limitation the rights 8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | // copies of the Software, and to permit persons to whom the Software is 10 | // furnished to do so, subject to the following conditions: 11 | // 12 | // The above copyright notice and this permission notice shall be included in all 13 | // copies or substantial portions of the Software. 14 | // 15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | // SOFTWARE. 22 | 23 | // Copyright 2015 The Chromium Authors. All rights reserved. 24 | // Use of this source code is governed by a BSD-style license that can be 25 | // found in the LICENSE file. 26 | 27 | package webdoc 28 | 29 | import "fmt" 30 | 31 | // Tag represents HTML tags that need to be preserved over. 
32 | type Tag struct { 33 | BaseElement 34 | Name string 35 | Type TagType 36 | } 37 | 38 | func NewTag(name string, tagType TagType) *Tag { 39 | return &Tag{Name: name, Type: tagType} 40 | } 41 | 42 | func (t *Tag) ElementType() string { 43 | return "tag" 44 | } 45 | 46 | func (t *Tag) GenerateOutput(textOnly bool) string { 47 | if textOnly { 48 | return "" 49 | } 50 | 51 | if t.Type == TagStart { 52 | return "<" + t.Name + ">" 53 | } 54 | return "" 55 | } 56 | 57 | func (t *Tag) String() string { 58 | tp := "tag_start" 59 | if t.Type == TagEnd { 60 | tp = "tag_end" 61 | } 62 | 63 | return fmt.Sprintf("ELEMENT %q: name=%q, type=%s, is_content=%v", 64 | t.ElementType(), t.Name, tp, t.isContent) 65 | } 66 | -------------------------------------------------------------------------------- /internal/filter/docfilter/scorer/image-area.go: -------------------------------------------------------------------------------- 1 | // ORIGINAL: java/webdocument/filters/images/AreaScorer.java 2 | 3 | // Copyright (c) 2020 Markus Mobius 4 | // 5 | // Permission is hereby granted, free of charge, to any person obtaining a copy 6 | // of this software and associated documentation files (the "Software"), to deal 7 | // in the Software without restriction, including without limitation the rights 8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | // copies of the Software, and to permit persons to whom the Software is 10 | // furnished to do so, subject to the following conditions: 11 | // 12 | // The above copyright notice and this permission notice shall be included in all 13 | // copies or substantial portions of the Software. 14 | // 15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | // SOFTWARE. 22 | 23 | // Copyright 2014 The Chromium Authors. All rights reserved. 24 | // Use of this source code is governed by a BSD-style license that can be 25 | // found in the LICENSE file. 26 | 27 | package scorer 28 | 29 | import "golang.org/x/net/html" 30 | 31 | // ImageAreaScorer uses image area (length*width) as its heuristic. 32 | // Unfortunately to do that we need to compute CSS which is impossible 33 | // in Go, so this scorer do nothing. NEED-COMPUTE-CSS. 34 | type ImageAreaScorer struct { 35 | maxScore int 36 | minArea int 37 | maxArea int 38 | } 39 | 40 | // NewImageAreaScorer returns and initiates the ImageAreaScorer. 41 | func NewImageAreaScorer(maxScore, minArea, maxArea int) *ImageAreaScorer { 42 | return &ImageAreaScorer{ 43 | maxScore: maxScore, 44 | minArea: minArea, 45 | maxArea: maxArea, 46 | } 47 | } 48 | 49 | func (s *ImageAreaScorer) GetImageScore(_ *html.Node) int { 50 | return 0 51 | } 52 | 53 | func (s *ImageAreaScorer) GetMaxScore() int { 54 | return s.maxScore 55 | } 56 | -------------------------------------------------------------------------------- /internal/testutil/page-param-content-info.go: -------------------------------------------------------------------------------- 1 | // ORIGINAL: javatest/PageParamContentInfo.java 2 | 3 | // Copyright (c) 2020 Markus Mobius 4 | // 5 | // Permission is hereby granted, free of charge, to any person obtaining a copy 6 | // of this software and associated documentation files (the "Software"), to deal 7 | // in the Software without restriction, including without limitation the rights 8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | // copies of the Software, and to permit 
persons to whom the Software is 10 | // furnished to do so, subject to the following conditions: 11 | // 12 | // The above copyright notice and this permission notice shall be included in all 13 | // copies or substantial portions of the Software. 14 | // 15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | // SOFTWARE. 22 | 23 | // Copyright 2015 The Chromium Authors. All rights reserved. 24 | // Use of this source code is governed by a BSD-style license that can be 25 | // found in the LICENSE file. 26 | 27 | package testutil 28 | 29 | type PageParamContentType uint 30 | 31 | const ( 32 | UnrelatedTerms PageParamContentType = iota 33 | NumberInPlainText 34 | NumericOutlink 35 | ) 36 | 37 | type PageParamContentInfo struct { 38 | Type PageParamContentType 39 | TargetURL string 40 | Number int 41 | } 42 | 43 | func PPCIUnrelatedTerms() *PageParamContentInfo { 44 | return &PageParamContentInfo{Type: UnrelatedTerms} 45 | } 46 | 47 | func PPCINumberInPlainText(number int) *PageParamContentInfo { 48 | return &PageParamContentInfo{ 49 | Type: NumberInPlainText, 50 | Number: number, 51 | } 52 | } 53 | 54 | func PPCINumericOutlink(targetURL string, number int) *PageParamContentInfo { 55 | return &PageParamContentInfo{ 56 | Type: NumericOutlink, 57 | TargetURL: targetURL, 58 | Number: number, 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /internal/markup/schemaorg/thing-item-person.go: -------------------------------------------------------------------------------- 1 | // 
ORIGINAL: java/SchemaOrgParser.java 2 | 3 | // Copyright (c) 2020 Markus Mobius 4 | // 5 | // Permission is hereby granted, free of charge, to any person obtaining a copy 6 | // of this software and associated documentation files (the "Software"), to deal 7 | // in the Software without restriction, including without limitation the rights 8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | // copies of the Software, and to permit persons to whom the Software is 10 | // furnished to do so, subject to the following conditions: 11 | // 12 | // The above copyright notice and this permission notice shall be included in all 13 | // copies or substantial portions of the Software. 14 | // 15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | // SOFTWARE. 22 | 23 | // Copyright 2014 The Chromium Authors. All rights reserved. 24 | // Use of this source code is governed by a BSD-style license that can be 25 | // found in the LICENSE file. 26 | 27 | package schemaorg 28 | 29 | import ( 30 | "golang.org/x/net/html" 31 | ) 32 | 33 | type PersonItem struct { 34 | BaseThingItem 35 | } 36 | 37 | func NewPersonItem(element *html.Node) *PersonItem { 38 | item := &PersonItem{} 39 | item.init(Person, element) 40 | item.addStringPropertyName(FamilyNameProp) 41 | item.addStringPropertyName(GivenNameProp) 42 | return item 43 | } 44 | 45 | func (pi *PersonItem) getName() string { 46 | // Returns either the value of NameProp, or concatenated values 47 | // of GivenNameProp and FamilyNameProp delimited by a whitespace. 
48 | if name := pi.getStringProperty(NameProp); name != "" { 49 | return name 50 | } 51 | 52 | givenName := pi.getStringProperty(GivenNameProp) 53 | familyName := pi.getStringProperty(FamilyNameProp) 54 | if givenName != "" && familyName != "" { 55 | givenName += " " 56 | } 57 | 58 | return givenName + familyName 59 | } 60 | -------------------------------------------------------------------------------- /internal/pagination/pattern/page-pattern.go: -------------------------------------------------------------------------------- 1 | // ORIGINAL: java/PageParameterDetector.java 2 | 3 | // Copyright (c) 2020 Markus Mobius 4 | // 5 | // Permission is hereby granted, free of charge, to any person obtaining a copy 6 | // of this software and associated documentation files (the "Software"), to deal 7 | // in the Software without restriction, including without limitation the rights 8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | // copies of the Software, and to permit persons to whom the Software is 10 | // furnished to do so, subject to the following conditions: 11 | // 12 | // The above copyright notice and this permission notice shall be included in all 13 | // copies or substantial portions of the Software. 14 | // 15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | // SOFTWARE. 22 | 23 | // Copyright 2015 The Chromium Authors. All rights reserved. 24 | // Use of this source code is governed by a BSD-style license that can be 25 | // found in the LICENSE file. 
package pattern

import (
	nurl "net/url"
)

// PagePattern is the interface that page pattern handlers must implement to
// detect the page parameter in potential pagination URLs.
type PagePattern interface {
	// String returns the string form of the URL page pattern.
	String() string

	// PageNumber returns the page number that was extracted from the URL when
	// the object implementing this interface was created.
	PageNumber() int

	// IsValidFor validates this page pattern against the current document URL
	// through a pipeline of rules and returns true if the pattern is valid.
	// docURL is the current document URL.
	IsValidFor(docURL *nurl.URL) bool

	// IsPagingURL returns true if a URL matches this page pattern, based on a
	// pipeline of rules. url is the URL to evaluate.
	IsPagingURL(url string) bool
}

// --------------------------------------------------------------------------------
// /internal/webdoc/tag_test.go:
// --------------------------------------------------------------------------------
// ORIGINAL: javatest/webdocument/WebTagTest.java

// Copyright (c) 2020 Markus Mobius
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
14 | // 15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | // SOFTWARE. 22 | 23 | // Copyright 2015 The Chromium Authors. All rights reserved. 24 | // Use of this source code is governed by a BSD-style license that can be 25 | // found in the LICENSE file. 26 | 27 | package webdoc_test 28 | 29 | import ( 30 | "testing" 31 | 32 | "github.com/markusmobius/go-domdistiller/internal/webdoc" 33 | "github.com/stretchr/testify/assert" 34 | ) 35 | 36 | func Test_WebDoc_Tag_OLGenerateOutput(t *testing.T) { 37 | olStartTag := webdoc.Tag{Name: "ol", Type: webdoc.TagStart} 38 | olEndTag := webdoc.Tag{Name: "ol", Type: webdoc.TagEnd} 39 | startResult := olStartTag.GenerateOutput(false) 40 | endResult := olEndTag.GenerateOutput(false) 41 | assert.Equal(t, "
    ", startResult) 42 | assert.Equal(t, "
", endResult) 43 | } 44 | 45 | func Test_WebDoc_Tag_GenerateOutput(t *testing.T) { 46 | startTag := webdoc.Tag{Name: "anytext", Type: webdoc.TagStart} 47 | endTag := webdoc.Tag{Name: "anytext", Type: webdoc.TagEnd} 48 | startResult := startTag.GenerateOutput(false) 49 | endResult := endTag.GenerateOutput(false) 50 | assert.Equal(t, "", startResult) 51 | assert.Equal(t, "", endResult) 52 | } 53 | -------------------------------------------------------------------------------- /internal/domutil/walker.go: -------------------------------------------------------------------------------- 1 | // ORIGINAL: java/DomWalker.java 2 | 3 | // Copyright (c) 2020 Markus Mobius 4 | // 5 | // Permission is hereby granted, free of charge, to any person obtaining a copy 6 | // of this software and associated documentation files (the "Software"), to deal 7 | // in the Software without restriction, including without limitation the rights 8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | // copies of the Software, and to permit persons to whom the Software is 10 | // furnished to do so, subject to the following conditions: 11 | // 12 | // The above copyright notice and this permission notice shall be included in all 13 | // copies or substantial portions of the Software. 14 | // 15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | // SOFTWARE. 22 | 23 | // Copyright 2014 The Chromium Authors. All rights reserved. 
24 | // Use of this source code is governed by a BSD-style license that can be 25 | // found in the LICENSE file. 26 | 27 | package domutil 28 | 29 | import ( 30 | "golang.org/x/net/html" 31 | ) 32 | 33 | // WalkNodes used to walk the subtree of the DOM rooted at a particular root. It has two 34 | // function parameters, i.e. fnVisit and fnExit : 35 | // - fnVisit is called when we reach a node during the walk. If it returns false, children 36 | // of the node will be skipped and fnExit won't be called for this node. 37 | // - fnExit is called when exiting a node, after visiting all of its children. 38 | func WalkNodes(root *html.Node, fnVisit func(*html.Node) bool, fnExit func(*html.Node)) { 39 | if root == nil { 40 | return 41 | } 42 | 43 | visitChildren := false 44 | if fnVisit != nil { 45 | visitChildren = fnVisit(root) 46 | } 47 | 48 | if !visitChildren { 49 | return 50 | } 51 | 52 | for child := root.FirstChild; child != nil; child = child.NextSibling { 53 | WalkNodes(child, fnVisit, fnExit) 54 | } 55 | 56 | if fnExit != nil { 57 | fnExit(root) 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /internal/testutil/html_test.go: -------------------------------------------------------------------------------- 1 | // ORIGINAL: javatest/TestUtilTest.java 2 | 3 | // Copyright (c) 2020 Markus Mobius 4 | // 5 | // Permission is hereby granted, free of charge, to any person obtaining a copy 6 | // of this software and associated documentation files (the "Software"), to deal 7 | // in the Software without restriction, including without limitation the rights 8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | // copies of the Software, and to permit persons to whom the Software is 10 | // furnished to do so, subject to the following conditions: 11 | // 12 | // The above copyright notice and this permission notice shall be included in all 13 | // copies or substantial portions of the 
// Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

// Copyright 2014 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

package testutil_test

import (
	"regexp"
	"testing"

	"github.com/go-shiori/dom"
	"github.com/markusmobius/go-domdistiller/internal/testutil"
	"github.com/stretchr/testify/assert"
)

var (
	// rxCleanWhitespaces matches the whitespace at the start of each line.
	rxCleanWhitespaces = regexp.MustCompile(`(?mi)^\s+`)
	// rxNewlines matches every newline character.
	rxNewlines = regexp.MustCompile(`(?i)\n`)
)

// Test_TestUtil_CreateDivTree compares the outer HTML of the first node
// returned by testutil.CreateDivTree with an expected HTML string from which
// leading whitespace and newlines are removed first.
//
// NOTE(review): in this copy the expected literal below contains only empty
// lines, so after normalisation it is the empty string. The original markup
// appears to have been lost (presumably stripped as HTML during extraction);
// restore it from the actual CreateDivTree output before relying on this test.
func Test_TestUtil_CreateDivTree(t *testing.T) {
	expectedHTML := `





















`

	// Collapse the indented, multi-line literal into a single line of markup.
	expectedHTML = rxCleanWhitespaces.ReplaceAllString(expectedHTML, "")
	expectedHTML = rxNewlines.ReplaceAllString(expectedHTML, "")

	divs := testutil.CreateDivTree()
	assert.Equal(t, expectedHTML, dom.OuterHTML(divs[0]))
}

// --------------------------------------------------------------------------------
// /internal/filter/docfilter/scorer/image-has-figure_test.go:
// --------------------------------------------------------------------------------
// ORIGINAL: javatest/ImageHeuristicsTest.java

// Copyright (c) 2020 Markus Mobius
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

// Copyright 2014 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
26 | 27 | package scorer_test 28 | 29 | import ( 30 | "testing" 31 | 32 | "github.com/go-shiori/dom" 33 | "github.com/markusmobius/go-domdistiller/internal/filter/docfilter/scorer" 34 | "github.com/markusmobius/go-domdistiller/internal/testutil" 35 | "github.com/stretchr/testify/assert" 36 | ) 37 | 38 | func Test_Filter_DocFilter_Scorer_ImageHasFigureScorer(t *testing.T) { 39 | root := testutil.CreateDiv(0) 40 | fig := dom.CreateElement("figure") 41 | 42 | goodImage := dom.CreateElement("img") 43 | dom.SetAttribute(goodImage, "style", "width: 100px; height: 100px; display: block;") 44 | 45 | badImage := dom.CreateElement("img") 46 | dom.SetAttribute(badImage, "style", "width: 100px; height: 100px; display: block;") 47 | 48 | dom.AppendChild(fig, goodImage) 49 | dom.AppendChild(root, fig) 50 | dom.AppendChild(root, badImage) 51 | 52 | imgScorer := scorer.NewImageHasFigureScorer(50) 53 | 54 | assert.True(t, imgScorer.GetImageScore(goodImage) > 0) 55 | assert.Equal(t, 0, imgScorer.GetImageScore(badImage)) 56 | assert.Equal(t, 0, imgScorer.GetImageScore(nil)) 57 | } 58 | -------------------------------------------------------------------------------- /internal/markup/opengraph/prefixes.go: -------------------------------------------------------------------------------- 1 | // ORIGINAL: java/OpenGraphProtocolParser.java 2 | 3 | // Copyright (c) 2020 Markus Mobius 4 | // 5 | // Permission is hereby granted, free of charge, to any person obtaining a copy 6 | // of this software and associated documentation files (the "Software"), to deal 7 | // in the Software without restriction, including without limitation the rights 8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | // copies of the Software, and to permit persons to whom the Software is 10 | // furnished to do so, subject to the following conditions: 11 | // 12 | // The above copyright notice and this permission notice shall be included in all 13 | // copies or substantial portions 
of the Software. 14 | // 15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | // SOFTWARE. 22 | 23 | // Copyright 2014 The Chromium Authors. All rights reserved. 24 | // Use of this source code is governed by a BSD-style license that can be 25 | // found in the LICENSE file. 26 | 27 | package opengraph 28 | 29 | import "strings" 30 | 31 | type Prefix uint 32 | 33 | const ( 34 | OG Prefix = iota 35 | Profile 36 | Article 37 | ) 38 | 39 | type PrefixNameList map[Prefix]string 40 | 41 | func (prefixes PrefixNameList) addObjectType(prefix, objType string) { 42 | if objType == "" { 43 | prefixes[OG] = prefix 44 | return 45 | } 46 | 47 | objType = strings.TrimPrefix(objType, "/") 48 | if objType == ProfileObjtype { 49 | prefixes[Profile] = prefix 50 | return 51 | } 52 | 53 | if objType == ArticleObjtype { 54 | prefixes[Article] = prefix 55 | } 56 | } 57 | 58 | func (prefixes PrefixNameList) setDefault() { 59 | // For any unspecified prefix, use common ones: 60 | // - "og": http://ogp.me/ns# 61 | // - "profile": http://ogp.me/ns/profile# 62 | // - "article": http://ogp.me/ns/article#. 
63 | if _, exist := prefixes[OG]; !exist { 64 | prefixes[OG] = "og" 65 | } 66 | 67 | if _, exist := prefixes[Profile]; !exist { 68 | prefixes[Profile] = ProfileObjtype 69 | } 70 | 71 | if _, exist := prefixes[Article]; !exist { 72 | prefixes[Article] = ArticleObjtype 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /internal/filter/docfilter/scorer/image-has-figure.go: -------------------------------------------------------------------------------- 1 | // ORIGINAL: java/webdocument/filters/images/HasFigureScorer.java 2 | 3 | // Copyright (c) 2020 Markus Mobius 4 | // 5 | // Permission is hereby granted, free of charge, to any person obtaining a copy 6 | // of this software and associated documentation files (the "Software"), to deal 7 | // in the Software without restriction, including without limitation the rights 8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | // copies of the Software, and to permit persons to whom the Software is 10 | // furnished to do so, subject to the following conditions: 11 | // 12 | // The above copyright notice and this permission notice shall be included in all 13 | // copies or substantial portions of the Software. 14 | // 15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | // SOFTWARE. 22 | 23 | // Copyright 2014 The Chromium Authors. All rights reserved. 24 | // Use of this source code is governed by a BSD-style license that can be 25 | // found in the LICENSE file. 
package scorer

import (
	"github.com/markusmobius/go-domdistiller/internal/domutil"
	"golang.org/x/net/html"
)

// ImageHasFigureScorer scores based on if the image has a "figure" node as an ancestor.
type ImageHasFigureScorer struct {
	maxScore int
}

// NewImageHasFigureScorer returns and initiates the ImageHasFigureScorer.
func NewImageHasFigureScorer(maxScore int) *ImageHasFigureScorer {
	return &ImageHasFigureScorer{
		maxScore: maxScore,
	}
}

// GetImageScore returns the score computed for node, capped at the maximum
// score. A nil node is not inspected and scores 0.
func (s *ImageHasFigureScorer) GetImageScore(node *html.Node) int {
	var score int
	if node != nil {
		score = s.compute(node)
	}

	if score < s.maxScore {
		return score
	}

	return s.maxScore
}

// GetMaxScore returns the maximum score given at construction time.
func (s *ImageHasFigureScorer) GetMaxScore() int {
	return s.maxScore
}

// compute returns the maximum score when any node returned by
// domutil.GetParentNodes(node) is an element named "figure", and 0 otherwise.
func (s *ImageHasFigureScorer) compute(node *html.Node) int {
	parents := domutil.GetParentNodes(node)
	for _, n := range parents {
		if n.Type == html.ElementNode && n.Data == "figure" {
			return s.maxScore
		}
	}
	return 0
}

// --------------------------------------------------------------------------------
// /internal/re2go/domutil.re:
// --------------------------------------------------------------------------------
/*!include:re2c "base.re" */

import "strings"

// TidyUpPunctuation rewrites every match of the pattern below: the text before
// the match is copied unchanged, then the captured punctuation character, a
// single space and the second capture are written in place of the match. Text
// after the last match is copied unchanged.
//
// Original pattern: \s+([.?!,;])\s*(\S*)
//
// NOTE(review): the scanning loop is generated by re2c from the template
// below; the `$` rule is presumably the end-of-input rule enabled by the
// base.re template - confirm there.
func TidyUpPunctuation(input string) string {
	var cursor, marker int
	input += string(rune(0)) // add terminating null
	limit := len(input) - 1  // limit points at the terminating null
	_ = marker

	// Variable for capturing parentheses (twice the number of groups).
	// Indexes 0/1 are read below as the bounds of the whole match, 2/3 as the
	// bounds of the first group and 4/5 as the bounds of the second group.
	/*!maxnmatch:re2c*/
	yypmatch := make([]int, YYMAXNMATCH*2)
	var yynmatch int
	_ = yynmatch

	// Autogenerated tag variables used by the lexer to track tag values.
	/*!stags:re2c format = 'var @@ int; _ = @@\n'; */

	// start is the offset of the first input byte not yet written to sb.
	var start int
	var sb strings.Builder
	for { /*!use:re2c:base_template
		re2c:posix-captures = 1;

		space = [\t\n\f\r ];
		nonSpace = [^\t\n\f\r ];

		quant1 = {space}+;
		punctuation = {space}+([.?!,;]){space}*({nonSpace}*);

		{quant1} { continue }
		{punctuation} {
			before := input[start:yypmatch[0]]
			submatch1 := input[yypmatch[2]:yypmatch[3]]
			submatch2 := input[yypmatch[4]:yypmatch[5]]

			sb.WriteString(before)
			sb.WriteString(submatch1)
			sb.WriteString(" ")
			sb.WriteString(submatch2)

			start = yypmatch[1]
			continue
		}

		$ {
			sb.WriteString(input[start:limit])
			return sb.String()
		}

		* { continue }
		*/
	}
}

// FixTempNewline replaces every match of the pattern below (the temporary
// newline marker together with the whitespace around it) with a single "\n".
// All other text is copied unchanged.
//
// Original pattern: \s*\|\\/\|\s*
//
// NOTE(review): as above, the `$` rule is presumably the end-of-input rule
// enabled by the base.re template - confirm there.
func FixTempNewline(input string) string {
	var cursor, marker int
	input += string(rune(0)) // add terminating null
	limit := len(input) - 1  // limit points at the terminating null
	_ = marker

	// Variable for capturing parentheses (twice the number of groups).
	// Only indexes 0/1, read below as the bounds of the whole match, are used.
	/*!maxnmatch:re2c*/
	yypmatch := make([]int, YYMAXNMATCH*2)
	var yynmatch int
	_ = yynmatch

	// Autogenerated tag variables used by the lexer to track tag values.
	/*!stags:re2c format = 'var @@ int; _ = @@\n'; */

	// start is the offset of the first input byte not yet written to sb.
	var start int
	var sb strings.Builder
	for { /*!use:re2c:base_template
		re2c:posix-captures = 1;

		space = [\t\n\f\r ];
		quant1 = {space}*[^|];
		tmpNewline = {space}*[|][\\][/][|]{space}*;

		{quant1} { continue }

		{tmpNewline} {
			sb.WriteString(input[start:yypmatch[0]])
			sb.WriteString("\n")
			start = yypmatch[1]
			continue
		}

		$ {
			sb.WriteString(input[start:limit])
			return sb.String()
		}

		* { continue }
		*/
	}
}

// --------------------------------------------------------------------------------
// /internal/filter/docfilter/nested-element.go:
// --------------------------------------------------------------------------------
// ORIGINAL: java/webdocument/filters/NestedElementRetainer.java

// Copyright (c) 2020 Markus Mobius
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
// IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

// Copyright 2015 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

package docfilter

import (
	"github.com/markusmobius/go-domdistiller/internal/webdoc"
)

// NestedElementRetainer is a stateless document filter that sets the
// is-content flag of start/end tag pairs from the elements nested in them.
type NestedElementRetainer struct{}

// NewNestedElementRetainer returns a new NestedElementRetainer.
func NewNestedElementRetainer() *NestedElementRetainer {
	return &NestedElementRetainer{}
}

// Process walks doc.Elements in order and sets the is-content flag on every
// webdoc.Tag. A start tag and its matching end tag receive the same flag,
// which is true when a non-tag element seen since the start tag reports
// IsContent, or when the stackMark comparison below holds. It always
// returns true.
//
// NOTE(review): an end tag without a preceding start tag would index an empty
// stack and panic; presumably the document builder guarantees balanced tags -
// confirm against the caller.
func (f *NestedElementRetainer) Process(doc *webdoc.Document) bool {
	// isContent tracks whether content has been seen at the current nesting level.
	isContent := false
	// stackMark is updated to len(stack)-1 whenever a closing pair is flagged
	// as content, and is compared with the stack depth when later pairs close.
	stackMark := -1
	// stack holds the start tags that have not been closed yet.
	stack := []*webdoc.Tag{}

	for _, e := range doc.Elements {
		if webTag, isTag := e.(*webdoc.Tag); !isTag {
			// Non-tag element: once the flag is true it stays true for this level.
			if !isContent {
				isContent = e.IsContent()
			}
		} else {
			if webTag.Type == webdoc.TagStart {
				// Park the enclosing level's flag on the start tag so it can be
				// restored when the tag closes, then start the new level clean.
				webTag.SetIsContent(isContent)
				stack = append(stack, webTag)
				isContent = false
			} else {
				startWebTag := stack[len(stack)-1]
				stack = stack[:len(stack)-1]

				isContent = isContent || stackMark >= len(stack)
				if isContent {
					stackMark = len(stack) - 1
				}

				// Read back the parked flag before overwriting it with the
				// final value shared by the start and end tag.
				wasContent := startWebTag.IsContent()
				startWebTag.SetIsContent(isContent)
				webTag.SetIsContent(isContent)
				isContent = wasContent
			}
		}
	}

	return true
}

// --------------------------------------------------------------------------------
// /internal/tableclass/constant.go:
// --------------------------------------------------------------------------------
// ORIGINAL: java/TableClassifier.java

// Copyright (c) 2020 Markus Mobius
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// headerTags is a lookup table of table-header related tag names.
// NOTE(review): the meaning of the boolean value is not visible in this
// file; only "th" maps to true — confirm against the table classifier.
var headerTags = map[string]bool{
	"colgroup": false,
	"col":      false,
	"th":       true,
}

// objectTags is a lookup table of embedded-object tag names.
// NOTE(review): every value is false, so callers presumably test for key
// presence rather than the value — confirm against the table classifier.
var objectTags = map[string]bool{
	"embed":  false,
	"object": false,
	"applet": false,
	"iframe": false,
}

// ARIA roles for table, see http://www.w3.org/TR/wai-aria/roles#widget_roles_header.
var ariaTableRoles = map[string]struct{}{
	"grid":     {},
	"treegrid": {},
}

// ARIA roles for descendants of table, see:
// - http://www.w3.org/TR/wai-aria/roles#widget_roles_header.
// - http://www.w3.org/TR/wai-aria/roles#document_structure_roles_header.
var ariaTableDescendantRoles = map[string]struct{}{
	"gridcell":     {},
	"columnheader": {},
	"row":          {},
	"rowgroup":     {},
	"rowheader":    {},
}

// ARIA landmark roles, applicable to both table and its descendants, see:
// - http://www.w3.org/TR/wai-aria/roles#landmark_roles_header.
var ariaRoles = map[string]struct{}{
	"application":   {},
	"banner":        {},
	"complementary": {},
	"contentinfo":   {},
	"form":          {},
	"main":          {},
	"navigation":    {},
	"search":        {},
}
// TextDocumentFilter is the interface for filters that process a
// webdoc.TextDocument.
type TextDocumentFilter interface {
	// Process processes the given document.
	// Returns true if changes have been made to the document.
	Process(doc *webdoc.TextDocument) bool
}

// DocumentFilter is the interface for filters that process a
// webdoc.Document.
type DocumentFilter interface {
	// Process processes the given document.
	// NOTE(review): the meaning of the returned bool is not documented here;
	// presumably it mirrors TextDocumentFilter (true when the document was
	// changed) — confirm against the implementations.
	Process(doc *webdoc.Document) bool
}
26 | 27 | package scorer_test 28 | 29 | import ( 30 | "testing" 31 | 32 | "github.com/go-shiori/dom" 33 | "github.com/markusmobius/go-domdistiller/internal/filter/docfilter/scorer" 34 | "github.com/markusmobius/go-domdistiller/internal/testutil" 35 | "github.com/stretchr/testify/assert" 36 | ) 37 | 38 | func Test_Filter_DocFilter_Scorer_ImageDomDistanceScorer(t *testing.T) { 39 | root := testutil.CreateDiv(0) 40 | content := testutil.CreateDiv(1) 41 | image := dom.CreateElement("img") 42 | dom.SetAttribute(image, "style", "width: 100px; height: 100px; display: block;") 43 | 44 | dom.AppendChild(content, image) 45 | dom.AppendChild(root, content) 46 | 47 | // Build long chain of divs to separate image from content. 48 | currentDiv := testutil.CreateDiv(3) 49 | dom.AppendChild(root, currentDiv) 50 | for i := 0; i < 7; i++ { 51 | child := testutil.CreateDiv(i + 4) 52 | dom.AppendChild(currentDiv, child) 53 | currentDiv = child 54 | } 55 | 56 | normalScorer := scorer.NewImageDomDistanceScorer(50, content) 57 | farContentScorer := scorer.NewImageDomDistanceScorer(50, currentDiv) 58 | 59 | assert.True(t, normalScorer.GetImageScore(image) > 0) 60 | assert.Equal(t, 0, farContentScorer.GetImageScore(image)) 61 | assert.Equal(t, 0, normalScorer.GetImageScore(nil)) 62 | } 63 | -------------------------------------------------------------------------------- /internal/pagination/constant.go: -------------------------------------------------------------------------------- 1 | // ORIGINAL: java/PageParameterParser.java 2 | 3 | // Copyright (c) 2020 Markus Mobius 4 | // 5 | // Permission is hereby granted, free of charge, to any person obtaining a copy 6 | // of this software and associated documentation files (the "Software"), to deal 7 | // in the Software without restriction, including without limitation the rights 8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | // copies of the Software, and to permit persons to whom the Software is 10 | // 
const (
	// MaxNumForPageParam is the upper bound for page numbers: if the
	// numeric value of a link's anchor text is greater than this number,
	// we don't think it represents the page number of the link.
	MaxNumForPageParam = 100
)

var (
	// Regexes for the page number finder. The regexes for the prev/next
	// finder are not here: they are compiled with re2go because the
	// regexp-based versions are quite slow.

	// rxLinkNumberCleaner matches a single bracket character: ( ) [ ] { }.
	rxLinkNumberCleaner = regexp.MustCompile(`[()\[\]{}]`)
	// rxInvalidParentWrapper matches "body" or "html", case-insensitively,
	// anywhere in the input.
	rxInvalidParentWrapper = regexp.MustCompile(`(?i)(body)|(html)`)
	// rxTerms matches a whitespace-delimited term that contains at least
	// one word character or a character in the listed Unicode ranges.
	rxTerms = regexp.MustCompile(`(?i)(\S*[\w\x{00C0}-\x{1FFF}\x{2C00}-\x{D7FF}]\S*)`)
	// rxSurroundingDigits matches a string that consists of one run of
	// ASCII digits surrounded only by non-word characters or underscores.
	rxSurroundingDigits = regexp.MustCompile(`(?i)^[\W_]*(\d+)[\W_]*$`)
)

// containsNumber reports whether s contains at least one Unicode decimal
// digit.
func containsNumber(s string) bool {
	return strings.IndexFunc(s, unicode.IsDigit) >= 0
}

// getStartingNumber parses the run of ASCII digits at the start of s.
//
// It returns the parsed value and true on success. It returns (0, false)
// when s does not start with an ASCII digit or when the digits do not fit
// in an int.
//
// Only '0'-'9' are treated as digits. Other Unicode decimal digits (which
// strconv.Atoi cannot parse) end the run instead of invalidating it, so
// "12٣" yields 12.
func getStartingNumber(s string) (int, bool) {
	end := strings.IndexFunc(s, func(r rune) bool { return r < '0' || r > '9' })
	if end < 0 {
		// No non-digit found: the whole string is the number.
		end = len(s)
	}
	if end == 0 {
		return 0, false
	}

	n, err := strconv.Atoi(s[:end])
	if err != nil {
		// The input is all ASCII digits, so this can only be overflow.
		return 0, false
	}
	return n, true
}
const (
	// PageParamPlaceholder is the literal placeholder string "[*!]".
	// NOTE(review): presumably it marks the position of the page number
	// inside a URL pattern — confirm against the page pattern code.
	PageParamPlaceholder = "[*!]"
)

var (
	// rxNumber matches a run of one or more ASCII digits.
	rxNumber = regexp.MustCompile(`(?i)(\d+)`)
	// rxEndOrHasSHTML matches the end of the input, optionally preceded by
	// one arbitrary character, an optional "s" and "htm" or "html".
	// NOTE(review): the leading `.` is unescaped, so it matches any
	// character and not only a literal dot; presumably this mirrors the
	// upstream Java pattern — confirm before changing.
	rxEndOrHasSHTML = regexp.MustCompile(`(?i)(.s?html?)?$`)
	// rxLastPathComponent captures the text between the last two slashes
	// of an input that ends with a slash.
	rxLastPathComponent = regexp.MustCompile(`(?i)([^/]*)/$`)
	// rxTrailingSlashHTML matches a trailing slash, or one arbitrary
	// character followed by "htm" or "html", at the end of the input.
	// NOTE(review): the `.` is unescaped here as well — see above.
	rxTrailingSlashHTML = regexp.MustCompile(`(?i)(?:/|(.html?))$`)
	// rxPageParamSeparator matches one of the characters - _ ; ,
	rxPageParamSeparator = regexp.MustCompile(`[-_;,]`)
)

// badPageParamNames is a set of parameter names.
// NOTE(review): going by the name, these are presumably names that must not
// be treated as page parameters — confirm against the callers.
var badPageParamNames = map[string]struct{}{
	"baixar-gratis":  {},
	"category":       {},
	"content":        {},
	"day":            {},
	"date":           {},
	"definition":     {},
	"etiket":         {},
	"film-seyret":    {},
	"key":            {},
	"keys":           {},
	"keyword":        {},
	"label":          {},
	"news":           {},
	"q":              {},
	"query":          {},
	"rating":         {},
	"s":              {},
	"search":         {},
	"seasons":        {},
	"search_keyword": {},
	"search_query":   {},
	"sortby":         {},
	"subscriptions":  {},
	"tag":            {},
	"tags":           {},
	"video":          {},
	"videos":         {},
	"w":              {},
	"wiki":           {},
}
deal 7 | // in the Software without restriction, including without limitation the rights 8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | // copies of the Software, and to permit persons to whom the Software is 10 | // furnished to do so, subject to the following conditions: 11 | // 12 | // The above copyright notice and this permission notice shall be included in all 13 | // copies or substantial portions of the Software. 14 | // 15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | // SOFTWARE. 22 | 23 | // Copyright 2015 The Chromium Authors. All rights reserved. 24 | // Use of this source code is governed by a BSD-style license that can be 25 | // found in the LICENSE file. 
var (
	// rxB64DataURL matches the start of a base64 data URL and captures its
	// media type, e.g. "image/png" in "data:image/png;base64".
	rxB64DataURL = regexp.MustCompile(`(?i)^data:\s*([^\s;,]+)\s*;\s*base64\s*`)
	// rxSrcsetURL matches one srcset candidate: a URL, an optional width
	// or density descriptor (e.g. "2x", "480w"), then a comma or the end
	// of the input.
	rxSrcsetURL = regexp.MustCompile(`(?i)(\S+)(\s+[\d.]+[xw])?(\s*(?:,|$))`)
	// rxImgExtensions matches a .jpg, .jpeg, .png or .webp extension
	// anywhere in the input.
	rxImgExtensions = regexp.MustCompile(`(?i)\.(jpg|jpeg|png|webp)`)
	// rxLazyImageSrcset matches an image extension followed by whitespace
	// and a digit, i.e. an image URL followed by a srcset descriptor.
	rxLazyImageSrcset = regexp.MustCompile(`(?i)\.(jpg|jpeg|png|webp)\s+\d`)
	// rxLazyImageSrc matches a value that is a single whitespace-free
	// token containing an image extension, with optional surrounding
	// whitespace.
	rxLazyImageSrc = regexp.MustCompile(`(?i)^\s*\S+\.(jpg|jpeg|png|webp)\S*\s*$`)

	// figureImageSelectors lists CSS selectors for image elements.
	// NOTE(review): presumably tried in this order to locate the image
	// inside a <figure> — confirm against the image extractor.
	figureImageSelectors = []string{
		"noscript picture",
		"noscript img",
		"picture",
		"img",
	}

	// lazyImageSrcAttrs lists attribute names.
	// NOTE(review): presumably attributes where lazy-loading scripts keep
	// the real image source — confirm against the image extractor.
	lazyImageSrcAttrs = []string{
		"data-src",
		"data-original",
		"datasrc",
		"data-url",
	}

	// lazyImageSrcsetAttrs lists attribute names.
	// NOTE(review): presumably the srcset counterpart of
	// lazyImageSrcAttrs — confirm against the image extractor.
	lazyImageSrcsetAttrs = []string{
		"data-srcset",
		"datasrcset",
	}

	// relevantImageTags is the set of tag names relevant to the image
	// extractor.
	relevantImageTags = map[string]struct{}{
		// TODO: Add "div" to this list for css images and possibly captions.
		"img":     {},
		"picture": {},
		"figure":  {},
		"span":    {},
	}

	// relevantTwitterTags is the set of tag names relevant to the Twitter
	// extractor.
	relevantTwitterTags = map[string]struct{}{
		"blockquote": {},
		"iframe":     {},
	}

	// relevantVimeoTags is the set of tag names relevant to the Vimeo
	// extractor.
	relevantVimeoTags = map[string]struct{}{
		"iframe": {},
	}

	// relevantYouTubeTags is the set of tag names relevant to the YouTube
	// extractor.
	relevantYouTubeTags = map[string]struct{}{
		"iframe": {},
		"object": {},
	}
)
// Accessor is the interface that all parsers must implement so that Parser
// can retrieve their properties.
type Accessor interface {
	// Title returns the markup title of the document, empty if none.
	Title() string

	// Type returns the markup type of the document, empty if none.
	Type() string

	// URL returns the markup url of the document, empty if none.
	URL() string

	// Images returns the properties of all markup images in the document.
	// The first image is the dominant (i.e. top or salient) one.
	Images() []data.MarkupImage

	// Description returns the markup description of the document, empty if none.
	Description() string

	// Publisher returns the markup publisher of the document, empty if none.
	Publisher() string

	// Copyright returns the markup copyright of the document, empty if none.
	Copyright() string

	// Author returns the full name of the markup author, empty if none.
	Author() string

	// Article returns the properties of the markup "article" object, nil if none.
	Article() *data.MarkupArticle

	// OptOut returns true if page owner has opted out of distillation.
	OptOut() bool
}
26 | 27 | package schemaorg 28 | 29 | import ( 30 | "strconv" 31 | "strings" 32 | 33 | "github.com/markusmobius/go-domdistiller/data" 34 | "golang.org/x/net/html" 35 | ) 36 | 37 | type ImageItem struct { 38 | BaseThingItem 39 | } 40 | 41 | func NewImageItem(element *html.Node) *ImageItem { 42 | item := &ImageItem{} 43 | item.init(Image, element) 44 | item.addStringPropertyName(ContentURLProp) 45 | item.addStringPropertyName(EncodingFormatProp) 46 | item.addStringPropertyName(CaptionProp) 47 | item.addStringPropertyName(RepresentativeProp) 48 | item.addStringPropertyName(WidthProp) 49 | item.addStringPropertyName(HeightProp) 50 | return item 51 | } 52 | 53 | func (ii *ImageItem) isRepresentativeOfPage() bool { 54 | propValue := ii.getStringProperty(RepresentativeProp) 55 | return strings.ToLower(propValue) == "true" 56 | } 57 | 58 | func (ii *ImageItem) getImage() *data.MarkupImage { 59 | width, _ := strconv.Atoi(ii.getStringProperty(WidthProp)) 60 | height, _ := strconv.Atoi(ii.getStringProperty(HeightProp)) 61 | imageURL := ii.getStringProperty(ContentURLProp) 62 | if imageURL == "" { 63 | imageURL = ii.getStringProperty(URLProp) 64 | } 65 | 66 | return &data.MarkupImage{ 67 | URL: imageURL, 68 | Type: ii.getStringProperty(EncodingFormatProp), 69 | Caption: ii.getStringProperty(CaptionProp), 70 | Width: width, 71 | Height: height, 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /internal/webdoc/table.go: -------------------------------------------------------------------------------- 1 | // ORIGINAL: java/webdocument/WebTable.java 2 | 3 | // Copyright (c) 2020 Markus Mobius 4 | // 5 | // Permission is hereby granted, free of charge, to any person obtaining a copy 6 | // of this software and associated documentation files (the "Software"), to deal 7 | // in the Software without restriction, including without limitation the rights 8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | 
// copies of the Software, and to permit persons to whom the Software is 10 | // furnished to do so, subject to the following conditions: 11 | // 12 | // The above copyright notice and this permission notice shall be included in all 13 | // copies or substantial portions of the Software. 14 | // 15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | // SOFTWARE. 22 | 23 | // Copyright 2014 The Chromium Authors. All rights reserved. 24 | // Use of this source code is governed by a BSD-style license that can be 25 | // found in the LICENSE file. 26 | 27 | package webdoc 28 | 29 | import ( 30 | "fmt" 31 | nurl "net/url" 32 | 33 | "github.com/go-shiori/dom" 34 | "github.com/markusmobius/go-domdistiller/internal/domutil" 35 | "golang.org/x/net/html" 36 | ) 37 | 38 | type Table struct { 39 | BaseElement 40 | 41 | Element *html.Node 42 | PageURL *nurl.URL 43 | 44 | cloned *html.Node 45 | } 46 | 47 | func (t *Table) ElementType() string { 48 | return "table" 49 | } 50 | 51 | func (t *Table) GenerateOutput(textOnly bool) string { 52 | if t.cloned == nil { 53 | t.cloned = domutil.CloneAndProcessTree(t.Element, t.PageURL) 54 | } 55 | 56 | if textOnly { 57 | return domutil.InnerText(t.cloned) 58 | } 59 | 60 | return dom.OuterHTML(t.cloned) 61 | } 62 | 63 | // GetImageURLs returns list of source URLs of all image inside the table. 
64 | func (t *Table) GetImageURLs() []string { 65 | if t.cloned == nil { 66 | t.cloned = domutil.CloneAndProcessTree(t.Element, t.PageURL) 67 | } 68 | 69 | imgURLs := []string{} 70 | for _, img := range dom.QuerySelectorAll(t.cloned, "img,source") { 71 | src := dom.GetAttribute(img, "src") 72 | if src != "" { 73 | imgURLs = append(imgURLs, src) 74 | } 75 | 76 | imgURLs = append(imgURLs, domutil.GetAllSrcSetURLs(img)...) 77 | } 78 | 79 | return imgURLs 80 | } 81 | 82 | func (t *Table) String() string { 83 | return fmt.Sprintf("ELEMENT %q: html=%q, is_content=%v", 84 | t.ElementType(), dom.OuterHTML(t.Element), t.isContent) 85 | } 86 | -------------------------------------------------------------------------------- /internal/webdoc/text-document_test.go: -------------------------------------------------------------------------------- 1 | // ORIGINAL: javatest/TextDocumentStatisticsTest.java 2 | 3 | // Copyright (c) 2020 Markus Mobius 4 | // 5 | // Permission is hereby granted, free of charge, to any person obtaining a copy 6 | // of this software and associated documentation files (the "Software"), to deal 7 | // in the Software without restriction, including without limitation the rights 8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | // copies of the Software, and to permit persons to whom the Software is 10 | // furnished to do so, subject to the following conditions: 11 | // 12 | // The above copyright notice and this permission notice shall be included in all 13 | // copies or substantial portions of the Software. 14 | // 15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | // SOFTWARE. 22 | 23 | // Copyright 2014 The Chromium Authors. All rights reserved. 24 | // Use of this source code is governed by a BSD-style license that can be 25 | // found in the LICENSE file. 26 | 27 | package webdoc_test 28 | 29 | import ( 30 | "testing" 31 | 32 | "github.com/markusmobius/go-domdistiller/internal/stringutil" 33 | "github.com/markusmobius/go-domdistiller/internal/testutil" 34 | "github.com/stretchr/testify/assert" 35 | ) 36 | 37 | const ThreeWords = "I love statistics" 38 | 39 | func Test_WebDoc_TextDocument_OnlyContent(t *testing.T) { 40 | builder := testutil.NewTextDocumentBuilder(stringutil.FastWordCounter{}) 41 | builder.AddContentBlock(ThreeWords) 42 | builder.AddContentBlock(ThreeWords) 43 | builder.AddContentBlock(ThreeWords) 44 | 45 | doc := builder.Build() 46 | assert.Equal(t, 9, doc.CountWordsInContent()) 47 | } 48 | 49 | func Test_WebDoc_TextDocument_OnlyNonContent(t *testing.T) { 50 | builder := testutil.NewTextDocumentBuilder(stringutil.FastWordCounter{}) 51 | builder.AddNonContentBlock(ThreeWords) 52 | builder.AddNonContentBlock(ThreeWords) 53 | builder.AddNonContentBlock(ThreeWords) 54 | 55 | doc := builder.Build() 56 | assert.Equal(t, 0, doc.CountWordsInContent()) 57 | } 58 | 59 | func Test_WebDoc_TextDocument_MixedContent(t *testing.T) { 60 | builder := testutil.NewTextDocumentBuilder(stringutil.FastWordCounter{}) 61 | builder.AddContentBlock(ThreeWords) 62 | builder.AddNonContentBlock(ThreeWords) 63 | builder.AddContentBlock(ThreeWords) 64 | builder.AddNonContentBlock(ThreeWords) 65 | 66 | doc := builder.Build() 67 | assert.Equal(t, 6, doc.CountWordsInContent()) 68 | } 69 | 
-------------------------------------------------------------------------------- /internal/domutil/tree-clone.go: -------------------------------------------------------------------------------- 1 | // ORIGINAL: java/TreeCloneBuilder.java 2 | 3 | // Copyright (c) 2020 Markus Mobius 4 | // 5 | // Permission is hereby granted, free of charge, to any person obtaining a copy 6 | // of this software and associated documentation files (the "Software"), to deal 7 | // in the Software without restriction, including without limitation the rights 8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | // copies of the Software, and to permit persons to whom the Software is 10 | // furnished to do so, subject to the following conditions: 11 | // 12 | // The above copyright notice and this permission notice shall be included in all 13 | // copies or substantial portions of the Software. 14 | // 15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | // SOFTWARE. 22 | 23 | // Copyright 2015 The Chromium Authors. All rights reserved. 24 | // Use of this source code is governed by a BSD-style license that can be 25 | // found in the LICENSE file. 26 | 27 | package domutil 28 | 29 | import ( 30 | "golang.org/x/net/html" 31 | ) 32 | 33 | // TreeClone takes a list of nodes and returns a clone of the minimum tree in the 34 | // DOM that contains all of them. 
This is done by going through each node, cloning its 35 | // parent and adding children to that parent until the next node is not contained in 36 | // that parent (originally). The list cannot contain a parent of any of the other nodes. 37 | // Children of the nodes in the provided list are excluded. 38 | // 39 | // This implementation doesn't come from the original dom-distiller code. Instead I 40 | // created it from scratch to make it simpler and more Go idiomatic. 41 | func TreeClone(nodes []*html.Node) *html.Node { 42 | // Get the nearest ancestor 43 | allAncestors, nearestAncestor := GetAncestors(nodes...) 44 | if nearestAncestor == nil { 45 | return nil 46 | } 47 | 48 | // Clone the ancestor and childrens that required to reach specified nodes 49 | var fnClone func(src *html.Node) *html.Node 50 | fnClone = func(src *html.Node) *html.Node { 51 | clone := &html.Node{ 52 | Type: src.Type, 53 | DataAtom: src.DataAtom, 54 | Data: src.Data, 55 | Attr: append([]html.Attribute{}, src.Attr...), 56 | } 57 | 58 | for child := src.FirstChild; child != nil; child = child.NextSibling { 59 | if _, exist := allAncestors[child]; exist { 60 | clone.AppendChild(fnClone(child)) 61 | } 62 | } 63 | 64 | return clone 65 | } 66 | 67 | return fnClone(nearestAncestor) 68 | } 69 | -------------------------------------------------------------------------------- /internal/filter/simple/label-to-boilerplate.go: -------------------------------------------------------------------------------- 1 | // ORIGINAL: java/filters/simple/LabelToBoilerplateFilter.java 2 | 3 | // Copyright (c) 2020 Markus Mobius 4 | // 5 | // Permission is hereby granted, free of charge, to any person obtaining a copy 6 | // of this software and associated documentation files (the "Software"), to deal 7 | // in the Software without restriction, including without limitation the rights 8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | // copies of the Software, and to permit persons 
to whom the Software is 10 | // furnished to do so, subject to the following conditions: 11 | // 12 | // The above copyright notice and this permission notice shall be included in all 13 | // copies or substantial portions of the Software. 14 | // 15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | // SOFTWARE. 22 | 23 | // Copyright 2015 The Chromium Authors. All rights reserved. 24 | // Use of this source code is governed by a BSD-style license that can be 25 | // found in the LICENSE file. 26 | 27 | // boilerpipe 28 | // 29 | // Copyright (c) 2009 Christian Kohlschütter 30 | // 31 | // The author licenses this file to You under the Apache License, Version 2.0 32 | // (the "License"); you may not use this file except in compliance with 33 | // the License. You may obtain a copy of the License at 34 | // 35 | // http://www.apache.org/licenses/LICENSE-2.0 36 | // 37 | // Unless required by applicable law or agreed to in writing, software 38 | // distributed under the License is distributed on an "AS IS" BASIS, 39 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 40 | // See the License for the specific language governing permissions and 41 | // limitations under the License. 42 | 43 | package simple 44 | 45 | import ( 46 | "github.com/markusmobius/go-domdistiller/internal/webdoc" 47 | ) 48 | 49 | // LabelToBoilerplate marks all blocks that contain a given label as "boilerplate". 
50 | type LabelToBoilerplate struct { 51 | labels []string 52 | } 53 | 54 | func NewLabelToBoilerplate(labels ...string) *LabelToBoilerplate { 55 | return &LabelToBoilerplate{labels: labels} 56 | } 57 | 58 | func (f *LabelToBoilerplate) Process(doc *webdoc.TextDocument) bool { 59 | changes := false 60 | 61 | blockLoop: 62 | for _, tb := range doc.TextBlocks { 63 | if tb.IsContent() { 64 | for _, label := range f.labels { 65 | if tb.HasLabel(label) { 66 | tb.SetIsContent(false) 67 | changes = true 68 | continue blockLoop 69 | } 70 | } 71 | } 72 | } 73 | 74 | return changes 75 | } 76 | -------------------------------------------------------------------------------- /internal/filter/docfilter/scorer/image-dom-distance.go: -------------------------------------------------------------------------------- 1 | // ORIGINAL: java/webdocument/filters/images/DomDistanceScorer.java 2 | 3 | // Copyright (c) 2020 Markus Mobius 4 | // 5 | // Permission is hereby granted, free of charge, to any person obtaining a copy 6 | // of this software and associated documentation files (the "Software"), to deal 7 | // in the Software without restriction, including without limitation the rights 8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | // copies of the Software, and to permit persons to whom the Software is 10 | // furnished to do so, subject to the following conditions: 11 | // 12 | // The above copyright notice and this permission notice shall be included in all 13 | // copies or substantial portions of the Software. 14 | // 15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | // SOFTWARE. 22 | 23 | // Copyright 2014 The Chromium Authors. All rights reserved. 24 | // Use of this source code is governed by a BSD-style license that can be 25 | // found in the LICENSE file. 26 | 27 | package scorer 28 | 29 | import ( 30 | "github.com/markusmobius/go-domdistiller/internal/domutil" 31 | "golang.org/x/net/html" 32 | ) 33 | 34 | // ImageDomDistanceScorer uses DOM distance as its heuristic. 35 | type ImageDomDistanceScorer struct { 36 | maxScore int 37 | firstContentNode *html.Node 38 | } 39 | 40 | // NewImageDomDistanceScorer returns and initiates the ImageDomDistanceScorer. 41 | func NewImageDomDistanceScorer(maxScore int, firstContent *html.Node) *ImageDomDistanceScorer { 42 | return &ImageDomDistanceScorer{ 43 | maxScore: maxScore, 44 | firstContentNode: firstContent, 45 | } 46 | } 47 | 48 | func (s *ImageDomDistanceScorer) GetImageScore(node *html.Node) int { 49 | var score int 50 | if node != nil { 51 | score = s.compute(node) 52 | } 53 | 54 | if score < s.maxScore { 55 | return score 56 | } 57 | 58 | return s.maxScore 59 | } 60 | 61 | func (s *ImageDomDistanceScorer) GetMaxScore() int { 62 | return s.maxScore 63 | } 64 | 65 | func (s *ImageDomDistanceScorer) compute(node *html.Node) int { 66 | if s.firstContentNode == nil { 67 | return 0 68 | } 69 | 70 | depthDiff := domutil.GetNodeDepth(s.firstContentNode) - 71 | domutil.GetNodeDepth(domutil.GetNearestCommonAncestor(s.firstContentNode, node)) 72 | 73 | var multiplier float64 74 | if depthDiff < 4 { 75 | multiplier = 1 76 | } else if depthDiff < 6 { 77 | multiplier = 0.6 78 | } else if depthDiff < 8 { 79 | multiplier = 0.2 80 | } 81 | 82 | return int(float64(s.maxScore) * multiplier) 83 | } 84 | 
-------------------------------------------------------------------------------- /internal/domutil/tree-clone_test.go: -------------------------------------------------------------------------------- 1 | // ORIGINAL: javatest/TreeCloneBuilderTest.java 2 | 3 | // Copyright (c) 2020 Markus Mobius 4 | // 5 | // Permission is hereby granted, free of charge, to any person obtaining a copy 6 | // of this software and associated documentation files (the "Software"), to deal 7 | // in the Software without restriction, including without limitation the rights 8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | // copies of the Software, and to permit persons to whom the Software is 10 | // furnished to do so, subject to the following conditions: 11 | // 12 | // The above copyright notice and this permission notice shall be included in all 13 | // copies or substantial portions of the Software. 14 | // 15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | // SOFTWARE. 22 | 23 | // Copyright 2015 The Chromium Authors. All rights reserved. 24 | // Use of this source code is governed by a BSD-style license that can be 25 | // found in the LICENSE file. 
26 | 27 | package domutil_test 28 | 29 | import ( 30 | "regexp" 31 | "testing" 32 | 33 | "github.com/go-shiori/dom" 34 | "github.com/markusmobius/go-domdistiller/internal/domutil" 35 | "github.com/markusmobius/go-domdistiller/internal/testutil" 36 | "github.com/stretchr/testify/assert" 37 | "golang.org/x/net/html" 38 | ) 39 | 40 | func Test_DomUtil_TreeClone_FullTreeBuilder(t *testing.T) { 41 | expectedHTML := ` 42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
` 56 | 57 | expectedHTML = regexp.MustCompile(`(?mi)^\s+`).ReplaceAllString(expectedHTML, "") 58 | expectedHTML = regexp.MustCompile(`(?i)\n`).ReplaceAllString(expectedHTML, "") 59 | 60 | divs := testutil.CreateDivTree() 61 | leafNodes := []*html.Node{divs[3], divs[4], divs[5], divs[14]} 62 | 63 | root := domutil.TreeClone(leafNodes) 64 | assert.Equal(t, expectedHTML, dom.OuterHTML(root)) 65 | } 66 | 67 | func Test_DomUtil_TreeClone_SingleLeafNode(t *testing.T) { 68 | leafNodes := []*html.Node{dom.CreateTextNode("some content")} 69 | 70 | root := domutil.TreeClone(leafNodes) 71 | assert.Equal(t, dom.TextContent(leafNodes[0]), dom.TextContent(root)) 72 | } 73 | 74 | func Test_DomUtil_TreeClone_NoCommonAncestors(t *testing.T) { 75 | divs := testutil.CreateDivTree() 76 | leafNodes := []*html.Node{divs[3], dom.CreateElement("div")} 77 | 78 | root := domutil.TreeClone(leafNodes) 79 | assert.Nil(t, root) 80 | } 81 | -------------------------------------------------------------------------------- /internal/label/label.go: -------------------------------------------------------------------------------- 1 | // ORIGINAL: java/labels/DefaultLabels.java 2 | 3 | // Label package contains some pre-defined labels for web element. 4 | 5 | // Copyright (c) 2020 Markus Mobius 6 | // 7 | // Permission is hereby granted, free of charge, to any person obtaining a copy 8 | // of this software and associated documentation files (the "Software"), to deal 9 | // in the Software without restriction, including without limitation the rights 10 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | // copies of the Software, and to permit persons to whom the Software is 12 | // furnished to do so, subject to the following conditions: 13 | // 14 | // The above copyright notice and this permission notice shall be included in all 15 | // copies or substantial portions of the Software. 
16 | // 17 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | // SOFTWARE. 24 | 25 | // Copyright 2014 The Chromium Authors. All rights reserved. 26 | // Use of this source code is governed by a BSD-style license that can be 27 | // found in the LICENSE file. 28 | 29 | // boilerpipe 30 | // 31 | // Copyright (c) 2009 Christian Kohlschütter 32 | // 33 | // The author licenses this file to You under the Apache License, Version 2.0 34 | // (the "License"); you may not use this file except in compliance with 35 | // the License. You may obtain a copy of the License at 36 | // 37 | // http://www.apache.org/licenses/LICENSE-2.0 38 | // 39 | // Unless required by applicable law or agreed to in writing, software 40 | // distributed under the License is distributed on an "AS IS" BASIS, 41 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 42 | // See the License for the specific language governing permissions and 43 | // limitations under the License. 
// boilerpipePrefix is the namespace prefix shared by the labels below that
// keep their boilerpipe-style names.
const boilerpipePrefix = "de.l3s.boilerpipe/"

// Pre-defined labels for web elements.
const (
	// Labels carrying the boilerpipe namespace prefix.
	Title             = boilerpipePrefix + "TITLE"
	ArticleMetadata   = boilerpipePrefix + "ARTICLE_METADATA"
	MightBeContent    = boilerpipePrefix + "MIGHT_BE_CONTENT"
	VeryLikelyContent = boilerpipePrefix + "VERY_LIKELY_CONTENT"
	Hr                = boilerpipePrefix + "HR"
	Li                = boilerpipePrefix + "LI"
	Heading           = boilerpipePrefix + "HEADING"
	H1                = boilerpipePrefix + "H1"
	H2                = boilerpipePrefix + "H2"
	H3                = boilerpipePrefix + "H3"

	// MarkupPrefix has no namespace prefix of its own.
	MarkupPrefix = "<"

	// Labels without the boilerpipe namespace prefix.
	BoilerplateHeadingFused = "BOILERPLATE_HEADING_FUSED"
	StrictlyNotContent      = "STRICTLY_NOT_CONTENT"
	SiblingOfMainContent    = "SIBLING_OF_MAIN_CONTENT"
)
IN NO EVENT SHALL THE 18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | // SOFTWARE. 22 | 23 | // Copyright 2015 The Chromium Authors. All rights reserved. 24 | // Use of this source code is governed by a BSD-style license that can be 25 | // found in the LICENSE file. 26 | 27 | package webdoc 28 | 29 | import ( 30 | "fmt" 31 | nurl "net/url" 32 | 33 | "github.com/go-shiori/dom" 34 | "github.com/markusmobius/go-domdistiller/internal/domutil" 35 | "github.com/markusmobius/go-domdistiller/internal/stringutil" 36 | "golang.org/x/net/html" 37 | ) 38 | 39 | type Video struct { 40 | BaseElement 41 | 42 | // TODO: Handle multiple nested "source" and "track" tags. 43 | Element *html.Node 44 | Width int 45 | Height int 46 | PageURL *nurl.URL 47 | } 48 | 49 | func NewVideo(node *html.Node, pageURL *nurl.URL, width, height int) *Video { 50 | return &Video{ 51 | Element: node, 52 | PageURL: pageURL, 53 | Width: width, 54 | Height: height, 55 | } 56 | } 57 | 58 | func (v *Video) ElementType() string { 59 | return "video" 60 | } 61 | 62 | func (v *Video) GenerateOutput(textOnly bool) string { 63 | if textOnly { 64 | return "" 65 | } 66 | 67 | vNode := dom.Clone(v.Element, false) 68 | for _, child := range dom.Children(v.Element) { 69 | childTag := dom.TagName(child) 70 | if childTag == "source" || childTag == "track" { 71 | dom.AppendChild(vNode, dom.Clone(child, false)) 72 | } 73 | } 74 | 75 | if poster := dom.GetAttribute(vNode, "poster"); poster != "" { 76 | poster = stringutil.CreateAbsoluteURL(poster, v.PageURL) 77 | dom.SetAttribute(vNode, "poster", poster) 78 | } 79 | 80 | domutil.MakeAllSrcAttributesAbsolute(vNode, v.PageURL) 81 | domutil.StripAttributes(vNode) 82 | return dom.OuterHTML(vNode) 83 | } 84 | 85 | func (v *Video) String() string { 86 | return 
fmt.Sprintf("ELEMENT %q: html=%q, is_content=%v", 87 | v.ElementType(), dom.OuterHTML(v.Element), v.isContent) 88 | } 89 | -------------------------------------------------------------------------------- /internal/filter/heuristic/list-at-end.go: -------------------------------------------------------------------------------- 1 | // ORIGINAL: java/filters/heuristics/ListAtEndFilter.java 2 | 3 | // Copyright (c) 2020 Markus Mobius 4 | // 5 | // Permission is hereby granted, free of charge, to any person obtaining a copy 6 | // of this software and associated documentation files (the "Software"), to deal 7 | // in the Software without restriction, including without limitation the rights 8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | // copies of the Software, and to permit persons to whom the Software is 10 | // furnished to do so, subject to the following conditions: 11 | // 12 | // The above copyright notice and this permission notice shall be included in all 13 | // copies or substantial portions of the Software. 14 | // 15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | // SOFTWARE. 22 | 23 | // Copyright 2015 The Chromium Authors. All rights reserved. 24 | // Use of this source code is governed by a BSD-style license that can be 25 | // found in the LICENSE file. 
26 | 27 | // boilerpipe 28 | // 29 | // Copyright (c) 2009 Christian Kohlschütter 30 | // 31 | // The author licenses this file to You under the Apache License, Version 2.0 32 | // (the "License"); you may not use this file except in compliance with 33 | // the License. You may obtain a copy of the License at 34 | // 35 | // http://www.apache.org/licenses/LICENSE-2.0 36 | // 37 | // Unless required by applicable law or agreed to in writing, software 38 | // distributed under the License is distributed on an "AS IS" BASIS, 39 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 40 | // See the License for the specific language governing permissions and 41 | // limitations under the License. 42 | 43 | package heuristic 44 | 45 | import ( 46 | "math" 47 | 48 | "github.com/markusmobius/go-domdistiller/internal/label" 49 | "github.com/markusmobius/go-domdistiller/internal/webdoc" 50 | ) 51 | 52 | // ListAtEnd marks nested list-item blocks after the end of the main content. 
53 | type ListAtEnd struct{} 54 | 55 | func NewListAtEnd() *ListAtEnd { 56 | return &ListAtEnd{} 57 | } 58 | 59 | func (f *ListAtEnd) Process(doc *webdoc.TextDocument) bool { 60 | changes := false 61 | tagLevel := math.MaxInt16 62 | 63 | for _, tb := range doc.TextBlocks { 64 | if tb.IsContent() && tb.HasLabel(label.VeryLikelyContent) { 65 | tagLevel = tb.TagLevel 66 | continue 67 | } 68 | 69 | if tb.TagLevel > tagLevel && tb.HasLabel(label.MightBeContent) && 70 | tb.HasLabel(label.Li) && tb.LinkDensity == 0 { 71 | tb.SetIsContent(true) 72 | changes = true 73 | continue 74 | } 75 | 76 | tagLevel = math.MaxInt16 77 | } 78 | 79 | return changes 80 | } 81 | -------------------------------------------------------------------------------- /internal/webdoc/embed.go: -------------------------------------------------------------------------------- 1 | // ORIGINAL: java/webdocument/WebEmbed.java 2 | 3 | // Copyright (c) 2020 Markus Mobius 4 | // 5 | // Permission is hereby granted, free of charge, to any person obtaining a copy 6 | // of this software and associated documentation files (the "Software"), to deal 7 | // in the Software without restriction, including without limitation the rights 8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | // copies of the Software, and to permit persons to whom the Software is 10 | // furnished to do so, subject to the following conditions: 11 | // 12 | // The above copyright notice and this permission notice shall be included in all 13 | // copies or substantial portions of the Software. 14 | // 15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | // SOFTWARE. 22 | 23 | // Copyright 2015 The Chromium Authors. All rights reserved. 24 | // Use of this source code is governed by a BSD-style license that can be 25 | // found in the LICENSE file. 26 | 27 | package webdoc 28 | 29 | import ( 30 | "fmt" 31 | 32 | "github.com/go-shiori/dom" 33 | "github.com/markusmobius/go-domdistiller/internal/domutil" 34 | "golang.org/x/net/html" 35 | ) 36 | 37 | // Embed is the base class for many site-specific embedded 38 | // elements (Twitter, YouTube, etc.). 39 | type Embed struct { 40 | BaseElement 41 | 42 | Element *html.Node 43 | ID string 44 | Type string 45 | Params map[string]string 46 | } 47 | 48 | func (e *Embed) ElementType() string { 49 | return "embed" 50 | } 51 | 52 | func (e *Embed) GenerateOutput(textOnly bool) string { 53 | if textOnly { 54 | return "" 55 | } 56 | 57 | embed := dom.CreateElement("div") 58 | dom.SetAttribute(embed, "class", "embed-placeholder") 59 | dom.SetAttribute(embed, "data-type", e.Type) 60 | dom.SetAttribute(embed, "data-id", e.ID) 61 | 62 | // Radhi: 63 | // I just realize the embed element never used in original dom-distiller. No wonder 64 | // Chromium doesn't render any embedded element. To be fair Readability.js doesn't 65 | // render some embed as well citing security concerns. In my opinion since dom- 66 | // distiller usually only used in page that we already visit, the embedded iframe 67 | // should automatically be trustworthy enough. 68 | // TODO: Maybe just to be save we should sanitize it. 
69 | tagName := dom.TagName(e.Element) 70 | if tagName == "blockquote" || tagName == "iframe" { 71 | domutil.StripAttributes(e.Element) 72 | dom.AppendChild(embed, e.Element) 73 | } 74 | 75 | return dom.OuterHTML(embed) 76 | } 77 | 78 | func (e *Embed) String() string { 79 | return fmt.Sprintf("ELEMENT %q: type=%q id=%q, is_content=%v", 80 | e.ElementType(), e.Type, e.ID, e.isContent) 81 | } 82 | -------------------------------------------------------------------------------- /internal/markup/opengraph/constant.go: -------------------------------------------------------------------------------- 1 | // ORIGINAL: java/OpenGraphProtocolParser.java 2 | 3 | // Copyright (c) 2020 Markus Mobius 4 | // 5 | // Permission is hereby granted, free of charge, to any person obtaining a copy 6 | // of this software and associated documentation files (the "Software"), to deal 7 | // in the Software without restriction, including without limitation the rights 8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | // copies of the Software, and to permit persons to whom the Software is 10 | // furnished to do so, subject to the following conditions: 11 | // 12 | // The above copyright notice and this permission notice shall be included in all 13 | // copies or substantial portions of the Software. 14 | // 15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | // SOFTWARE. 22 | 23 | // Copyright 2014 The Chromium Authors. All rights reserved. 
24 | // Use of this source code is governed by a BSD-style license that can be 25 | // found in the LICENSE file. 26 | 27 | package opengraph 28 | 29 | const ( 30 | TitleProp = "title" 31 | TypeProp = "type" 32 | ImageProp = "image" 33 | URLProp = "url" 34 | DescriptionProp = "description" 35 | SiteNameProp = "site_name" 36 | ImageStructPropPfx = "image:" 37 | ImageURLProp = "image:url" 38 | ImageSecureURLProp = "image:secure_url" 39 | ImageTypeProp = "image:type" 40 | ImageWidthProp = "image:width" 41 | ImageHeightProp = "image:height" 42 | ProfileFirstnameProp = "first_name" 43 | ProfileLastnameProp = "last_name" 44 | ArticleSectionProp = "section" 45 | ArticlePublishedTimeProp = "published_time" 46 | ArticleModifiedTimeProp = "modified_time" 47 | ArticleExpirationTimeProp = "expiration_time" 48 | ArticleAuthorProp = "author" 49 | ProfileObjtype = "profile" 50 | ArticleObjtype = "article" 51 | 52 | doPrefixFiltering = true 53 | ) 54 | 55 | var importantProperties = []struct { 56 | Name string 57 | Prefix Prefix 58 | Type string 59 | }{ 60 | {TitleProp, OG, ""}, 61 | {TypeProp, OG, ""}, 62 | {URLProp, OG, ""}, 63 | {DescriptionProp, OG, ""}, 64 | {SiteNameProp, OG, ""}, 65 | {ImageProp, OG, "image"}, 66 | {ImageStructPropPfx, OG, "image"}, 67 | {ProfileFirstnameProp, Profile, "profile"}, 68 | {ProfileLastnameProp, Profile, "profile"}, 69 | {ArticleSectionProp, Article, "article"}, 70 | {ArticlePublishedTimeProp, Article, "article"}, 71 | {ArticleModifiedTimeProp, Article, "article"}, 72 | {ArticleExpirationTimeProp, Article, "article"}, 73 | {ArticleAuthorProp, Article, "article"}, 74 | } 75 | -------------------------------------------------------------------------------- /internal/testutil/text-document-builder.go: -------------------------------------------------------------------------------- 1 | // ORIGINAL: javatest/TestTextDocumentBuilder.java 2 | 3 | // Copyright (c) 2020 Markus Mobius 4 | // 5 | // Permission is hereby granted, free of charge, to any 
person obtaining a copy 6 | // of this software and associated documentation files (the "Software"), to deal 7 | // in the Software without restriction, including without limitation the rights 8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | // copies of the Software, and to permit persons to whom the Software is 10 | // furnished to do so, subject to the following conditions: 11 | // 12 | // The above copyright notice and this permission notice shall be included in all 13 | // copies or substantial portions of the Software. 14 | // 15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | // SOFTWARE. 22 | 23 | // Copyright 2014 The Chromium Authors. All rights reserved. 24 | // Use of this source code is governed by a BSD-style license that can be 25 | // found in the LICENSE file. 
26 | 27 | package testutil 28 | 29 | import ( 30 | "net/url" 31 | 32 | "github.com/markusmobius/go-domdistiller/internal/converter" 33 | "github.com/markusmobius/go-domdistiller/internal/stringutil" 34 | "github.com/markusmobius/go-domdistiller/internal/webdoc" 35 | "golang.org/x/net/html" 36 | ) 37 | 38 | type TextDocumentBuilder struct { 39 | textBlocks []*webdoc.TextBlock 40 | textBuilder *TextBuilder 41 | } 42 | 43 | func NewTextDocumentBuilder(wc stringutil.WordCounter) *TextDocumentBuilder { 44 | return &TextDocumentBuilder{ 45 | textBuilder: NewTextBuilder(wc), 46 | } 47 | } 48 | 49 | func (tdb *TextDocumentBuilder) AddContentBlock(str string, labels ...string) *webdoc.TextBlock { 50 | tb := tdb.addBlock(str, labels...) 51 | tb.SetIsContent(true) 52 | return tb 53 | } 54 | 55 | func (tdb *TextDocumentBuilder) AddNonContentBlock(str string, labels ...string) *webdoc.TextBlock { 56 | tb := tdb.addBlock(str, labels...) 57 | tb.SetIsContent(false) 58 | return tb 59 | } 60 | 61 | func (tdb *TextDocumentBuilder) Build() *webdoc.TextDocument { 62 | return webdoc.NewTextDocument(tdb.textBlocks) 63 | } 64 | 65 | func (tdb *TextDocumentBuilder) addBlock(str string, labels ...string) *webdoc.TextBlock { 66 | wt := tdb.textBuilder.CreateForText(str) 67 | for _, label := range labels { 68 | wt.AddLabel(label) 69 | } 70 | 71 | tdb.textBlocks = append(tdb.textBlocks, webdoc.NewTextBlock(wt)) 72 | return tdb.textBlocks[len(tdb.textBlocks)-1] 73 | } 74 | 75 | func NewTextDocumentFromPage(doc *html.Node, wc stringutil.WordCounter, pageURL *url.URL) *webdoc.TextDocument { 76 | builder := webdoc.NewWebDocumentBuilder(wc, pageURL) 77 | converter.NewDomConverter(converter.Default, builder, pageURL, nil).Convert(doc) 78 | return builder.Build().CreateTextDocument() 79 | } 80 | -------------------------------------------------------------------------------- /internal/filter/simple/boilerplate-block.go: -------------------------------------------------------------------------------- 
1 | // ORIGINAL: java/filters/simple/BoilerplateBlockFilter.java 2 | 3 | // Copyright (c) 2020 Markus Mobius 4 | // 5 | // Permission is hereby granted, free of charge, to any person obtaining a copy 6 | // of this software and associated documentation files (the "Software"), to deal 7 | // in the Software without restriction, including without limitation the rights 8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | // copies of the Software, and to permit persons to whom the Software is 10 | // furnished to do so, subject to the following conditions: 11 | // 12 | // The above copyright notice and this permission notice shall be included in all 13 | // copies or substantial portions of the Software. 14 | // 15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | // SOFTWARE. 22 | 23 | // Copyright 2015 The Chromium Authors. All rights reserved. 24 | // Use of this source code is governed by a BSD-style license that can be 25 | // found in the LICENSE file. 26 | 27 | // boilerpipe 28 | // 29 | // Copyright (c) 2009 Christian Kohlschütter 30 | // 31 | // The author licenses this file to You under the Apache License, Version 2.0 32 | // (the "License"); you may not use this file except in compliance with 33 | // the License. 
You may obtain a copy of the License at 34 | // 35 | // http://www.apache.org/licenses/LICENSE-2.0 36 | // 37 | // Unless required by applicable law or agreed to in writing, software 38 | // distributed under the License is distributed on an "AS IS" BASIS, 39 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 40 | // See the License for the specific language governing permissions and 41 | // limitations under the License. 42 | 43 | package simple 44 | 45 | import ( 46 | "github.com/markusmobius/go-domdistiller/internal/label" 47 | "github.com/markusmobius/go-domdistiller/internal/webdoc" 48 | ) 49 | 50 | // BoilerplateBlock removes TextBlocks which have explicitly been 51 | // marked as "not content". 52 | type BoilerplateBlock struct { 53 | labelToKeep string 54 | } 55 | 56 | func NewBoilerplateBlock(labelToKeep string) *BoilerplateBlock { 57 | return &BoilerplateBlock{labelToKeep: labelToKeep} 58 | } 59 | 60 | func (f *BoilerplateBlock) Process(doc *webdoc.TextDocument) bool { 61 | hasChanges := false 62 | textBlocks := doc.TextBlocks 63 | 64 | for i := 0; i < len(textBlocks); i++ { 65 | tb := textBlocks[i] 66 | if !tb.IsContent() && (f.labelToKeep != "" || !tb.HasLabel(label.Title)) { 67 | hasChanges = true 68 | 69 | // These lines is used to remove item from array. 
70 | copy(textBlocks[i:], textBlocks[i+1:]) 71 | textBlocks[len(textBlocks)-1] = nil 72 | textBlocks = textBlocks[:len(textBlocks)-1] 73 | i-- 74 | } 75 | } 76 | 77 | doc.TextBlocks = textBlocks 78 | return hasChanges 79 | } 80 | -------------------------------------------------------------------------------- /internal/webdoc/text-document.go: -------------------------------------------------------------------------------- 1 | // ORIGINAL: java/document/TextDocument.java and 2 | // java/document/TextDocumentStatistics.java 3 | 4 | // Copyright (c) 2020 Markus Mobius 5 | // 6 | // Permission is hereby granted, free of charge, to any person obtaining a copy 7 | // of this software and associated documentation files (the "Software"), to deal 8 | // in the Software without restriction, including without limitation the rights 9 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | // copies of the Software, and to permit persons to whom the Software is 11 | // furnished to do so, subject to the following conditions: 12 | // 13 | // The above copyright notice and this permission notice shall be included in all 14 | // copies or substantial portions of the Software. 15 | // 16 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | // SOFTWARE. 23 | 24 | // Copyright 2014 The Chromium Authors. All rights reserved. 25 | // Use of this source code is governed by a BSD-style license that can be 26 | // found in the LICENSE file. 
27 | // boilerpipe 28 | // 29 | // Copyright (c) 2009 Christian Kohlschütter 30 | // 31 | // The author licenses this file to You under the Apache License, Version 2.0 32 | // (the "License"); you may not use this file except in compliance with 33 | // the License. You may obtain a copy of the License at 34 | // 35 | // http://www.apache.org/licenses/LICENSE-2.0 36 | // 37 | // Unless required by applicable law or agreed to in writing, software 38 | // distributed under the License is distributed on an "AS IS" BASIS, 39 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 40 | // See the License for the specific language governing permissions and 41 | // limitations under the License. 42 | 43 | package webdoc 44 | 45 | import ( 46 | "strings" 47 | ) 48 | 49 | // TextDocument is a text document, consisting of one or more TextBlock. 50 | type TextDocument struct { 51 | TextBlocks []*TextBlock 52 | } 53 | 54 | func NewTextDocument(textBlocks []*TextBlock) *TextDocument { 55 | return &TextDocument{textBlocks} 56 | } 57 | 58 | func (td *TextDocument) ApplyToModel() { 59 | for _, tb := range td.TextBlocks { 60 | tb.ApplyToModel() 61 | } 62 | } 63 | 64 | // CountWordsInContent returns the sum of number of words in content blocks. 65 | func (td *TextDocument) CountWordsInContent() int { 66 | numWords := 0 67 | for _, tb := range td.TextBlocks { 68 | if tb.IsContent() { 69 | numWords += tb.NumWords 70 | } 71 | } 72 | return numWords 73 | } 74 | 75 | // DebugString returns detailed debugging information about the contained TextBlocks. 
76 | func (td *TextDocument) DebugString() string { 77 | var buffer strings.Builder 78 | for _, tb := range td.TextBlocks { 79 | buffer.WriteString(tb.String()) 80 | buffer.WriteString("\n") 81 | } 82 | return buffer.String() 83 | } 84 | -------------------------------------------------------------------------------- /internal/tableclass/type-reason.go: -------------------------------------------------------------------------------- 1 | // ORIGINAL: java/TableClassifier.java 2 | 3 | // Copyright (c) 2020 Markus Mobius 4 | // 5 | // Permission is hereby granted, free of charge, to any person obtaining a copy 6 | // of this software and associated documentation files (the "Software"), to deal 7 | // in the Software without restriction, including without limitation the rights 8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | // copies of the Software, and to permit persons to whom the Software is 10 | // furnished to do so, subject to the following conditions: 11 | // 12 | // The above copyright notice and this permission notice shall be included in all 13 | // copies or substantial portions of the Software. 14 | // 15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | // SOFTWARE. 22 | 23 | // Copyright 2015 The Chromium Authors. All rights reserved. 24 | // Use of this source code is governed by a BSD-style license that can be 25 | // found in the LICENSE file. 
// Reason identifies which rule decided the classification of a table.
type Reason uint

const (
	Unknown Reason = iota
	InsideEditableArea
	RoleTable
	RoleDescendant
	Datatable0
	CaptionTheadTfootColgroupColTh
	AbbrHeadersScope
	OnlyHasAbbr
	More95PercentDocWidth
	Summary
	NestedTable
	LessEq1Row
	LessEq1Col
	MoreEq5Cols
	CellsHaveBorder
	DifferentlyColoredRows
	MoreEq20Rows
	LessEq10Cells
	EmbedObjectAppletIframe
	More90PercentDocHeight
	Default
)

// reasonNames maps every declared Reason to its textual name. It is indexed
// by the constant itself, so the table stays aligned with the const block.
var reasonNames = [...]string{
	Unknown:                        "Unknown",
	InsideEditableArea:             "InsideEditableArea",
	RoleTable:                      "RoleTable",
	RoleDescendant:                 "RoleDescendant",
	Datatable0:                     "Datatable0",
	CaptionTheadTfootColgroupColTh: "CaptionTheadTfootColgroupColTh",
	AbbrHeadersScope:               "AbbrHeadersScope",
	OnlyHasAbbr:                    "OnlyHasAbbr",
	More95PercentDocWidth:          "More95PercentDocWidth",
	Summary:                        "Summary",
	NestedTable:                    "NestedTable",
	LessEq1Row:                     "LessEq1Row",
	LessEq1Col:                     "LessEq1Col",
	MoreEq5Cols:                    "MoreEq5Cols",
	CellsHaveBorder:                "CellsHaveBorder",
	DifferentlyColoredRows:         "DifferentlyColoredRows",
	MoreEq20Rows:                   "MoreEq20Rows",
	LessEq10Cells:                  "LessEq10Cells",
	EmbedObjectAppletIframe:        "EmbedObjectAppletIframe",
	More90PercentDocHeight:         "More90PercentDocHeight",
	Default:                        "Default",
}

// String returns the name of the reason. Values outside the declared
// constants are reported as "Unknown".
func (r Reason) String() string {
	if r >= Reason(len(reasonNames)) {
		return "Unknown"
	}
	return reasonNames[r]
}
javatest/EmbedExtractorTest.java 2 | 3 | // Copyright (c) 2020 Markus Mobius 4 | // 5 | // Permission is hereby granted, free of charge, to any person obtaining a copy 6 | // of this software and associated documentation files (the "Software"), to deal 7 | // in the Software without restriction, including without limitation the rights 8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | // copies of the Software, and to permit persons to whom the Software is 10 | // furnished to do so, subject to the following conditions: 11 | // 12 | // The above copyright notice and this permission notice shall be included in all 13 | // copies or substantial portions of the Software. 14 | // 15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | // SOFTWARE. 22 | 23 | // Copyright 2014 The Chromium Authors. All rights reserved. 24 | // Use of this source code is governed by a BSD-style license that can be 25 | // found in the LICENSE file. 
26 | 27 | package embed_test 28 | 29 | import ( 30 | nurl "net/url" 31 | "testing" 32 | 33 | "github.com/go-shiori/dom" 34 | "github.com/markusmobius/go-domdistiller/internal/extractor/embed" 35 | "github.com/markusmobius/go-domdistiller/internal/webdoc" 36 | "github.com/stretchr/testify/assert" 37 | ) 38 | 39 | func Test_Embed_Vimeo_Extract(t *testing.T) { 40 | vimeo := dom.CreateElement("iframe") 41 | dom.SetAttribute(vimeo, "src", "//player.vimeo.com/video/12345?portrait=0") 42 | 43 | pageURL, _ := nurl.ParseRequestURI("http://example.com") 44 | extractor := embed.NewVimeoExtractor(pageURL, nil) 45 | result, _ := (extractor.Extract(vimeo)).(*webdoc.Embed) 46 | 47 | // Check Vimeo specific attributes 48 | assert.NotNil(t, result) 49 | assert.Equal(t, "vimeo", result.Type) 50 | assert.Equal(t, "12345", result.ID) 51 | assert.Equal(t, "0", result.Params["portrait"]) 52 | 53 | // Begin negative test 54 | wrongDomain := dom.CreateElement("iframe") 55 | dom.SetAttribute(wrongDomain, "src", "http://vimeo.com/video/09876?portrait=1") 56 | 57 | result, _ = (extractor.Extract(wrongDomain)).(*webdoc.Embed) 58 | assert.Nil(t, result) 59 | } 60 | 61 | func Test_Embed_Vimeo_ExtractID(t *testing.T) { 62 | vimeo := dom.CreateElement("iframe") 63 | dom.SetAttribute(vimeo, "src", "http://player.vimeo.com/video/12345?portrait=0") 64 | 65 | extractor := embed.NewVimeoExtractor(nil, nil) 66 | result, _ := (extractor.Extract(vimeo)).(*webdoc.Embed) 67 | 68 | // Check Vimeo specific attributes 69 | assert.NotNil(t, result) 70 | assert.Equal(t, "vimeo", result.Type) 71 | assert.Equal(t, "12345", result.ID) 72 | 73 | // Begin negative test 74 | wrongDomain := dom.CreateElement("iframe") 75 | dom.SetAttribute(wrongDomain, "src", "http://player.vimeo.com/video") 76 | 77 | result, _ = (extractor.Extract(wrongDomain)).(*webdoc.Embed) 78 | assert.Nil(t, result) 79 | } 80 | -------------------------------------------------------------------------------- 
/internal/pagination/parser/param-detector.go: -------------------------------------------------------------------------------- 1 | // ORIGINAL: java/PageParameterDetector.java 2 | 3 | // Copyright (c) 2020 Markus Mobius 4 | // 5 | // Permission is hereby granted, free of charge, to any person obtaining a copy 6 | // of this software and associated documentation files (the "Software"), to deal 7 | // in the Software without restriction, including without limitation the rights 8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | // copies of the Software, and to permit persons to whom the Software is 10 | // furnished to do so, subject to the following conditions: 11 | // 12 | // The above copyright notice and this permission notice shall be included in all 13 | // copies or substantial portions of the Software. 14 | // 15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | // SOFTWARE. 22 | 23 | // Copyright 2015 The Chromium Authors. All rights reserved. 24 | // Use of this source code is governed by a BSD-style license that can be 25 | // found in the LICENSE file. 26 | 27 | package parser 28 | 29 | import ( 30 | nurl "net/url" 31 | 32 | "github.com/markusmobius/go-domdistiller/internal/logutil" 33 | "github.com/markusmobius/go-domdistiller/internal/pagination/info" 34 | ) 35 | 36 | // DetectParamInfo creates a PageParamInfo based on outlinks and numeric text around them. 37 | // Always return PageParamInfo (never nil). 
If no page parameter is detected or 38 | // determined to be best, its ParamType is Unset. 39 | func DetectParamInfo(adjacentNumberGroups *info.MonotonicPageInfoGroups, docURL string, logger logutil.Logger) *info.PageParamInfo { 40 | // Make sure URL absolute and clean it 41 | parsedDocURL, err := nurl.ParseRequestURI(docURL) 42 | if err != nil || parsedDocURL.Scheme == "" || parsedDocURL.Hostname() == "" { 43 | return &info.PageParamInfo{} 44 | } 45 | parsedDocURL.User = nil 46 | 47 | // Start detection 48 | detectionState := &DetectionState{} 49 | for _, group := range adjacentNumberGroups.Groups { 50 | if len(group.List) < 2 { 51 | continue 52 | } 53 | 54 | strPattern := "" 55 | if !detectionState.isEmpty() { 56 | strPattern = detectionState.bestPageParamInfo.PagePattern 57 | } 58 | 59 | state := newDetectionStateFromMonotonicNumbers( 60 | group.List, group.DeltaSign < 0, parsedDocURL, strPattern) 61 | if state != nil { 62 | detectionState.compareAndUpdate(state) 63 | } 64 | } 65 | 66 | if detectionState.isEmpty() { 67 | return &info.PageParamInfo{} 68 | } 69 | 70 | // For now, if there're multiple page patterns, we take the first one. 71 | // If this doesn't work for most sites, we might have to return nothing. 
72 | if detectionState.hasMultiPagePatterns && logger != nil && !logger.InternallyNil() { 73 | logger.PrintPaginationInfo("Detected multiple page pattern") 74 | } 75 | 76 | bestPageParamInfo := detectionState.bestPageParamInfo 77 | bestPageParamInfo.DetermineNextPagingURL(docURL) 78 | return bestPageParamInfo 79 | } 80 | -------------------------------------------------------------------------------- /internal/testutil/fake-document-builder.go: -------------------------------------------------------------------------------- 1 | // ORIGINAL: javatest/webdocument/FakeWebDocumentBuilder.java 2 | 3 | // Copyright (c) 2020 Markus Mobius 4 | // 5 | // Permission is hereby granted, free of charge, to any person obtaining a copy 6 | // of this software and associated documentation files (the "Software"), to deal 7 | // in the Software without restriction, including without limitation the rights 8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | // copies of the Software, and to permit persons to whom the Software is 10 | // furnished to do so, subject to the following conditions: 11 | // 12 | // The above copyright notice and this permission notice shall be included in all 13 | // copies or substantial portions of the Software. 14 | // 15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | // SOFTWARE. 22 | 23 | // Copyright 2014 The Chromium Authors. All rights reserved. 24 | // Use of this source code is governed by a BSD-style license that can be 25 | // found in the LICENSE file. 
26 | 27 | package testutil 28 | 29 | import ( 30 | "strings" 31 | 32 | "github.com/go-shiori/dom" 33 | "github.com/markusmobius/go-domdistiller/internal/webdoc" 34 | "golang.org/x/net/html" 35 | ) 36 | 37 | // FakeWebDocumentBuilder is a simple builder that just creates an html-like string 38 | // from the calls. Only used for dom-converter test. 39 | type FakeWebDocumentBuilder struct { 40 | buffer strings.Builder 41 | nodes []*html.Node 42 | } 43 | 44 | func NewFakeWebDocumentBuilder() *FakeWebDocumentBuilder { 45 | return &FakeWebDocumentBuilder{} 46 | } 47 | 48 | func (db *FakeWebDocumentBuilder) Build() string { 49 | return db.buffer.String() 50 | } 51 | 52 | func (db *FakeWebDocumentBuilder) SkipNode(e *html.Node) {} 53 | 54 | func (db *FakeWebDocumentBuilder) StartNode(e *html.Node) { 55 | db.nodes = append(db.nodes, e) 56 | db.buffer.WriteString("<") 57 | db.buffer.WriteString(dom.TagName(e)) 58 | for _, attr := range e.Attr { 59 | db.buffer.WriteString(" ") 60 | db.buffer.WriteString(attr.Key) 61 | db.buffer.WriteString(`="`) 62 | db.buffer.WriteString(attr.Val) 63 | db.buffer.WriteString(`"`) 64 | } 65 | db.buffer.WriteString(">") 66 | } 67 | 68 | func (db *FakeWebDocumentBuilder) EndNode() { 69 | node := db.nodes[len(db.nodes)-1] 70 | db.nodes = db.nodes[:len(db.nodes)-1] 71 | db.buffer.WriteString("") 74 | } 75 | 76 | func (db *FakeWebDocumentBuilder) AddTextNode(textNode *html.Node) { 77 | db.buffer.WriteString(textNode.Data) 78 | } 79 | 80 | func (db *FakeWebDocumentBuilder) AddLineBreak(node *html.Node) { 81 | db.buffer.WriteString("\n") 82 | } 83 | 84 | func (db *FakeWebDocumentBuilder) AddDataTable(e *html.Node) { 85 | db.buffer.WriteString("") 86 | } 87 | 88 | func (db *FakeWebDocumentBuilder) AddTag(tag *webdoc.Tag) {} 89 | 90 | func (db *FakeWebDocumentBuilder) AddEmbed(embed webdoc.Element) {} 91 | -------------------------------------------------------------------------------- /internal/filter/heuristic/heading-fusion.go: 
-------------------------------------------------------------------------------- 1 | // ORIGINAL: java/filters/heuristics/HeadingFusion.java 2 | 3 | // Copyright (c) 2020 Markus Mobius 4 | // 5 | // Permission is hereby granted, free of charge, to any person obtaining a copy 6 | // of this software and associated documentation files (the "Software"), to deal 7 | // in the Software without restriction, including without limitation the rights 8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | // copies of the Software, and to permit persons to whom the Software is 10 | // furnished to do so, subject to the following conditions: 11 | // 12 | // The above copyright notice and this permission notice shall be included in all 13 | // copies or substantial portions of the Software. 14 | // 15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | // SOFTWARE. 22 | 23 | // Copyright 2014 The Chromium Authors. All rights reserved. 24 | // Use of this source code is governed by a BSD-style license that can be 25 | // found in the LICENSE file. 26 | 27 | package heuristic 28 | 29 | import ( 30 | "github.com/markusmobius/go-domdistiller/internal/label" 31 | "github.com/markusmobius/go-domdistiller/internal/webdoc" 32 | ) 33 | 34 | // HeadingFusion fuses headings with the blocks after them. If the heading was 35 | // marked as boilerplate, the fused block will be labeled to prevent 36 | // BlockProximityFusion from merging through it. 
37 | type HeadingFusion struct{} 38 | 39 | func NewHeadingFusion() *HeadingFusion { 40 | return &HeadingFusion{} 41 | } 42 | 43 | func (f *HeadingFusion) Process(doc *webdoc.TextDocument) bool { 44 | textBlocks := doc.TextBlocks 45 | if len(textBlocks) < 2 { 46 | return false 47 | } 48 | 49 | changes := false 50 | currentBlock := textBlocks[0] 51 | var prevBlock *webdoc.TextBlock 52 | 53 | for i := 1; i < len(textBlocks); i++ { 54 | prevBlock = currentBlock 55 | currentBlock = textBlocks[i] 56 | 57 | if !prevBlock.HasLabel(label.Heading) { 58 | continue 59 | } 60 | 61 | if prevBlock.HasLabel(label.StrictlyNotContent) || currentBlock.HasLabel(label.StrictlyNotContent) { 62 | continue 63 | } 64 | 65 | if prevBlock.HasLabel(label.Title) || currentBlock.HasLabel(label.Title) { 66 | continue 67 | } 68 | 69 | if currentBlock.IsContent() { 70 | changes = true 71 | 72 | headingWasContent := prevBlock.IsContent() 73 | prevBlock.MergeNext(currentBlock) 74 | currentBlock = prevBlock 75 | 76 | currentBlock.RemoveLabels(label.Heading) 77 | if !headingWasContent { 78 | currentBlock.AddLabels(label.BoilerplateHeadingFused) 79 | } 80 | 81 | // These lines is used to remove item from array. 
82 | copy(textBlocks[i:], textBlocks[i+1:]) 83 | textBlocks[len(textBlocks)-1] = nil 84 | textBlocks = textBlocks[:len(textBlocks)-1] 85 | i-- 86 | } else if prevBlock.IsContent() { 87 | changes = true 88 | prevBlock.SetIsContent(false) 89 | } 90 | } 91 | 92 | doc.TextBlocks = textBlocks 93 | return changes 94 | } 95 | -------------------------------------------------------------------------------- /internal/filter/heuristic/expand-title.go: -------------------------------------------------------------------------------- 1 | // ORIGINAL: java/filters/heuristics/ExpandTitleToContentFilter.java 2 | 3 | // Copyright (c) 2020 Markus Mobius 4 | // 5 | // Permission is hereby granted, free of charge, to any person obtaining a copy 6 | // of this software and associated documentation files (the "Software"), to deal 7 | // in the Software without restriction, including without limitation the rights 8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | // copies of the Software, and to permit persons to whom the Software is 10 | // furnished to do so, subject to the following conditions: 11 | // 12 | // The above copyright notice and this permission notice shall be included in all 13 | // copies or substantial portions of the Software. 14 | // 15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | // SOFTWARE. 22 | 23 | // Copyright 2015 The Chromium Authors. All rights reserved. 24 | // Use of this source code is governed by a BSD-style license that can be 25 | // found in the LICENSE file. 
26 | 27 | // boilerpipe 28 | // 29 | // Copyright (c) 2009 Christian Kohlschütter 30 | // 31 | // The author licenses this file to You under the Apache License, Version 2.0 32 | // (the "License"); you may not use this file except in compliance with 33 | // the License. You may obtain a copy of the License at 34 | // 35 | // http://www.apache.org/licenses/LICENSE-2.0 36 | // 37 | // Unless required by applicable law or agreed to in writing, software 38 | // distributed under the License is distributed on an "AS IS" BASIS, 39 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 40 | // See the License for the specific language governing permissions and 41 | // limitations under the License. 42 | 43 | package heuristic 44 | 45 | import ( 46 | "github.com/markusmobius/go-domdistiller/internal/label" 47 | "github.com/markusmobius/go-domdistiller/internal/webdoc" 48 | ) 49 | 50 | // ExpandTitleToContent marks all TextBlocks "content" which are between the headline and the 51 | // part that has already been marked content, if they are marked with label.MightBeContent. 52 | // This filter is quite specific to the news domain. 
53 | type ExpandTitleToContent struct{} 54 | 55 | func NewExpandTitleToContent() *ExpandTitleToContent { 56 | return &ExpandTitleToContent{} 57 | } 58 | 59 | func (f *ExpandTitleToContent) Process(doc *webdoc.TextDocument) bool { 60 | title := -1 61 | contentStart := -1 62 | for i, tb := range doc.TextBlocks { 63 | if contentStart == -1 && tb.HasLabel(label.Title) { 64 | title = i 65 | contentStart = -1 66 | } 67 | 68 | if contentStart == -1 && tb.IsContent() { 69 | contentStart = i 70 | } 71 | } 72 | 73 | if contentStart <= title || title == -1 { 74 | return false 75 | } 76 | 77 | changes := false 78 | for _, tb := range doc.TextBlocks[title:contentStart] { 79 | if tb.HasLabel(label.MightBeContent) { 80 | changed := tb.SetIsContent(true) 81 | changes = changes || changed 82 | } 83 | } 84 | 85 | return changes 86 | } 87 | -------------------------------------------------------------------------------- /internal/re2go/terminating-blocks.re: -------------------------------------------------------------------------------- 1 | /*!include:re2c "base.re" */ 2 | 3 | // Original pattern: (?i)(^(comments|© reuters|please rate this|post a comment|\d+\s+(comments|users responded in))|what you think\.\.\.|add your comment|add comment|reader views|have your say|reader comments|rätta artikeln|^thanks for your comments - this feedback is now closed$) 4 | // For convenience it will be separated into 3 regexes: 5 | // - ^(comments|© reuters|please rate this|post a comment|\d+\s+(comments|users responded in)) 6 | // - what you think\.\.\.|add your comment|add comment|reader views|have your say|reader comments|rätta artikeln 7 | // - ^thanks for your comments - this feedback is now closed$ 8 | 9 | // Handle (^(comments|© reuters|please rate this|post a comment|\d+\s+(comments|users responded in)) 10 | func isTerminatingBlocks1(input string) bool { 11 | var cursor, marker int 12 | input += string(rune(0)) // add terminating null 13 | limit := len(input) - 1 // limit points at the 
// Handle what you think\.\.\.|add your comment|add comment|reader views|have your say|reader comments|rätta artikeln
//
// isTerminatingBlocks2 returns true as soon as one of the phrases above is
// matched (case-insensitively) and false when the end of input is reached.
// Any other character hits the default rule, which restarts the loop, so the
// phrases are presumably searched for at every position of the input; the
// cursor handling lives in base.re, which is not shown here.
func isTerminatingBlocks2(input string) bool {
	var cursor, marker int
	input += string(rune(0)) // add terminating null
	limit := len(input) - 1  // limit points at the terminating null
	_ = marker               // marker may be unused by the generated code; keep the compiler quiet

	for { /*!use:re2c:base_template
		re2c:case-insensitive = 1;

		tb2a = what[ ]you[ ]think[.]{3};
		tb2b = add[ ]your[ ]comment;
		tb2c = add[ ]comment;
		tb2d = reader[ ]views;
		tb2e = have[ ]your[ ]say;
		tb2f = reader[ ]comments;
		tb2g = r[äÄ]tta[ ]artikeln;

		{tb2a} { return true }
		{tb2b} { return true }
		{tb2c} { return true }
		{tb2d} { return true }
		{tb2e} { return true }
		{tb2f} { return true }
		{tb2g} { return true }

		* { continue }
		$ { return false }
		*/
	}
}

// Handle ^thanks for your comments - this feedback is now closed$
//
// isTerminatingBlocks3 records a match of the phrase in found and keeps
// scanning; any character that is not part of a match returns false at once,
// and the end of input returns found. The result is therefore true only when
// nothing but the phrase is present in the input.
// NOTE(review): as written, the phrase repeated back-to-back would also
// return true; confirm whether that matters.
func isTerminatingBlocks3(input string) bool {
	var cursor, marker int
	input += string(rune(0)) // add terminating null
	limit := len(input) - 1  // limit points at the terminating null
	_ = marker               // marker may be unused by the generated code; keep the compiler quiet

	var found bool
	for { /*!use:re2c:base_template
		re2c:case-insensitive = 1;

		thanks[ ]for[ ]your[ ]comments[ ]-[ ]this[ ]feedback[ ]is[ ]now[ ]closed {
			found = true
			continue
		}

		* { return false }
		$ { return found }
		*/
	}
}

// IsTerminatingBlocks reports whether input matches any of the three
// sub-patterns that the original terminating-block regex was split into.
// The checks run in order and stop at the first one that matches.
func IsTerminatingBlocks(input string) bool {
	return isTerminatingBlocks1(input) ||
		isTerminatingBlocks2(input) ||
		isTerminatingBlocks3(input)
}
24 | // Use of this source code is governed by a BSD-style license that can be 25 | // found in the LICENSE file. 26 | 27 | // boilerpipe 28 | // 29 | // Copyright (c) 2009 Christian Kohlschütter 30 | // 31 | // The author licenses this file to You under the Apache License, Version 2.0 32 | // (the "License"); you may not use this file except in compliance with 33 | // the License. You may obtain a copy of the License at 34 | // 35 | // http://www.apache.org/licenses/LICENSE-2.0 36 | // 37 | // Unless required by applicable law or agreed to in writing, software 38 | // distributed under the License is distributed on an "AS IS" BASIS, 39 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 40 | // See the License for the specific language governing permissions and 41 | // limitations under the License. 42 | 43 | package heuristic 44 | 45 | import ( 46 | "github.com/markusmobius/go-domdistiller/internal/label" 47 | "github.com/markusmobius/go-domdistiller/internal/webdoc" 48 | ) 49 | 50 | // LargeBlockAroundTagLevelToContent marks all blocks as content that: 51 | // - are on the same or adjacent tag-level as very likely main content (usually the level of the largest block) 52 | // - have a significant number of words, currently: at least 100 53 | type LargeBlockAroundTagLevelToContent struct{} 54 | 55 | func NewLargeBlockAroundTagLevelToContent() *LargeBlockAroundTagLevelToContent { 56 | return &LargeBlockAroundTagLevelToContent{} 57 | } 58 | 59 | func (f *LargeBlockAroundTagLevelToContent) Process(doc *webdoc.TextDocument) bool { 60 | tagLevel := -1 61 | for _, tb := range doc.TextBlocks { 62 | if tb.IsContent() && tb.HasLabel(label.VeryLikelyContent) { 63 | tagLevel = tb.TagLevel 64 | break 65 | } 66 | } 67 | 68 | if tagLevel == -1 { 69 | return false 70 | } 71 | 72 | changes := false 73 | for _, tb := range doc.TextBlocks { 74 | if tb.IsContent() || tb.NumWords < 100 { 75 | continue 76 | } 77 | 78 | switch tb.TagLevel { 79 | case tagLevel, 
tagLevel - 1, tagLevel + 1: 80 | tb.SetIsContent(true) 81 | changes = true 82 | } 83 | } 84 | 85 | return changes 86 | } 87 | -------------------------------------------------------------------------------- /internal/webdoc/table_test.go: -------------------------------------------------------------------------------- 1 | // ORIGINAL: javatest/webdocument/WebTableTest.java 2 | 3 | // Copyright (c) 2020 Markus Mobius 4 | // 5 | // Permission is hereby granted, free of charge, to any person obtaining a copy 6 | // of this software and associated documentation files (the "Software"), to deal 7 | // in the Software without restriction, including without limitation the rights 8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | // copies of the Software, and to permit persons to whom the Software is 10 | // furnished to do so, subject to the following conditions: 11 | // 12 | // The above copyright notice and this permission notice shall be included in all 13 | // copies or substantial portions of the Software. 14 | // 15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | // SOFTWARE. 22 | 23 | // Copyright 2015 The Chromium Authors. All rights reserved. 24 | // Use of this source code is governed by a BSD-style license that can be 25 | // found in the LICENSE file. 
26 | 27 | package webdoc_test 28 | 29 | import ( 30 | nurl "net/url" 31 | "testing" 32 | 33 | "github.com/go-shiori/dom" 34 | "github.com/markusmobius/go-domdistiller/internal/testutil" 35 | "github.com/markusmobius/go-domdistiller/internal/webdoc" 36 | "github.com/stretchr/testify/assert" 37 | ) 38 | 39 | func Test_WebDoc_Table_GenerateOutput(t *testing.T) { 40 | html := `` + 41 | `` + 42 | `` + 43 | `` + 44 | `` + 45 | `` + 46 | `
row1col1
` 47 | 48 | div := dom.CreateElement("div") 49 | dom.SetInnerHTML(div, html) 50 | 51 | table := dom.QuerySelector(div, "table") 52 | webTable := webdoc.Table{Element: table} 53 | 54 | // Output should be the same as the input in this case. 55 | got := webTable.GenerateOutput(false) 56 | assert.Equal(t, html, testutil.RemoveAllDirAttributes(got)) 57 | 58 | // Test GetImageURLs as well. 59 | imgURLs := webTable.GetImageURLs() 60 | assert.Equal(t, 1, len(imgURLs)) 61 | assert.Equal(t, "http://example.com/table.png", imgURLs[0]) 62 | } 63 | 64 | func Test_WebDoc_Table_GetImageURLs(t *testing.T) { 65 | div := dom.CreateElement("div") 66 | dom.SetInnerHTML(div, ` 67 | 68 | 69 | 70 | 73 | 79 | 80 | 81 |
71 | 72 | 74 | 75 | 76 | 77 | 78 |
`) 82 | 83 | table := dom.QuerySelector(div, "table") 84 | baseURL, _ := nurl.ParseRequestURI("http://example.com/") 85 | webTable := webdoc.Table{Element: table, PageURL: baseURL} 86 | 87 | urls := webTable.GetImageURLs() 88 | assert.Equal(t, 5, len(urls)) 89 | assert.Equal(t, "http://example.com/table.png", urls[0]) 90 | assert.Equal(t, "http://example.com/image100", urls[1]) 91 | assert.Equal(t, "http://example.org/image300", urls[2]) 92 | assert.Equal(t, "http://example.com/image200", urls[3]) 93 | assert.Equal(t, "http://example.org/image400", urls[4]) 94 | } 95 | -------------------------------------------------------------------------------- /internal/webdoc/image.go: -------------------------------------------------------------------------------- 1 | // ORIGINAL: java/webdocument/WebImage.java 2 | 3 | // Copyright (c) 2020 Markus Mobius 4 | // 5 | // Permission is hereby granted, free of charge, to any person obtaining a copy 6 | // of this software and associated documentation files (the "Software"), to deal 7 | // in the Software without restriction, including without limitation the rights 8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | // copies of the Software, and to permit persons to whom the Software is 10 | // furnished to do so, subject to the following conditions: 11 | // 12 | // The above copyright notice and this permission notice shall be included in all 13 | // copies or substantial portions of the Software. 14 | // 15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | // SOFTWARE. 22 | 23 | // Copyright 2015 The Chromium Authors. All rights reserved. 24 | // Use of this source code is governed by a BSD-style license that can be 25 | // found in the LICENSE file. 26 | 27 | package webdoc 28 | 29 | import ( 30 | "fmt" 31 | nurl "net/url" 32 | 33 | "github.com/go-shiori/dom" 34 | "github.com/markusmobius/go-domdistiller/internal/domutil" 35 | "github.com/markusmobius/go-domdistiller/internal/stringutil" 36 | "golang.org/x/net/html" 37 | ) 38 | 39 | type Image struct { 40 | BaseElement 41 | Element *html.Node // node for the image 42 | PageURL *nurl.URL // url of page where image is placed 43 | 44 | cloned *html.Node 45 | } 46 | 47 | func (i *Image) ElementType() string { 48 | return "image" 49 | } 50 | 51 | func (i *Image) GenerateOutput(textOnly bool) string { 52 | if textOnly { 53 | return "" 54 | } 55 | 56 | if i.cloned == nil { 57 | i.cloned = i.cloneAndProcessNode() 58 | } 59 | 60 | return dom.OuterHTML(i.cloned) 61 | } 62 | 63 | // GetURLs returns the list of source URLs of this image. 64 | func (i *Image) GetURLs() []string { 65 | if i.cloned == nil { 66 | i.cloned = i.cloneAndProcessNode() 67 | } 68 | 69 | urls := []string{} 70 | src := dom.GetAttribute(i.cloned, "src") 71 | if src != "" { 72 | urls = append(urls, src) 73 | } 74 | 75 | urls = append(urls, domutil.GetAllSrcSetURLs(i.cloned)...) 
76 | return urls 77 | } 78 | 79 | func (i *Image) getProcessedNode() *html.Node { 80 | if i.cloned == nil { 81 | i.cloned = i.cloneAndProcessNode() 82 | } 83 | return i.cloned 84 | } 85 | 86 | func (i *Image) cloneAndProcessNode() *html.Node { 87 | cloned := dom.Clone(i.Element, true) 88 | img := domutil.GetFirstElementByTagNameInc(cloned, "img") 89 | if img != nil { 90 | if src := dom.GetAttribute(img, "src"); src != "" { 91 | src = stringutil.CreateAbsoluteURL(src, i.PageURL) 92 | dom.SetAttribute(img, "src", src) 93 | } 94 | } 95 | 96 | domutil.MakeAllSrcAttributesAbsolute(cloned, i.PageURL) 97 | domutil.MakeAllSrcSetAbsolute(cloned, i.PageURL) 98 | domutil.StripAttributes(cloned) 99 | return cloned 100 | } 101 | 102 | func (i *Image) String() string { 103 | return fmt.Sprintf("ELEMENT %q: html=%q, is_content=%v", 104 | i.ElementType(), dom.OuterHTML(i.Element), i.isContent) 105 | } 106 | -------------------------------------------------------------------------------- /internal/filter/english/terminating-blocks.go: -------------------------------------------------------------------------------- 1 | // ORIGINAL: java/filters/english/TerminatingBlocksFinder.java 2 | 3 | // Copyright (c) 2020 Markus Mobius 4 | // 5 | // Permission is hereby granted, free of charge, to any person obtaining a copy 6 | // of this software and associated documentation files (the "Software"), to deal 7 | // in the Software without restriction, including without limitation the rights 8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | // copies of the Software, and to permit persons to whom the Software is 10 | // furnished to do so, subject to the following conditions: 11 | // 12 | // The above copyright notice and this permission notice shall be included in all 13 | // copies or substantial portions of the Software. 
14 | // 15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | // SOFTWARE. 22 | 23 | // Copyright 2014 The Chromium Authors. All rights reserved. 24 | // Use of this source code is governed by a BSD-style license that can be 25 | // found in the LICENSE file. 26 | 27 | // boilerpipe 28 | // 29 | // Copyright (c) 2009 Christian Kohlschütter 30 | // 31 | // The author licenses this file to You under the Apache License, Version 2.0 32 | // (the "License"); you may not use this file except in compliance with 33 | // the License. You may obtain a copy of the License at 34 | // 35 | // http://www.apache.org/licenses/LICENSE-2.0 36 | // 37 | // Unless required by applicable law or agreed to in writing, software 38 | // distributed under the License is distributed on an "AS IS" BASIS, 39 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 40 | // See the License for the specific language governing permissions and 41 | // limitations under the License. 42 | 43 | package english 44 | 45 | import ( 46 | "strings" 47 | 48 | "github.com/markusmobius/go-domdistiller/internal/label" 49 | "github.com/markusmobius/go-domdistiller/internal/re2go" 50 | "github.com/markusmobius/go-domdistiller/internal/stringutil" 51 | "github.com/markusmobius/go-domdistiller/internal/webdoc" 52 | ) 53 | 54 | // TerminatingBlocksFinder finds blocks which are potentially indicating the end of 55 | // an article text and marks them with label.StrictlyNotContent. 
56 | type TerminatingBlocksFinder struct{} 57 | 58 | func NewTerminatingBlocksFinder() *TerminatingBlocksFinder { 59 | return &TerminatingBlocksFinder{} 60 | } 61 | 62 | func (f *TerminatingBlocksFinder) Process(doc *webdoc.TextDocument) bool { 63 | changes := false 64 | 65 | for _, block := range doc.TextBlocks { 66 | if f.isTerminating(block) { 67 | block.AddLabels(label.StrictlyNotContent) 68 | changes = true 69 | } 70 | } 71 | 72 | return changes 73 | } 74 | 75 | func (f *TerminatingBlocksFinder) isTerminating(tb *webdoc.TextBlock) bool { 76 | if tb.NumWords > 14 { 77 | return false 78 | } 79 | 80 | text := strings.TrimSpace(tb.Text) 81 | if stringutil.CharCount(text) >= 8 { 82 | return re2go.IsTerminatingBlocks(text) 83 | } else if tb.LinkDensity == 1 { 84 | return text == "Comment" 85 | } else if text == "Shares" { 86 | // Skip social and sharing elements. 87 | // See crbug.com/692553 88 | return true 89 | } 90 | 91 | return false 92 | } 93 | -------------------------------------------------------------------------------- /internal/testutil/text-builder.go: -------------------------------------------------------------------------------- 1 | // ORIGINAL: javatest/webdocument/TestWebTextBuilder.java 2 | 3 | // Copyright (c) 2020 Markus Mobius 4 | // 5 | // Permission is hereby granted, free of charge, to any person obtaining a copy 6 | // of this software and associated documentation files (the "Software"), to deal 7 | // in the Software without restriction, including without limitation the rights 8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | // copies of the Software, and to permit persons to whom the Software is 10 | // furnished to do so, subject to the following conditions: 11 | // 12 | // The above copyright notice and this permission notice shall be included in all 13 | // copies or substantial portions of the Software. 
14 | // 15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | // SOFTWARE. 22 | 23 | // Copyright 2015 The Chromium Authors. All rights reserved. 24 | // Use of this source code is governed by a BSD-style license that can be 25 | // found in the LICENSE file. 26 | 27 | package testutil 28 | 29 | import ( 30 | "github.com/go-shiori/dom" 31 | "github.com/markusmobius/go-domdistiller/internal/stringutil" 32 | "github.com/markusmobius/go-domdistiller/internal/webdoc" 33 | "golang.org/x/net/html" 34 | ) 35 | 36 | type TextBuilder struct { 37 | wordCounter stringutil.WordCounter 38 | textNodes []*html.Node 39 | } 40 | 41 | func NewTextBuilder(wc stringutil.WordCounter) *TextBuilder { 42 | return &TextBuilder{wordCounter: wc} 43 | } 44 | 45 | func (tb *TextBuilder) CreateForText(str string) *webdoc.Text { 46 | return tb.create(str, false) 47 | } 48 | 49 | func (tb *TextBuilder) CreateForAnchorText(str string) *webdoc.Text { 50 | return tb.create(str, true) 51 | } 52 | 53 | func (tb *TextBuilder) CreateNestedText(str string, levels int) *webdoc.Text { 54 | div := dom.CreateElement("div") 55 | tmp := div 56 | 57 | for i := 0; i < levels-1; i++ { 58 | dom.AppendChild(tmp, dom.CreateElement("div")) 59 | tmp = dom.FirstElementChild(tmp) 60 | } 61 | 62 | dom.AppendChild(tmp, dom.CreateTextNode(str)) 63 | tb.textNodes = append(tb.textNodes, tmp.FirstChild) 64 | 65 | idx := len(tb.textNodes) - 1 66 | numWords := tb.wordCounter.Count(str) 67 | 68 | return &webdoc.Text{ 69 | Text: str, 70 | TextNodes: tb.textNodes, 71 | Start: idx, 72 | 
End: idx + 1, 73 | FirstWordNode: idx, 74 | LastWordNode: idx, 75 | NumWords: numWords, 76 | NumLinkedWords: 0, 77 | TagLevel: 0, 78 | OffsetBlock: idx, 79 | } 80 | } 81 | 82 | func (tb *TextBuilder) create(str string, isAnchor bool) *webdoc.Text { 83 | tb.textNodes = append(tb.textNodes, dom.CreateTextNode(str)) 84 | 85 | idx := len(tb.textNodes) - 1 86 | numWords := tb.wordCounter.Count(str) 87 | numLinkedWords := numWords 88 | if !isAnchor { 89 | numLinkedWords = 0 90 | } 91 | 92 | return &webdoc.Text{ 93 | Text: str, 94 | TextNodes: tb.textNodes, 95 | Start: idx, 96 | End: idx + 1, 97 | FirstWordNode: idx, 98 | LastWordNode: idx, 99 | NumWords: numWords, 100 | NumLinkedWords: numLinkedWords, 101 | TagLevel: 0, 102 | OffsetBlock: idx, 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /internal/filter/english/terminating-blocks_test.go: -------------------------------------------------------------------------------- 1 | // ORIGINAL: javatest/TerminatingBlocksFinderTest.java 2 | 3 | // Copyright (c) 2020 Markus Mobius 4 | // 5 | // Permission is hereby granted, free of charge, to any person obtaining a copy 6 | // of this software and associated documentation files (the "Software"), to deal 7 | // in the Software without restriction, including without limitation the rights 8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | // copies of the Software, and to permit persons to whom the Software is 10 | // furnished to do so, subject to the following conditions: 11 | // 12 | // The above copyright notice and this permission notice shall be included in all 13 | // copies or substantial portions of the Software. 14 | // 15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | // SOFTWARE. 22 | 23 | // Copyright 2014 The Chromium Authors. All rights reserved. 24 | // Use of this source code is governed by a BSD-style license that can be 25 | // found in the LICENSE file. 26 | 27 | package english 28 | 29 | import ( 30 | "testing" 31 | 32 | "github.com/markusmobius/go-domdistiller/internal/stringutil" 33 | "github.com/markusmobius/go-domdistiller/internal/testutil" 34 | "github.com/stretchr/testify/assert" 35 | ) 36 | 37 | func Test_Filter_English_TerminatingBlocks_Positives(t *testing.T) { 38 | texts := []string{ 39 | // Startswith cases. 40 | "comments foo", "© reuters", "© reuters foo bar", "please rate this", 41 | "please rate this foo", "post a comment", "post a comment foo", "123 comments", 42 | "9 comments foo", "1346213423 users responded in", "1346213423 users responded in foo", 43 | 44 | // Contains cases. 45 | "foo what you think... bar", "what you think...", "foo what you think...", 46 | "add your comment", "foo add your comment", "add comment bar", "reader views bar", 47 | "have your say bar", "foo reader comments", "foo rätta artikeln", 48 | 49 | // Equals cases. 50 | "thanks for your comments - this feedback is now closed", 51 | 52 | // Check some case insensitivity. 
53 | "Thanks for your comments - this feedback is now closed", "Add Comment Bar", 54 | "READER VIEWS BAR", "Comments FOO", 55 | } 56 | 57 | terminatingBlocksFinder := NewTerminatingBlocksFinder() 58 | builder := testutil.NewTextBlockBuilder(stringutil.FastWordCounter{}) 59 | 60 | for _, text := range texts { 61 | tb := builder.CreateForText(text) 62 | assert.True(t, terminatingBlocksFinder.isTerminating(tb)) 63 | } 64 | } 65 | 66 | func Test_Filter_English_TerminatingBlocks_Negatives(t *testing.T) { 67 | texts := []string{ 68 | // Startswith cases. 69 | "lcomments foo", "xd© reuters", "not please rate this", "xx post a comment", 70 | "users responded in", "123users responded in foo", 71 | 72 | // Contains cases. 73 | "what you think..", "addyour comment", "ad comment", "readerviews", 74 | 75 | // Equals cases. 76 | "thanks for your comments - this feedback is now closed foo", 77 | "foo thanks for your comments - this feedback is now closed", 78 | 79 | // Long case. 80 | "1 2 3 4 5 6 7 8 9 10 11 12 13 14 15", 81 | } 82 | 83 | terminatingBlocksFinder := NewTerminatingBlocksFinder() 84 | builder := testutil.NewTextBlockBuilder(stringutil.FastWordCounter{}) 85 | 86 | for _, text := range texts { 87 | tb := builder.CreateForText(text) 88 | assert.False(t, terminatingBlocksFinder.isTerminating(tb)) 89 | } 90 | } 91 | 92 | func Test_Filter_English_TerminatingBlocks_CommentsLink(t *testing.T) { 93 | } 94 | -------------------------------------------------------------------------------- /logger.go: -------------------------------------------------------------------------------- 1 | // ORIGINAL: java/LogUtil.java 2 | 3 | // Copyright (c) 2020 Markus Mobius 4 | // 5 | // Permission is hereby granted, free of charge, to any person obtaining a copy 6 | // of this software and associated documentation files (the "Software"), to deal 7 | // in the Software without restriction, including without limitation the rights 8 | // to use, copy, modify, merge, publish, distribute, 
sublicense, and/or sell 9 | // copies of the Software, and to permit persons to whom the Software is 10 | // furnished to do so, subject to the following conditions: 11 | // 12 | // The above copyright notice and this permission notice shall be included in all 13 | // copies or substantial portions of the Software. 14 | // 15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | // SOFTWARE. 22 | 23 | // Copyright 2014 The Chromium Authors. All rights reserved. 24 | // Use of this source code is governed by a BSD-style license that can be 25 | // found in the LICENSE file. 26 | 27 | package distiller 28 | 29 | import ( 30 | "os" 31 | 32 | "github.com/rs/zerolog" 33 | ) 34 | 35 | // LogFlag is enum to specify logging level. 36 | type LogFlag uint 37 | 38 | const ( 39 | // LogNothing will disable the logger. 40 | LogNothing LogFlag = 0 41 | 42 | // If LogEverything is set DistillerLogger will enable all logs. 43 | LogEverything LogFlag = LogExtraction | LogVisibility | LogPagination | LogTiming 44 | 45 | // If LogExtraction is set DistillerLogger will print info of each process when extracting article. 46 | LogExtraction LogFlag = 1 << iota 47 | 48 | // If LogVisibility is set DistillerLogger will print info on why an element is visible. 49 | LogVisibility 50 | 51 | // If LogPagination is set DistillerLogger will print info of pagination process. 52 | LogPagination 53 | 54 | // If LogTiming is set DistillerLogger will print info of duration of each process when extracting article. 
55 | LogTiming 56 | ) 57 | 58 | // distillerLogger is the main logger for dom-distiller 59 | type distillerLogger struct { 60 | log zerolog.Logger 61 | flags LogFlag 62 | } 63 | 64 | func newDistillerLogger(flags LogFlag) *distillerLogger { 65 | return &distillerLogger{ 66 | log: zerolog.New(zerolog.ConsoleWriter{ 67 | Out: os.Stderr, 68 | TimeFormat: "2006-01-02 15:04", 69 | }).With().Timestamp().Logger(), 70 | flags: flags, 71 | } 72 | } 73 | 74 | func (l *distillerLogger) InternallyNil() bool { return l == nil } 75 | 76 | func (l *distillerLogger) IsLogExtraction() bool { return l.hasFlag(LogExtraction) } 77 | 78 | func (l *distillerLogger) IsLogVisibility() bool { return l.hasFlag(LogVisibility) } 79 | 80 | func (l *distillerLogger) IsLogPagination() bool { return l.hasFlag(LogPagination) } 81 | 82 | func (l *distillerLogger) IsLogTiming() bool { return l.hasFlag(LogTiming) } 83 | 84 | func (l *distillerLogger) PrintExtractionInfo(args ...interface{}) { l.print(LogExtraction, args...) } 85 | 86 | func (l *distillerLogger) PrintVisibilityInfo(args ...interface{}) { l.print(LogVisibility, args...) } 87 | 88 | func (l *distillerLogger) PrintPaginationInfo(args ...interface{}) { l.print(LogPagination, args...) } 89 | 90 | func (l *distillerLogger) PrintTimingInfo(args ...interface{}) { l.print(LogTiming, args...) } 91 | 92 | func (l *distillerLogger) hasFlag(flag LogFlag) bool { 93 | if l.InternallyNil() { 94 | return false 95 | } 96 | return l.flags&flag != 0 97 | } 98 | 99 | func (l *distillerLogger) print(flag LogFlag, args ...interface{}) { 100 | if !l.InternallyNil() && l.hasFlag(flag) { 101 | l.log.Println(args...) 
102 | } 103 | } 104 | -------------------------------------------------------------------------------- /internal/testutil/document-builder.go: -------------------------------------------------------------------------------- 1 | // ORIGINAL: javatest/webdocument/TestWebDocumentBuilder.java 2 | 3 | // Copyright (c) 2020 Markus Mobius 4 | // 5 | // Permission is hereby granted, free of charge, to any person obtaining a copy 6 | // of this software and associated documentation files (the "Software"), to deal 7 | // in the Software without restriction, including without limitation the rights 8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | // copies of the Software, and to permit persons to whom the Software is 10 | // furnished to do so, subject to the following conditions: 11 | // 12 | // The above copyright notice and this permission notice shall be included in all 13 | // copies or substantial portions of the Software. 14 | // 15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | // SOFTWARE. 22 | 23 | // Copyright 2015 The Chromium Authors. All rights reserved. 24 | // Use of this source code is governed by a BSD-style license that can be 25 | // found in the LICENSE file. 26 | 27 | package testutil 28 | 29 | import ( 30 | "github.com/go-shiori/dom" 31 | "github.com/markusmobius/go-domdistiller/internal/stringutil" 32 | "github.com/markusmobius/go-domdistiller/internal/webdoc" 33 | ) 34 | 35 | // WebDocumentBuilder is a simple builder for testing. 
36 | type WebDocumentBuilder struct { 37 | document *webdoc.Document 38 | textBuilder *TextBuilder 39 | } 40 | 41 | func NewWebDocumentBuilder() *WebDocumentBuilder { 42 | return &WebDocumentBuilder{ 43 | document: webdoc.NewDocument(), 44 | textBuilder: NewTextBuilder(stringutil.FastWordCounter{}), 45 | } 46 | } 47 | 48 | func (db *WebDocumentBuilder) AddText(text string) *webdoc.Text { 49 | wt := db.textBuilder.CreateForText(text) 50 | db.document.AddElements(wt) 51 | return wt 52 | } 53 | 54 | func (db *WebDocumentBuilder) AddNestedText(text string) *webdoc.Text { 55 | wt := db.textBuilder.CreateNestedText(text, 5) 56 | db.document.AddElements(wt) 57 | return wt 58 | } 59 | 60 | func (db *WebDocumentBuilder) AddAnchorText(text string) *webdoc.Text { 61 | wt := db.textBuilder.CreateForAnchorText(text) 62 | db.document.AddElements(wt) 63 | return wt 64 | } 65 | 66 | func (db *WebDocumentBuilder) AddTable(innerHTML string) *webdoc.Table { 67 | div := dom.CreateElement("div") 68 | dom.SetInnerHTML(div, ""+innerHTML+"
") 69 | 70 | table := dom.QuerySelector(div, "table") 71 | wt := &webdoc.Table{Element: table} 72 | db.document.AddElements(wt) 73 | return wt 74 | } 75 | 76 | func (db *WebDocumentBuilder) AddImage() *webdoc.Image { 77 | image := dom.CreateElement("img") 78 | dom.SetAttribute(image, "src", "http://www.example.com/foo.jpg") 79 | 80 | wi := &webdoc.Image{Element: image} 81 | db.document.AddElements(wi) 82 | return wi 83 | } 84 | 85 | func (db *WebDocumentBuilder) AddLeadImage() *webdoc.Image { 86 | image := dom.CreateElement("img") 87 | dom.SetAttribute(image, "width", "600") 88 | dom.SetAttribute(image, "height", "400") 89 | dom.SetAttribute(image, "src", "http://www.example.com/lead.bmp") 90 | 91 | wi := &webdoc.Image{Element: image} 92 | db.document.AddElements(wi) 93 | return wi 94 | } 95 | 96 | func (db *WebDocumentBuilder) AddTagStart(tagName string) *webdoc.Tag { 97 | wt := webdoc.NewTag(tagName, webdoc.TagStart) 98 | db.document.AddElements(wt) 99 | return wt 100 | } 101 | 102 | func (db *WebDocumentBuilder) AddTagEnd(tagName string) *webdoc.Tag { 103 | wt := webdoc.NewTag(tagName, webdoc.TagEnd) 104 | db.document.AddElements(wt) 105 | return wt 106 | } 107 | 108 | func (db *WebDocumentBuilder) Build() *webdoc.Document { 109 | return db.document 110 | } 111 | -------------------------------------------------------------------------------- /internal/filter/english/num-words.go: -------------------------------------------------------------------------------- 1 | // ORIGINAL: java/filters/english/NumWordsRulesClassifier.java 2 | 3 | // Copyright (c) 2020 Markus Mobius 4 | // 5 | // Permission is hereby granted, free of charge, to any person obtaining a copy 6 | // of this software and associated documentation files (the "Software"), to deal 7 | // in the Software without restriction, including without limitation the rights 8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | // copies of the Software, and to permit persons to 
whom the Software is 10 | // furnished to do so, subject to the following conditions: 11 | // 12 | // The above copyright notice and this permission notice shall be included in all 13 | // copies or substantial portions of the Software. 14 | // 15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | // SOFTWARE. 22 | 23 | // Copyright 2015 The Chromium Authors. All rights reserved. 24 | // Use of this source code is governed by a BSD-style license that can be 25 | // found in the LICENSE file. 26 | 27 | // boilerpipe 28 | // 29 | // Copyright (c) 2009 Christian Kohlschütter 30 | // 31 | // The author licenses this file to You under the Apache License, Version 2.0 32 | // (the "License"); you may not use this file except in compliance with 33 | // the License. You may obtain a copy of the License at 34 | // 35 | // http://www.apache.org/licenses/LICENSE-2.0 36 | // 37 | // Unless required by applicable law or agreed to in writing, software 38 | // distributed under the License is distributed on an "AS IS" BASIS, 39 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 40 | // See the License for the specific language governing permissions and 41 | // limitations under the License. 
42 | 43 | package english 44 | 45 | import ( 46 | "github.com/markusmobius/go-domdistiller/internal/webdoc" 47 | ) 48 | 49 | // NumWordsRulesClassifier classifies several TextBlock as content or not-content through 50 | // rules that have been determined using the C4.8 machine learning algorithm, as described 51 | // in the paper "Boilerplate Detection using Shallow Text Features" (WSDM 2010), particularly 52 | // using number of words per block and link density per block. 53 | type NumWordsRulesClassifier struct{} 54 | 55 | func NewNumWordsRulesClassifier() *NumWordsRulesClassifier { 56 | return &NumWordsRulesClassifier{} 57 | } 58 | 59 | func (f *NumWordsRulesClassifier) Process(doc *webdoc.TextDocument) bool { 60 | textBlocks := doc.TextBlocks 61 | if len(textBlocks) == 0 { 62 | return false 63 | } 64 | 65 | hasChanges := false 66 | for i, block := range textBlocks { 67 | var prevBlock, nextBlock *webdoc.TextBlock 68 | if i > 0 { 69 | prevBlock = textBlocks[i-1] 70 | } 71 | if i+1 < len(textBlocks) { 72 | nextBlock = textBlocks[i+1] 73 | } 74 | 75 | changed := f.classify(prevBlock, block, nextBlock) 76 | hasChanges = hasChanges || changed 77 | } 78 | 79 | return hasChanges 80 | } 81 | 82 | func (f *NumWordsRulesClassifier) classify(prev, current, next *webdoc.TextBlock) bool { 83 | isContent := false 84 | 85 | if current.LinkDensity <= 0.333333 { 86 | if prev == nil || prev.LinkDensity <= 0.555556 { 87 | if current.NumWords <= 16 { 88 | if next == nil || next.NumWords <= 15 { 89 | isContent = prev != nil && prev.NumWords > 4 90 | } else { 91 | isContent = true 92 | } 93 | } else { 94 | isContent = true 95 | } 96 | } else { 97 | if current.NumWords <= 40 { 98 | isContent = next != nil && next.NumWords > 17 99 | } else { 100 | isContent = true 101 | } 102 | } 103 | } else { 104 | isContent = false 105 | } 106 | 107 | return current.SetIsContent(isContent) 108 | } 109 | -------------------------------------------------------------------------------- 
/internal/webdoc/element-action.go: -------------------------------------------------------------------------------- 1 | // ORIGINAL: java/webdocument/ElementAction.java 2 | 3 | // Copyright (c) 2020 Markus Mobius 4 | // 5 | // Permission is hereby granted, free of charge, to any person obtaining a copy 6 | // of this software and associated documentation files (the "Software"), to deal 7 | // in the Software without restriction, including without limitation the rights 8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | // copies of the Software, and to permit persons to whom the Software is 10 | // furnished to do so, subject to the following conditions: 11 | // 12 | // The above copyright notice and this permission notice shall be included in all 13 | // copies or substantial portions of the Software. 14 | // 15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | // SOFTWARE. 22 | 23 | // Copyright 2014 The Chromium Authors. All rights reserved. 24 | // Use of this source code is governed by a BSD-style license that can be 25 | // found in the LICENSE file. 
26 | 27 | package webdoc 28 | 29 | import ( 30 | "regexp" 31 | "strings" 32 | 33 | "github.com/go-shiori/dom" 34 | "github.com/markusmobius/go-domdistiller/internal/domutil" 35 | "github.com/markusmobius/go-domdistiller/internal/label" 36 | "golang.org/x/net/html" 37 | ) 38 | 39 | const maxClassCount = 2 40 | 41 | var rxComment = regexp.MustCompile(`(?i)\bcomments?\b`) 42 | 43 | type ElementAction struct { 44 | Flush bool 45 | IsAnchor bool 46 | ChangesTagLevel bool 47 | Labels []string 48 | } 49 | 50 | func GetActionForElement(element *html.Node) ElementAction { 51 | tagName := dom.TagName(element) 52 | 53 | // NEED-COMPUTE-CSS 54 | // In original dom-distiller, the `flush` and `changesTagLevel` values are decided depending 55 | // on element display syle. For example, inline element shouldn't change tag level. Unfortunately, 56 | // this is not possible since we can't compute stylesheet. As fallback, here we simply use the 57 | // default display for the tag name 58 | action := ElementAction{} 59 | display := domutil.GetDisplayStyle(element) 60 | switch display { 61 | case "none", "inline": // do nothing 62 | case "inline-block", "inline-flex": 63 | action.ChangesTagLevel = true 64 | default: 65 | action.Flush = true 66 | action.ChangesTagLevel = true 67 | } 68 | 69 | // Check if item is inside
  • 70 | if domutil.HasAncestor(element, "li", "summary") { 71 | action.Flush = false 72 | action.ChangesTagLevel = false 73 | } 74 | 75 | if tagName != "html" && tagName != "body" && tagName != "article" { 76 | id := dom.GetAttribute(element, "id") 77 | className := dom.GetAttribute(element, "class") 78 | classCount := len(strings.Fields(className)) 79 | if (rxComment.MatchString(id) || rxComment.MatchString(className)) && classCount <= maxClassCount { 80 | action.Labels = append(action.Labels, label.StrictlyNotContent) 81 | } 82 | 83 | switch tagName { 84 | case "aside", "nav": 85 | action.Labels = append(action.Labels, label.StrictlyNotContent) 86 | case "li": 87 | action.Labels = append(action.Labels, label.Li) 88 | case "h1": 89 | action.Labels = append(action.Labels, label.H1, label.Heading) 90 | case "h2": 91 | action.Labels = append(action.Labels, label.H2, label.Heading) 92 | case "h3": 93 | action.Labels = append(action.Labels, label.H3, label.Heading) 94 | case "h4", "h5", "h6": 95 | action.Labels = append(action.Labels, label.Heading) 96 | case "a": 97 | // TODO: Anchors probably shouldn't unconditionally change the tag level. 
98 | action.ChangesTagLevel = true 99 | action.IsAnchor = dom.HasAttribute(element, "href") 100 | } 101 | } 102 | 103 | return action 104 | } 105 | -------------------------------------------------------------------------------- /internal/webdoc/image_test.go: -------------------------------------------------------------------------------- 1 | // ORIGINAL: javatest/webdocument/WebImageTest.java 2 | 3 | // Copyright (c) 2020 Markus Mobius 4 | // 5 | // Permission is hereby granted, free of charge, to any person obtaining a copy 6 | // of this software and associated documentation files (the "Software"), to deal 7 | // in the Software without restriction, including without limitation the rights 8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | // copies of the Software, and to permit persons to whom the Software is 10 | // furnished to do so, subject to the following conditions: 11 | // 12 | // The above copyright notice and this permission notice shall be included in all 13 | // copies or substantial portions of the Software. 14 | // 15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | // SOFTWARE. 22 | 23 | // Copyright 2016 The Chromium Authors. All rights reserved. 24 | // Use of this source code is governed by a BSD-style license that can be 25 | // found in the LICENSE file. 
// NOTE(review): in this copy of the file the HTML fixture literals and the
// expected-output literals below are empty strings. Presumably the markup was
// stripped when the file was exported; restore them from the upstream
// repository before relying on these tests.

// Test_WebDoc_Image_GenerateOutput builds a webdoc.Image from the <picture>
// element found in the fixture and compares GenerateOutput(false) with the
// expected markup.
func Test_WebDoc_Image_GenerateOutput(t *testing.T) {
	html := `` +
		`` +
		`` +
		``

	div := dom.CreateElement("div")
	dom.SetInnerHTML(div, html)

	picture := dom.QuerySelector(div, "picture")
	// The parse error is ignored because the URL is a constant literal.
	baseURL, _ := nurl.ParseRequestURI("http://example.com/")
	webImage := webdoc.Image{Element: picture, PageURL: baseURL}

	expected := ``
	assert.Equal(t, expected, webImage.GenerateOutput(false))
}

// Test_WebDoc_Image_GetSrcList checks that GetURLs on an <img> with both src
// and srcset returns three URLs, each resolved against the page URL: the src
// value first, then the two srcset candidates in order.
func Test_WebDoc_Image_GetSrcList(t *testing.T) {
	img := dom.CreateElement("img")
	dom.SetAttribute(img, "src", "image")
	dom.SetAttribute(img, "srcset", "image200 200w, image400 400w")

	// The parse error is ignored because the URL is a constant literal.
	baseURL, _ := nurl.ParseRequestURI("http://example.com/")
	webImage := webdoc.Image{
		Element: img,
		PageURL: baseURL,
	}

	urls := webImage.GetURLs()
	assert.Equal(t, 3, len(urls))
	assert.Equal(t, "http://example.com/image", urls[0])
	assert.Equal(t, "http://example.com/image200", urls[1])
	assert.Equal(t, "http://example.com/image400", urls[2])
}

// Test_WebDoc_Image_GetSrcListInPicture checks that GetURLs on a <picture>
// element returns two URLs; the second is expected on example.org, not on the
// page's own host.
func Test_WebDoc_Image_GetSrcListInPicture(t *testing.T) {
	html := `` +
		`` +
		`` +
		``

	div := dom.CreateElement("div")
	dom.SetInnerHTML(div, html)

	picture := dom.QuerySelector(div, "picture")
	// The parse error is ignored because the URL is a constant literal.
	baseURL, _ := nurl.ParseRequestURI("http://example.com/")
	webImage := webdoc.Image{Element: picture, PageURL: baseURL}

	urls := webImage.GetURLs()
	assert.Equal(t, 2, len(urls))
	assert.Equal(t, "http://example.com/image100", urls[0])
	assert.Equal(t, "http://example.org/image300", urls[1])
}

// Test_WebDoc_Image_PictureWithoutImg compares GenerateOutput(false) with the
// expected markup for a <picture> fixture that, going by the test name, has
// no <img> child.
func Test_WebDoc_Image_PictureWithoutImg(t *testing.T) {
	html := `` +
		`` +
		``

	div := dom.CreateElement("div")
	dom.SetInnerHTML(div, html)

	picture := dom.QuerySelector(div, "picture")
	// The parse error is ignored because the URL is a constant literal.
	baseURL, _ := nurl.ParseRequestURI("http://example.com/")
	webImage := webdoc.Image{Element: picture, PageURL: baseURL}

	expected := ``
	assert.Equal(t, expected, webImage.GenerateOutput(false))
}
// String constants used by this package. Going by their names and values, the
// *Prop constants are schema.org property names (itemprop values) and
// AuthorRel is a rel attribute value; confirm against the parser that
// consumes them.
const (
	NameProp            = "name"
	URLProp             = "url"
	DescriptionProp     = "description"
	ImageProp           = "image"
	HeadlineProp        = "headline"
	PublisherProp       = "publisher"
	CopyrightHolderProp = "copyrightHolder"
	CopyrightYearProp   = "copyrightYear"
	ContentURLProp      = "contentUrl"
	EncodingFormatProp  = "encodingFormat"
	CaptionProp         = "caption"
	RepresentativeProp  = "representativeOfPage"
	WidthProp           = "width"
	HeightProp          = "height"
	DatePublishedProp   = "datePublished"
	DateModifiedProp    = "dateModified"
	AuthorProp          = "author"
	CreatorProp         = "creator"
	SectionProp         = "articleSection"
	AssociatedMediaProp = "associatedMedia"
	EncodingProp        = "encoding"
	FamilyNameProp      = "familyName"
	GivenNameProp       = "givenName"
	LegalNameProp       = "legalName"
	AuthorRel           = "author"
)

// SchemaType identifies the kind of schema.org item that a type URL maps to.
type SchemaType uint

// Unsupported is deliberately the zero value, so looking up an unknown URL in
// schemaTypeURLs yields Unsupported.
const (
	Unsupported SchemaType = iota
	Image
	Article
	Person
	Organization
)

// schemaTypeURLs maps a schema.org type URL to its SchemaType. Several URLs
// share one SchemaType (e.g. every article variant maps to Article).
//
// NOTE(review): only http:// URLs are listed. Confirm that callers normalise
// https://schema.org/... values before the lookup, otherwise those resolve to
// Unsupported.
var schemaTypeURLs = map[string]SchemaType{
	"http://schema.org/ImageObject":             Image,
	"http://schema.org/Article":                 Article,
	"http://schema.org/BlogPosting":             Article,
	"http://schema.org/NewsArticle":             Article,
	"http://schema.org/ScholarlyArticle":        Article,
	"http://schema.org/TechArticle":             Article,
	"http://schema.org/Person":                  Person,
	"http://schema.org/Organization":            Organization,
	"http://schema.org/Corporation":             Organization,
	"http://schema.org/EducationalOrganization": Organization,
	"http://schema.org/GovernmentOrganization":  Organization,
	"http://schema.org/NGO":                     Organization,
}

// The key for `tagAttributeMap` is the tag name, while the entry value is the
// name of the single attribute in that tag which contains the value for the
// property specified in itemprop. Tags that are not listed have no entry
// here.
var tagAttributeMap = map[string]string{
	"img":    "src",
	"audio":  "src",
	"embed":  "src",
	"iframe": "src",
	"source": "src",
	"track":  "src",
	"video":  "src",
	"a":      "href",
	"link":   "href",
	"area":   "href",
	"meta":   "content",
	"time":   "datetime",
	"object": "data",
	"data":   "value",
	"meter":  "value",
}