├── .gitignore
├── LICENSE-domdistiller.txt
├── internal
├── re2go
│ ├── base.go
│ ├── base.re
│ ├── word-counter.re
│ ├── document-title.re
│ ├── document-title.go
│ ├── dom-converter.re
│ ├── domutil.re
│ └── terminating-blocks.re
├── pagination
│ ├── info
│ │ ├── utils.go
│ │ ├── constant.go
│ │ ├── page-info.go
│ │ └── linear-formula.go
│ ├── parser
│ │ ├── constant.go
│ │ └── param-detector.go
│ ├── pattern
│ │ ├── utils.go
│ │ ├── page-pattern.go
│ │ └── constant.go
│ └── constant.go
├── tableclass
│ ├── table-type.go
│ ├── constant.go
│ └── type-reason.go
├── converter
│ └── utils.go
├── markup
│ ├── schemaorg
│ │ ├── thing-item-unsupported.go
│ │ ├── thing-item-org.go
│ │ ├── thing-item-person.go
│ │ ├── thing-item-image.go
│ │ └── constant.go
│ ├── opengraph
│ │ ├── prefixes.go
│ │ └── constant.go
│ └── accessor.go
├── filter
│ ├── heuristic
│ │ ├── constant_test.go
│ │ ├── list-at-end.go
│ │ ├── heading-fusion.go
│ │ ├── expand-title.go
│ │ └── large-block-around-level.go
│ ├── docfilter
│ │ ├── scorer
│ │ │ ├── image.go
│ │ │ ├── image-ratio.go
│ │ │ ├── image-area.go
│ │ │ ├── image-has-figure_test.go
│ │ │ ├── image-has-figure.go
│ │ │ ├── image-dom-distance_test.go
│ │ │ └── image-dom-distance.go
│ │ ├── relevant-elements.go
│ │ └── nested-element.go
│ ├── filter.go
│ ├── simple
│ │ ├── label-to-boilerplate.go
│ │ └── boilerplate-block.go
│ └── english
│ │ ├── terminating-blocks.go
│ │ ├── terminating-blocks_test.go
│ │ └── num-words.go
├── logutil
│ └── logger.go
├── testutil
│ ├── text-document.go
│ ├── text-block-builder.go
│ ├── page-param-content-info.go
│ ├── html_test.go
│ ├── text-document-builder.go
│ ├── fake-document-builder.go
│ ├── text-builder.go
│ └── document-builder.go
├── extractor
│ └── embed
│ │ ├── embed.go
│ │ ├── constant.go
│ │ └── embed-vimeo_test.go
├── webdoc
│ ├── element.go
│ ├── constant.go
│ ├── figure.go
│ ├── tag.go
│ ├── tag_test.go
│ ├── table.go
│ ├── text-document_test.go
│ ├── video.go
│ ├── embed.go
│ ├── text-document.go
│ ├── table_test.go
│ ├── image.go
│ ├── element-action.go
│ └── image_test.go
├── domutil
│ ├── walker.go
│ ├── tree-clone.go
│ └── tree-clone_test.go
└── label
│ └── label.go
├── Makefile
├── example
├── from-file.go
└── from-url.go
├── LICENSE-boilerpipe.txt
├── NOTICE-boilerpipe.txt
├── go.mod
├── LICENSE
├── IMPROVEMENTS.md
├── data
├── timing-info.go
└── data.go
└── logger.go
/.gitignore:
--------------------------------------------------------------------------------
1 | .vscode/
--------------------------------------------------------------------------------
/LICENSE-domdistiller.txt:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/internal/re2go/base.go:
--------------------------------------------------------------------------------
1 | // Code generated by re2c 3.1, DO NOT EDIT.
2 | package re2go
3 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
.PHONY: generate test

# generate turns every re2c template in internal/re2go into a gofmt'ed Go file
# placed next to it (foo.re -> foo.go). `set -e` aborts on the first failing
# re2go/gofmt call instead of silently moving on to the next template, and the
# sed expression is anchored so only the trailing ".re" extension is rewritten.
generate:
	@set -e; \
	for name in internal/re2go/*.re; do \
		RE_IN=$$name; \
		RE_OUT=$$(echo $$name | sed 's/\.re$$/.go/'); \
		re2go -W -F --input-encoding utf8 --utf8 --no-generation-date -i $$RE_IN -o $$RE_OUT; \
		gofmt -w $$RE_OUT; \
	done

# test regenerates the lexers and then runs the whole Go test suite.
test: generate
	go test -timeout 30s ./...
--------------------------------------------------------------------------------
/example/from-file.go:
--------------------------------------------------------------------------------
1 | // +build ignore
2 |
3 | package main
4 |
5 | import (
6 | "fmt"
7 |
8 | "github.com/go-shiori/dom"
9 | distiller "github.com/markusmobius/go-domdistiller"
10 | )
11 |
12 | func main() {
13 | result, err := distiller.ApplyForFile("example/sample.html", nil)
14 | if err != nil {
15 | panic(err)
16 | }
17 |
18 | rawHTML := dom.OuterHTML(result.Node)
19 | fmt.Println(rawHTML)
20 | }
21 |
--------------------------------------------------------------------------------
/example/from-url.go:
--------------------------------------------------------------------------------
1 | // +build ignore
2 |
3 | package main
4 |
5 | import (
6 | "fmt"
7 | "time"
8 |
9 | "github.com/go-shiori/dom"
10 | distiller "github.com/markusmobius/go-domdistiller"
11 | )
12 |
13 | func main() {
14 | url := "https://arstechnica.com/gadgets/2020/10/iphone-12-and-12-pro-double-review-playing-apples-greatest-hits/"
15 |
16 | // Start distiller
17 | result, err := distiller.ApplyForURL(url, time.Minute, nil)
18 | if err != nil {
19 | panic(err)
20 | }
21 |
22 | rawHTML := dom.OuterHTML(result.Node)
23 | fmt.Println(rawHTML)
24 | }
25 |
--------------------------------------------------------------------------------
/internal/re2go/base.re:
--------------------------------------------------------------------------------
1 | package re2go
2 |
3 | /*!rules:re2c:base_template
4 | re2c:eof = 0;
5 | re2c:yyfill:enable = 0;
6 | re2c:posix-captures = 0;
7 | re2c:case-insensitive = 0;
8 |
9 | re2c:define:YYCTYPE = byte;
10 | re2c:define:YYPEEK = "input[cursor]";
11 | re2c:define:YYSKIP = "cursor++";
12 | re2c:define:YYBACKUP = "marker = cursor";
13 | re2c:define:YYRESTORE = "cursor = marker";
14 | re2c:define:YYLESSTHAN = "limit <= cursor";
15 | re2c:define:YYSTAGP = "@@{tag} = cursor";
16 | re2c:define:YYSTAGN = "@@{tag} = -1";
17 | re2c:define:YYSHIFTSTAG = "@@{tag} += @@{shift}";
18 | */
19 |
--------------------------------------------------------------------------------
/LICENSE-boilerpipe.txt:
--------------------------------------------------------------------------------
1 | boilerpipe
2 |
3 | Copyright (c) 2009-2011 Christian Kohlschütter
4 |
5 | The author licenses this file to You under the Apache License, Version 2.0
6 | (the "License"); you may not use this file except in compliance with
7 | the License. You may obtain a copy of the License at
8 |
9 | http://www.apache.org/licenses/LICENSE-2.0
10 |
11 | Unless required by applicable law or agreed to in writing, software
12 | distributed under the License is distributed on an "AS IS" BASIS,
13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | See the License for the specific language governing permissions and
15 | limitations under the License.
--------------------------------------------------------------------------------
/NOTICE-boilerpipe.txt:
--------------------------------------------------------------------------------
1 | boilerpipe
2 |
3 | Copyright (c) 2009-2011 Christian Kohlschütter
4 |
5 | The author licenses this file to You under the Apache License, Version 2.0
6 | (the "License"); you may not use this file except in compliance with
7 | the License. You may obtain a copy of the License at
8 |
9 | http://www.apache.org/licenses/LICENSE-2.0
10 |
11 | Unless required by applicable law or agreed to in writing, software
12 | distributed under the License is distributed on an "AS IS" BASIS,
13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | See the License for the specific language governing permissions and
15 | limitations under the License.
16 |
17 |
18 | This software contains the following parts which are also provided
19 | under the Apache License 2.0 (http://apache.org/licenses/LICENSE-2.0.txt):
20 |
21 | - NekoHTML
22 | - Xerces
--------------------------------------------------------------------------------
/internal/re2go/word-counter.re:
--------------------------------------------------------------------------------
1 | /*!include:re2c "base.re" */
2 |
3 | // UseFullWordCounter reports whether input contains at least one character matched by the class [\u3040-\uA4CF]; it returns false once the end of input is reached without a match. Original pattern: [\x{3040}-\x{A4CF}]
4 | func UseFullWordCounter(input string) bool {
5 | var cursor, marker int
6 | input += string(rune(0)) // add terminating null
7 | limit := len(input) - 1 // limit points at the terminating null
8 | _ = marker // marker is only touched by YYBACKUP/YYRESTORE; this keeps Go from rejecting it as unused
9 |
10 | for { /*!use:re2c:base_template
11 | re2c:case-insensitive = 1;
12 |
13 | [\u3040-\uA4CF] { return true }
14 | * { continue }
15 | $ { return false }
16 | */
17 | }
18 | }
19 |
20 | // UseLetterWordCounter reports whether input contains at least one character matched by the class [\uAC00-\uD7AF]; it returns false once the end of input is reached without a match. Original pattern: [\x{AC00}-\x{D7AF}]
21 | func UseLetterWordCounter(input string) bool {
22 | var cursor, marker int
23 | input += string(rune(0)) // add terminating null
24 | limit := len(input) - 1 // limit points at the terminating null
25 | _ = marker // marker is only touched by YYBACKUP/YYRESTORE; this keeps Go from rejecting it as unused
26 |
27 | for { /*!use:re2c:base_template
28 | re2c:case-insensitive = 1;
29 |
30 | [\uAC00-\uD7AF] { return true }
31 | * { continue }
32 | $ { return false }
33 | */
34 | }
35 | }
36 |
--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/markusmobius/go-domdistiller
2 |
3 | go 1.20
4 |
5 | require (
6 | github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c
7 | github.com/rs/zerolog v1.33.0
8 | github.com/stretchr/testify v1.7.0
9 | github.com/yosssi/gohtml v0.0.0-20201013000340-ee4748c638f4
10 | golang.org/x/net v0.29.0
11 | golang.org/x/text v0.18.0
12 | )
13 |
14 | require (
15 | github.com/andybalholm/cascadia v1.3.2 // indirect
16 | github.com/davecgh/go-spew v1.1.1 // indirect
17 | github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f // indirect
18 | github.com/kr/text v0.2.0 // indirect
19 | github.com/mattn/go-colorable v0.1.13 // indirect
20 | github.com/mattn/go-isatty v0.0.20 // indirect
21 | github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e // indirect
22 | github.com/pmezard/go-difflib v1.0.0 // indirect
23 | golang.org/x/sys v0.25.0 // indirect
24 | gopkg.in/check.v1 v1.0.0-20200902074654-038fdea0a05b // indirect
25 | gopkg.in/yaml.v3 v3.0.0-20200615113413-eeeca48fe776 // indirect
26 | )
27 |
--------------------------------------------------------------------------------
/internal/re2go/document-title.re:
--------------------------------------------------------------------------------
1 | /*!include:re2c "base.re" */
2 |
3 | import "strings"
4 |
5 | // RemoveDtmCharacters returns a copy of input with every run of '?', '!', '.', '-' or ':' removed; all other bytes are kept in order. Original pattern: (?i)[\?\!\.\-\:]+
6 | func RemoveDtmCharacters(input string) string {
7 | var cursor, marker int
8 | input += string(rune(0)) // add terminating null
9 | limit := len(input) - 1 // limit points at the terminating null
10 | _ = marker // keeps Go from rejecting marker as unused
11 |
12 | // Variable for capturing parentheses (twice the number of groups).
13 | /*!maxnmatch:re2c*/
14 | yypmatch := make([]int, YYMAXNMATCH*2)
15 | var yynmatch int
16 | _ = yynmatch
17 |
18 | // Autogenerated tag variables used by the lexer to track tag values.
19 | /*!stags:re2c format = 'var @@ int; _ = @@\n'; */
20 |
21 | var start int // index of the first byte not yet copied into sb
22 | var sb strings.Builder
23 | for { /*!use:re2c:base_template
24 | re2c:posix-captures = 1;
25 |
26 | [?!.\-:]+ {
27 | sb.WriteString(input[start:yypmatch[0]])
28 | start = yypmatch[1]
29 | continue
30 | }
31 |
32 | $ {
33 | sb.WriteString(input[start:limit])
34 | return sb.String()
35 | }
36 |
37 | * { continue }
38 | */
39 | }
40 | }
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 Markus Mobius
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/internal/pagination/info/utils.go:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2020 Markus Mobius
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to deal
5 | // in the Software without restriction, including without limitation the rights
6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | // copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in all
11 | // copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19 | // SOFTWARE.
20 |
21 | package info
22 |
// maxInt returns the larger of a and b. When both are equal that
// common value is returned.
func maxInt(a, b int) int {
	if b >= a {
		return b
	}
	return a
}
29 |
--------------------------------------------------------------------------------
/internal/pagination/parser/constant.go:
--------------------------------------------------------------------------------
1 | // ORIGINAL: java/PageParameterDetector.java
2 |
3 | // Copyright (c) 2020 Markus Mobius
4 | //
5 | // Permission is hereby granted, free of charge, to any person obtaining a copy
6 | // of this software and associated documentation files (the "Software"), to deal
7 | // in the Software without restriction, including without limitation the rights
8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | // copies of the Software, and to permit persons to whom the Software is
10 | // furnished to do so, subject to the following conditions:
11 | //
12 | // The above copyright notice and this permission notice shall be included in all
13 | // copies or substantial portions of the Software.
14 | //
15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | // SOFTWARE.
22 |
23 | // Copyright 2015 The Chromium Authors. All rights reserved.
24 | // Use of this source code is governed by a BSD-style license that can be
25 | // found in the LICENSE file.
26 |
27 | package parser
28 |
29 | const (
30 | MaxPagingDocs = 100
31 | )
32 |
--------------------------------------------------------------------------------
/internal/tableclass/table-type.go:
--------------------------------------------------------------------------------
1 | // ORIGINAL: java/TableClassifier.java
2 |
3 | // Copyright (c) 2020 Markus Mobius
4 | //
5 | // Permission is hereby granted, free of charge, to any person obtaining a copy
6 | // of this software and associated documentation files (the "Software"), to deal
7 | // in the Software without restriction, including without limitation the rights
8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | // copies of the Software, and to permit persons to whom the Software is
10 | // furnished to do so, subject to the following conditions:
11 | //
12 | // The above copyright notice and this permission notice shall be included in all
13 | // copies or substantial portions of the Software.
14 | //
15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | // SOFTWARE.
22 |
23 | // Copyright 2015 The Chromium Authors. All rights reserved.
24 | // Use of this source code is governed by a BSD-style license that can be
25 | // found in the LICENSE file.
26 |
27 | package tableclass
28 |
// Type is the classification assigned to a table.
type Type uint

// Table classifications, numbered from zero in declaration order.
const (
	Data Type = iota
	Layout
)

// String returns "Data" or "Layout" for the matching constant and an
// empty string for any other value.
func (t Type) String() string {
	if t == Data {
		return "Data"
	}
	if t == Layout {
		return "Layout"
	}
	return ""
}
45 |
--------------------------------------------------------------------------------
/internal/pagination/pattern/utils.go:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2020 Markus Mobius
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to deal
5 | // in the Software without restriction, including without limitation the rights
6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | // copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in all
11 | // copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19 | // SOFTWARE.
20 |
21 | package pattern
22 |
23 | import (
24 | nurl "net/url"
25 | )
26 |
// replaceUrlQueryValue returns a copy of url in which the query parameter
// queryName is set to queryValue, replacing any existing values for that name.
// The URL passed in is not modified. The query string of the returned URL is
// re-encoded, so the remaining parameters are preserved but sorted by key.
func replaceUrlQueryValue(url *nurl.URL, queryName string, queryValue string) *nurl.URL {
	// Work on a shallow copy so the caller's URL keeps its original RawQuery.
	clonedURL := *url
	queries := clonedURL.Query()
	// Honour the caller-supplied value instead of a hard-coded placeholder.
	queries.Set(queryName, queryValue)
	clonedURL.RawQuery = queries.Encode()
	return &clonedURL
}
36 |
--------------------------------------------------------------------------------
/internal/converter/utils.go:
--------------------------------------------------------------------------------
1 | package converter
2 |
3 | import (
4 | "strings"
5 |
6 | "github.com/go-shiori/dom"
7 | "github.com/markusmobius/go-domdistiller/internal/re2go"
8 | "github.com/markusmobius/go-domdistiller/internal/stringutil"
9 | "golang.org/x/net/html"
10 | )
11 |
// unlikelyRoles is a set of role names, stored as map keys with empty values
// so membership can be tested with a single lookup.
// NOTE(review): presumably compared against an element's "role" attribute;
// the lookup site is not in this part of the file.
var unlikelyRoles = map[string]struct{}{
	"alert":         {},
	"alertdialog":   {},
	"complementary": {},
	"dialog":        {},
	"menu":          {},
	"menubar":       {},
	"navigation":    {},
}
23 |
24 | // isElementWithoutContent determines if node is empty
25 | // or only filled with and
.
26 | func isElementWithoutContent(node *html.Node) bool {
27 | brs := dom.GetElementsByTagName(node, "br")
28 | hrs := dom.GetElementsByTagName(node, "hr")
29 | childs := dom.Children(node)
30 |
31 | return node.Type == html.ElementNode &&
32 | strings.TrimSpace(dom.TextContent(node)) == "" &&
33 | (len(childs) == 0 || len(childs) == len(brs)+len(hrs))
34 | }
35 |
36 | func isByline(node *html.Node, matchString string) bool {
37 | rel := dom.GetAttribute(node, "rel")
38 | itemprop := dom.GetAttribute(node, "itemprop")
39 | nodeText := dom.TextContent(node)
40 | if (rel == "author" || strings.Contains(itemprop, "author") || re2go.IsByline(matchString)) &&
41 | isValidByline(nodeText) {
42 | return true
43 | }
44 |
45 | return false
46 | }
47 |
48 | // isValidByline checks whether the input string could be a byline.
49 | // This verifies that the input is a string, and that the length
50 | // is less than 100 chars.
51 | func isValidByline(byline string) bool {
52 | byline = strings.TrimSpace(byline)
53 | nChar := stringutil.CharCount(byline)
54 | return nChar > 0 && nChar < 100
55 | }
56 |
--------------------------------------------------------------------------------
/internal/markup/schemaorg/thing-item-unsupported.go:
--------------------------------------------------------------------------------
1 | // ORIGINAL: java/SchemaOrgParser.java
2 |
3 | // Copyright (c) 2020 Markus Mobius
4 | //
5 | // Permission is hereby granted, free of charge, to any person obtaining a copy
6 | // of this software and associated documentation files (the "Software"), to deal
7 | // in the Software without restriction, including without limitation the rights
8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | // copies of the Software, and to permit persons to whom the Software is
10 | // furnished to do so, subject to the following conditions:
11 | //
12 | // The above copyright notice and this permission notice shall be included in all
13 | // copies or substantial portions of the Software.
14 | //
15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | // SOFTWARE.
22 |
23 | // Copyright 2014 The Chromium Authors. All rights reserved.
24 | // Use of this source code is governed by a BSD-style license that can be
25 | // found in the LICENSE file.
26 |
27 | package schemaorg
28 |
29 | import (
30 | "golang.org/x/net/html"
31 | )
32 |
33 | type UnsupportedItem struct {
34 | BaseThingItem
35 | }
36 |
37 | func NewUnsupportedItem(element *html.Node) *UnsupportedItem {
38 | item := &UnsupportedItem{}
39 | item.init(Unsupported, element)
40 | return item
41 | }
42 |
--------------------------------------------------------------------------------
/internal/filter/heuristic/constant_test.go:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2020 Markus Mobius
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to deal
5 | // in the Software without restriction, including without limitation the rights
6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | // copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in all
11 | // copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19 | // SOFTWARE.
20 |
21 | package heuristic_test
22 |
// Text fixtures declared for the heuristic_test package.
const (
	// titleText is a sample document title.
	titleText = "I am the document title"

	// contentText is a single sentence of filler content.
	contentText = "Lorem Ipsum Lorem Ipsum Lorem Ipsum."

	// longText is contentText repeated three times, separated by single spaces.
	longText = contentText + " " + contentText + " " + contentText

	// longLeadingText is a longer paragraph meant to open a document.
	longLeadingText = "Leading text that's used to start a document but just to offset a " +
		"few text blocks. This will allow testing in-page merges."

	// shortText is a short sentence that could pass for a header.
	shortText = "I might be a header."

	// headingText is a one-word heading.
	headingText = "Heading"
)
40 |
--------------------------------------------------------------------------------
/internal/re2go/document-title.go:
--------------------------------------------------------------------------------
1 | // Code generated by re2c 3.1, DO NOT EDIT.
2 | package re2go
3 |
4 | import "strings"
5 |
6 | // RemoveDtmCharacters returns a copy of input with every run of '?', '!', '.', '-' or ':' removed; all other bytes are kept in order. Original pattern: (?i)[\?\!\.\-\:]+
7 | func RemoveDtmCharacters(input string) string {
8 | var cursor, marker int
9 | input += string(rune(0)) // add terminating null
10 | limit := len(input) - 1 // limit points at the terminating null
11 | _ = marker // keeps Go from rejecting marker as unused
12 |
13 | // Variable for capturing parentheses (twice the number of groups).
14 | var YYMAXNMATCH int = 1
15 |
16 | yypmatch := make([]int, YYMAXNMATCH*2)
17 | var yynmatch int
18 | _ = yynmatch
19 |
20 | // Autogenerated tag variables used by the lexer to track tag values.
21 | var yyt1 int
22 | _ = yyt1
23 |
24 | var start int // index of the first byte not yet copied into sb
25 | var sb strings.Builder
26 | for {
27 | {
28 | var yych byte
29 | yych = input[cursor]
30 | switch yych {
31 | case '!':
32 | fallthrough
33 | case '-', '.':
34 | fallthrough
35 | case ':':
36 | fallthrough
37 | case '?':
38 | yyt1 = cursor // remember where the run of removable characters begins
39 | goto yy2
40 | default:
41 | if limit <= cursor { // reached the terminating null: finish up
42 | goto yy4
43 | }
44 | goto yy1
45 | }
46 | yy1: // ordinary byte: step over it and rescan
47 | cursor++
48 | {
49 | continue
50 | }
51 | yy2: // inside a run: keep consuming while the next byte is also removable
52 | cursor++
53 | yych = input[cursor]
54 | switch yych {
55 | case '!':
56 | fallthrough
57 | case '-', '.':
58 | fallthrough
59 | case ':':
60 | fallthrough
61 | case '?':
62 | goto yy2
63 | default:
64 | goto yy3
65 | }
66 | yy3: // run ended: record its bounds and drop it from the output
67 | yynmatch = 1
68 | yypmatch[0] = yyt1
69 | yypmatch[1] = cursor
70 | {
71 | sb.WriteString(input[start:yypmatch[0]])
72 | start = yypmatch[1]
73 | continue
74 | }
75 | yy4: // end of input: flush the remaining tail (without the added null)
76 | {
77 | sb.WriteString(input[start:limit])
78 | return sb.String()
79 | }
80 | }
81 |
82 | }
83 | }
84 |
--------------------------------------------------------------------------------
/internal/pagination/info/constant.go:
--------------------------------------------------------------------------------
1 | // ORIGINAL: java/PageParamInfo.java
2 |
3 | // Copyright (c) 2020 Markus Mobius
4 | //
5 | // Permission is hereby granted, free of charge, to any person obtaining a copy
6 | // of this software and associated documentation files (the "Software"), to deal
7 | // in the Software without restriction, including without limitation the rights
8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | // copies of the Software, and to permit persons to whom the Software is
10 | // furnished to do so, subject to the following conditions:
11 | //
12 | // The above copyright notice and this permission notice shall be included in all
13 | // copies or substantial portions of the Software.
14 | //
15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | // SOFTWARE.
22 |
23 | // Copyright 2015 The Chromium Authors. All rights reserved.
24 | // Use of this source code is governed by a BSD-style license that can be
25 | // found in the LICENSE file.
26 |
27 | package info
28 |
// minLinksToJustifyLinearMap is a link-count threshold.
// NOTE(review): its consumer is not visible in this file.
const minLinksToJustifyLinearMap = 2

// ParamType is types of page parameter values in paging URLs.
type ParamType uint

const (
	// Unset is the initialized type to indicate empty PageParamInfo.
	Unset ParamType = iota
	// PageNumber means the value is a page number.
	PageNumber
	// Unknown means none of the above.
	Unknown
)
41 |
--------------------------------------------------------------------------------
/internal/logutil/logger.go:
--------------------------------------------------------------------------------
1 | // ORIGINAL: java/LogUtil.java
2 |
3 | // Copyright (c) 2020 Markus Mobius
4 | //
5 | // Permission is hereby granted, free of charge, to any person obtaining a copy
6 | // of this software and associated documentation files (the "Software"), to deal
7 | // in the Software without restriction, including without limitation the rights
8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | // copies of the Software, and to permit persons to whom the Software is
10 | // furnished to do so, subject to the following conditions:
11 | //
12 | // The above copyright notice and this permission notice shall be included in all
13 | // copies or substantial portions of the Software.
14 | //
15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | // SOFTWARE.
22 |
23 | // Copyright 2014 The Chromium Authors. All rights reserved.
24 | // Use of this source code is governed by a BSD-style license that can be
25 | // found in the LICENSE file.
26 |
27 | package logutil
28 |
// Logger is the base interface for logging process of distiller.
type Logger interface {
	// InternallyNil returns a boolean.
	// NOTE(review): presumably true when the underlying logger is nil;
	// confirm against the implementations.
	InternallyNil() bool

	// Boolean queries, one per log category (extraction, visibility,
	// pagination, timing).
	IsLogExtraction() bool
	IsLogVisibility() bool
	IsLogPagination() bool
	IsLogTiming() bool

	// Print methods, one per log category; each accepts arbitrary values.
	PrintExtractionInfo(args ...interface{})
	PrintVisibilityInfo(args ...interface{})
	PrintPaginationInfo(args ...interface{})
	PrintTimingInfo(args ...interface{})
}
42 |
--------------------------------------------------------------------------------
/internal/testutil/text-document.go:
--------------------------------------------------------------------------------
1 | // ORIGINAL: javatest/document/TextDocumentTestUtil.java
2 |
3 | // Copyright (c) 2020 Markus Mobius
4 | //
5 | // Permission is hereby granted, free of charge, to any person obtaining a copy
6 | // of this software and associated documentation files (the "Software"), to deal
7 | // in the Software without restriction, including without limitation the rights
8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | // copies of the Software, and to permit persons to whom the Software is
10 | // furnished to do so, subject to the following conditions:
11 | //
12 | // The above copyright notice and this permission notice shall be included in all
13 | // copies or substantial portions of the Software.
14 | //
15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | // SOFTWARE.
22 |
23 | // Copyright 2015 The Chromium Authors. All rights reserved.
24 | // Use of this source code is governed by a BSD-style license that can be
25 | // found in the LICENSE file.
26 |
27 | package testutil
28 |
29 | import (
30 | "strings"
31 |
32 | "github.com/markusmobius/go-domdistiller/internal/webdoc"
33 | )
34 |
35 | func GetContentFromTextDocument(doc *webdoc.TextDocument) string {
36 | var buffer strings.Builder
37 | for _, tb := range doc.TextBlocks {
38 | if tb.IsContent() {
39 | buffer.WriteString(tb.Text)
40 | buffer.WriteString("\n")
41 | }
42 | }
43 | return buffer.String()
44 | }
45 |
--------------------------------------------------------------------------------
/IMPROVEMENTS.md:
--------------------------------------------------------------------------------
1 | # Improvements
2 |
3 | After using both Readability.js and DOM Distiller, we found that there are several improvements that can be implemented into this port. Besides that, from our experiments we also found some possible bugs that we decided to fix.
4 |
5 | These so-called improvements are listed here as historical documentation and to explain the difference between the main branch and stable branch.
6 |
7 | ## From Readability
8 |
9 | - Implement a function to check whether an HTML element is probably visible. This is especially useful since one of DOM Distiller's strategies is to exclude invisible elements by computing the stylesheets (which is impossible to do in Go).
10 | - Exclude form and input elements, since in distilled mode we only want to read.
11 | - Skip byline, empty div and unlikely elements by checking their class name, id and role attributes.
12 | - Convert anchors with Javascript URL into an ordinary text node.
13 | - Convert font elements to span elements. This is done because font elements are usually only used for styling, so Readability.js decided to convert them.
14 | - Exclude identification and presentational attributes (e.g. `id`, `class` and `style`) from each element.
15 |
16 | ## From our own experiments
17 |
18 | - Make sure a figure's caption doesn't contain noscript elements. This is done because noscript in Go is a bit weird: sometimes it is detected as an HTML element while other times it is detected as plain text, so we need additional checks to clean it.
19 | - Mark large blocks around main content's tag level as content as well. In original DOM Distiller, they are looking for the most likely main content, then they mark text blocks that exist in the same tag level of the main content as content as well. Unfortunately, we found out that in some sites parts of the article are omitted by DOM Distiller. To fix this, we decided to make the filter more tolerant by checking text blocks in lower and upper tag levels as well.
--------------------------------------------------------------------------------
/internal/extractor/embed/embed.go:
--------------------------------------------------------------------------------
1 | // ORIGINAL: java/extractors/embeds/EmbedExtractor.java
2 |
3 | // Copyright (c) 2020 Markus Mobius
4 | //
5 | // Permission is hereby granted, free of charge, to any person obtaining a copy
6 | // of this software and associated documentation files (the "Software"), to deal
7 | // in the Software without restriction, including without limitation the rights
8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | // copies of the Software, and to permit persons to whom the Software is
10 | // furnished to do so, subject to the following conditions:
11 | //
12 | // The above copyright notice and this permission notice shall be included in all
13 | // copies or substantial portions of the Software.
14 | //
15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | // SOFTWARE.
22 |
23 | // Copyright 2015 The Chromium Authors. All rights reserved.
24 | // Use of this source code is governed by a BSD-style license that can be
25 | // found in the LICENSE file.
26 |
27 | package embed
28 |
29 | import (
30 | "github.com/markusmobius/go-domdistiller/internal/webdoc"
31 | "golang.org/x/net/html"
32 | )
33 |
// EmbedExtractor is the interface for extracting embedded nodes into webdoc.Element.
type EmbedExtractor interface {
	// RelevantTagNames returns a set of HTML tag names that are relevant to this extractor.
	RelevantTagNames() []string
	// Extract detects whether node should be extracted as an embedded element
	// and returns that element; if the node should not be extracted it
	// returns nil.
	Extract(node *html.Node) webdoc.Element
}
41 |
--------------------------------------------------------------------------------
/internal/pagination/info/page-info.go:
--------------------------------------------------------------------------------
1 | // ORIGINAL: java/PageParamInfo.java and
2 | // java/MonotonicPageInfosGroups.java
3 |
4 | // Copyright (c) 2020 Markus Mobius
5 | //
6 | // Permission is hereby granted, free of charge, to any person obtaining a copy
7 | // of this software and associated documentation files (the "Software"), to deal
8 | // in the Software without restriction, including without limitation the rights
9 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | // copies of the Software, and to permit persons to whom the Software is
11 | // furnished to do so, subject to the following conditions:
12 | //
13 | // The above copyright notice and this permission notice shall be included in all
14 | // copies or substantial portions of the Software.
15 | //
16 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | // SOFTWARE.
23 |
24 | // Copyright 2015 The Chromium Authors. All rights reserved.
25 | // Use of this source code is governed by a BSD-style license that can be
26 | // found in the LICENSE file.
27 |
28 | package info
29 |
30 | import "fmt"
31 |
// PageInfo stores potential pagination info:
// - the page number, represented as original plain text in the document URL;
// - if the info is extracted from an anchor, its href.
type PageInfo struct {
	PageNumber int
	URL        string
}

// String formats the info as "pg<PageNumber>: <URL>".
func (pi *PageInfo) String() string {
	// Every adjacent operand pair contains a string, so Sprint inserts no
	// extra spaces between them.
	return fmt.Sprint("pg", pi.PageNumber, ": ", pi.URL)
}
43 |
// PageInfoGroup bundles a list of PageInfo pointers with an integer DeltaSign.
//
// NOTE(review): going by the ORIGINAL reference at the top of this file
// (MonotonicPageInfosGroups.java), DeltaSign presumably holds the sign of the
// page-number difference between adjacent entries of List — confirm against
// the code that populates it.
type PageInfoGroup struct {
	List      []*PageInfo
	DeltaSign int
}
48 |
--------------------------------------------------------------------------------
/internal/pagination/info/linear-formula.go:
--------------------------------------------------------------------------------
1 | // ORIGINAL: java/PageParamInfo.java
2 |
3 | // Copyright (c) 2020 Markus Mobius
4 | //
5 | // Permission is hereby granted, free of charge, to any person obtaining a copy
6 | // of this software and associated documentation files (the "Software"), to deal
7 | // in the Software without restriction, including without limitation the rights
8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | // copies of the Software, and to permit persons to whom the Software is
10 | // furnished to do so, subject to the following conditions:
11 | //
12 | // The above copyright notice and this permission notice shall be included in all
13 | // copies or substantial portions of the Software.
14 | //
15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | // SOFTWARE.
22 |
23 | // Copyright 2015 The Chromium Authors. All rights reserved.
24 | // Use of this source code is governed by a BSD-style license that can be
25 | // found in the LICENSE file.
26 |
27 | package info
28 |
29 | import (
30 | "fmt"
31 | )
32 |
// LinearFormula stores the coefficient and delta values of the linear formula:
// pageParamValue = coefficient * pageNum + delta.
type LinearFormula struct {
	Coefficient int
	Delta       int
}

// NewLinearFormula returns a pointer to a new LinearFormula holding the given
// coefficient and delta.
func NewLinearFormula(coefficient, delta int) *LinearFormula {
	formula := new(LinearFormula)
	formula.Coefficient = coefficient
	formula.Delta = delta
	return formula
}

// String formats the formula as "coefficient=<Coefficient>, delta=<Delta>".
func (lf *LinearFormula) String() string {
	// Every adjacent operand pair contains a string, so Sprint inserts no
	// extra spaces between them.
	return fmt.Sprint("coefficient=", lf.Coefficient, ", delta=", lf.Delta)
}
50 |
--------------------------------------------------------------------------------
/internal/filter/docfilter/scorer/image.go:
--------------------------------------------------------------------------------
1 | // ORIGINAL: java/webdocument/filters/images/ImageScorer.java
2 |
3 | // Copyright (c) 2020 Markus Mobius
4 | //
5 | // Permission is hereby granted, free of charge, to any person obtaining a copy
6 | // of this software and associated documentation files (the "Software"), to deal
7 | // in the Software without restriction, including without limitation the rights
8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | // copies of the Software, and to permit persons to whom the Software is
10 | // furnished to do so, subject to the following conditions:
11 | //
12 | // The above copyright notice and this permission notice shall be included in all
13 | // copies or substantial portions of the Software.
14 | //
15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | // SOFTWARE.
22 |
23 | // Copyright 2014 The Chromium Authors. All rights reserved.
24 | // Use of this source code is governed by a BSD-style license that can be
25 | // found in the LICENSE file.
26 |
27 | package scorer
28 |
29 | import "golang.org/x/net/html"
30 |
// ImageScorer represents a single heuristic used in image extraction.
// The provided image is given a score based on the heuristic and a max score.
type ImageScorer interface {
	// GetImageScore gives the image node e a score based on the heuristic
	// implemented by this ImageScorer and on what the max score is set to.
	GetImageScore(e *html.Node) int

	// GetMaxScore returns the maximum possible score that this ImageScorer can return.
	GetMaxScore() int
}
41 |
--------------------------------------------------------------------------------
/data/timing-info.go:
--------------------------------------------------------------------------------
1 | // ORIGINAL: Protobuf model in proto/dom_distiller.proto
2 |
3 | // Copyright (c) 2020 Markus Mobius
4 | //
5 | // Permission is hereby granted, free of charge, to any person obtaining a copy
6 | // of this software and associated documentation files (the "Software"), to deal
7 | // in the Software without restriction, including without limitation the rights
8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | // copies of the Software, and to permit persons to whom the Software is
10 | // furnished to do so, subject to the following conditions:
11 | //
12 | // The above copyright notice and this permission notice shall be included in all
13 | // copies or substantial portions of the Software.
14 | //
15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | // SOFTWARE.
22 |
23 | package data
24 |
25 | import "time"
26 |
// TimingEntry pairs a name with a duration.
type TimingEntry struct {
	Name string
	Time time.Duration
}

// TimingInfo holds a set of fixed, named durations plus a list of arbitrary
// extra entries.
type TimingInfo struct {
	MarkupParsingTime        time.Duration
	DocumentConstructionTime time.Duration
	ArticleProcessingTime    time.Duration
	FormattingTime           time.Duration
	TotalTime                time.Duration

	// A place to hold arbitrary breakdowns of time. The perf scoring/server
	// should display these entries with appropriate names.
	OtherTimes []TimingEntry
}

// AddEntry appends an entry called name to OtherTimes whose Time is the
// duration elapsed since start. Calling it on a nil receiver does nothing.
func (ti *TimingInfo) AddEntry(start time.Time, name string) {
	if ti != nil {
		elapsed := time.Since(start)
		ti.OtherTimes = append(ti.OtherTimes, TimingEntry{Name: name, Time: elapsed})
	}
}
54 |
--------------------------------------------------------------------------------
/internal/webdoc/element.go:
--------------------------------------------------------------------------------
1 | // ORIGINAL: java/webdocument/WebElement.java
2 |
3 | // Copyright (c) 2020 Markus Mobius
4 | //
5 | // Permission is hereby granted, free of charge, to any person obtaining a copy
6 | // of this software and associated documentation files (the "Software"), to deal
7 | // in the Software without restriction, including without limitation the rights
8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | // copies of the Software, and to permit persons to whom the Software is
10 | // furnished to do so, subject to the following conditions:
11 | //
12 | // The above copyright notice and this permission notice shall be included in all
13 | // copies or substantial portions of the Software.
14 | //
15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | // SOFTWARE.
22 |
23 | // Copyright 2015 The Chromium Authors. All rights reserved.
24 | // Use of this source code is governed by a BSD-style license that can be
25 | // found in the LICENSE file.
26 |
27 | package webdoc
28 |
// Element is some logical part of a web document (text block, image, video, table, etc.)
type Element interface {
	// GenerateOutput generates HTML output for this Element.
	// NOTE(review): textOnly presumably requests plain text instead of HTML
	// markup — confirm against the implementations.
	GenerateOutput(textOnly bool) string
	// IsContent returns the element's content flag.
	IsContent() bool
	// SetIsContent sets the element's content flag.
	SetIsContent(bool)
	// ElementType returns a string naming the kind of element; the possible
	// values are defined by the implementations.
	ElementType() string
	// String returns a string representation of the element.
	String() string
}
38 |
// BaseElement is base of any other element. It stores the content flag and
// supplies the IsContent / SetIsContent accessors for it.
type BaseElement struct {
	isContent bool
}

// IsContent returns the current value of the content flag.
func (e *BaseElement) IsContent() bool {
	return e.isContent
}

// SetIsContent overwrites the content flag with b.
func (e *BaseElement) SetIsContent(b bool) {
	e.isContent = b
}
51 |
--------------------------------------------------------------------------------
/internal/markup/schemaorg/thing-item-org.go:
--------------------------------------------------------------------------------
1 | // ORIGINAL: java/SchemaOrgParser.java
2 |
3 | // Copyright (c) 2020 Markus Mobius
4 | //
5 | // Permission is hereby granted, free of charge, to any person obtaining a copy
6 | // of this software and associated documentation files (the "Software"), to deal
7 | // in the Software without restriction, including without limitation the rights
8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | // copies of the Software, and to permit persons to whom the Software is
10 | // furnished to do so, subject to the following conditions:
11 | //
12 | // The above copyright notice and this permission notice shall be included in all
13 | // copies or substantial portions of the Software.
14 | //
15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | // SOFTWARE.
22 |
23 | // Copyright 2014 The Chromium Authors. All rights reserved.
24 | // Use of this source code is governed by a BSD-style license that can be
25 | // found in the LICENSE file.
26 |
27 | package schemaorg
28 |
29 | import (
30 | "golang.org/x/net/html"
31 | )
32 |
// OrganizationItem is a thing item that embeds BaseThingItem; it is
// initialised with the Organization type by NewOrganizationItem.
type OrganizationItem struct {
	BaseThingItem
}
36 |
37 | func NewOrganizationItem(element *html.Node) *OrganizationItem {
38 | item := &OrganizationItem{}
39 | item.init(Organization, element)
40 | item.addStringPropertyName(LegalNameProp)
41 | return item
42 | }
43 |
44 | func (oi *OrganizationItem) getName() string {
45 | // Returns either the value of NameProp, or LegalNameProp.
46 | if name := oi.getStringProperty(NameProp); name != "" {
47 | return name
48 | }
49 |
50 | return oi.getStringProperty(LegalNameProp)
51 | }
52 |
--------------------------------------------------------------------------------
/internal/re2go/dom-converter.re:
--------------------------------------------------------------------------------
1 | /*!include:re2c "base.re" */
2 |
// Original pattern: (?i)-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote
//
// IsUnlikelyCandidates scans input with the re2c rules below, which are
// compiled case-insensitively: it returns true when the `unlikely` pattern
// matches, keeps looping on any other input, and returns false once the end
// of the input is reached.
//
// NOTE(review): cursor, marker and limit are not referenced by the
// hand-written code in this function; presumably the code re2c generates from
// base_template uses them — confirm in base.re.
func IsUnlikelyCandidates(input string) bool {
	var cursor, marker int
	input += string(rune(0)) // add terminating null
	limit := len(input) - 1 // limit points at the terminating null
	_ = marker

	for { /*!use:re2c:base_template
		re2c:case-insensitive = 1;

		unlikely = -ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote;

		{unlikely} { return true }
		* { continue }
		$ { return false }
		*/
	}
}
21 |
// Original pattern: (?i)and|article|body|column|content|main|shadow
//
// MaybeItsACandidate scans input with the re2c rules below, which are compiled
// case-insensitively: it returns true when the `maybe` pattern matches, keeps
// looping on any other input, and returns false once the end of the input is
// reached.
//
// NOTE(review): cursor, marker and limit are not referenced by the
// hand-written code in this function; presumably the code re2c generates from
// base_template uses them — confirm in base.re.
func MaybeItsACandidate(input string) bool {
	var cursor, marker int
	input += string(rune(0)) // add terminating null
	limit := len(input) - 1 // limit points at the terminating null
	_ = marker

	for { /*!use:re2c:base_template
		re2c:case-insensitive = 1;

		maybe = and|article|body|column|content|main|shadow;

		{maybe} { return true }
		* { continue }
		$ { return false }
		*/
	}
}
40 |
// Original pattern: (?i)byline|author|dateline|writtenby|p-author
//
// IsByline scans input with the re2c rules below, which are compiled
// case-insensitively: it returns true when the `byline` pattern matches, keeps
// looping on any other input, and returns false once the end of the input is
// reached.
//
// NOTE(review): cursor, marker and limit are not referenced by the
// hand-written code in this function; presumably the code re2c generates from
// base_template uses them — confirm in base.re.
func IsByline(input string) bool {
	var cursor, marker int
	input += string(rune(0)) // add terminating null
	limit := len(input) - 1 // limit points at the terminating null
	_ = marker

	for { /*!use:re2c:base_template
		re2c:case-insensitive = 1;

		byline = byline|author|dateline|writtenby|p-author;

		{byline} { return true }
		* { continue }
		$ { return false }
		*/
	}
}
--------------------------------------------------------------------------------
/internal/filter/docfilter/relevant-elements.go:
--------------------------------------------------------------------------------
1 | // ORIGINAL: java/webdocument/filters/RelevantElements.java
2 |
3 | // Copyright (c) 2020 Markus Mobius
4 | //
5 | // Permission is hereby granted, free of charge, to any person obtaining a copy
6 | // of this software and associated documentation files (the "Software"), to deal
7 | // in the Software without restriction, including without limitation the rights
8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | // copies of the Software, and to permit persons to whom the Software is
10 | // furnished to do so, subject to the following conditions:
11 | //
12 | // The above copyright notice and this permission notice shall be included in all
13 | // copies or substantial portions of the Software.
14 | //
15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | // SOFTWARE.
22 |
23 | // Copyright 2015 The Chromium Authors. All rights reserved.
24 | // Use of this source code is governed by a BSD-style license that can be
25 | // found in the LICENSE file.
26 |
27 | package docfilter
28 |
29 | import "github.com/markusmobius/go-domdistiller/internal/webdoc"
30 |
// RelevantElements is a document filter without any state of its own.
type RelevantElements struct{}

// NewRelevantElements returns a pointer to a new RelevantElements filter.
func NewRelevantElements() *RelevantElements {
	return new(RelevantElements)
}
36 |
37 | func (f *RelevantElements) Process(doc *webdoc.Document) bool {
38 | changes := false
39 | inContent := false
40 |
41 | for _, e := range doc.Elements {
42 | if e.IsContent() {
43 | inContent = true
44 | } else if _, isText := e.(*webdoc.Text); isText {
45 | inContent = false
46 | } else {
47 | if inContent {
48 | e.SetIsContent(true)
49 | changes = true
50 | }
51 | }
52 | }
53 |
54 | return changes
55 | }
56 |
--------------------------------------------------------------------------------
/data/data.go:
--------------------------------------------------------------------------------
1 | // ORIGINAL: Protobuf model in proto/dom_distiller.proto
2 |
3 | // Copyright (c) 2020 Markus Mobius
4 | //
5 | // Permission is hereby granted, free of charge, to any person obtaining a copy
6 | // of this software and associated documentation files (the "Software"), to deal
7 | // in the Software without restriction, including without limitation the rights
8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | // copies of the Software, and to permit persons to whom the Software is
10 | // furnished to do so, subject to the following conditions:
11 | //
12 | // The above copyright notice and this permission notice shall be included in all
13 | // copies or substantial portions of the Software.
14 | //
15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | // SOFTWARE.
22 |
23 | package data
24 |
// PaginationInfo holds the next-page and previous-page values of a document.
// NOTE(review): both fields are presumably URLs — confirm against the code
// that fills them.
type PaginationInfo struct {
	NextPage string
	PrevPage string
}
29 |
// MarkupArticle contains the properties of an article document.
// The time fields are kept as strings exactly as declared here; their format
// is not defined in this file.
type MarkupArticle struct {
	PublishedTime  string
	ModifiedTime   string
	ExpirationTime string
	Section        string
	Authors        []string
}
38 |
// MarkupImage contains the properties of an image in the document.
// NOTE(review): Width and Height are presumably in pixels — confirm against
// the markup parsers that populate this struct.
type MarkupImage struct {
	Root      string
	URL       string
	SecureURL string
	Type      string
	Caption   string
	Width     int
	Height    int
}
49 |
// MarkupInfo gathers document-level markup properties together with the
// article properties (Article) and the list of images (Images).
type MarkupInfo struct {
	Title       string
	Type        string
	URL         string
	Description string
	Publisher   string
	Copyright   string
	Author      string
	Article     MarkupArticle
	Images      []MarkupImage
}
61 |
--------------------------------------------------------------------------------
/internal/webdoc/constant.go:
--------------------------------------------------------------------------------
1 | // ORIGINAL: java/webdocument/WebText.java, java/webdocument/WebTag.java,
2 | // java/webdocument/WebImage.java
3 |
4 | // Copyright (c) 2020 Markus Mobius
5 | //
6 | // Permission is hereby granted, free of charge, to any person obtaining a copy
7 | // of this software and associated documentation files (the "Software"), to deal
8 | // in the Software without restriction, including without limitation the rights
9 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | // copies of the Software, and to permit persons to whom the Software is
11 | // furnished to do so, subject to the following conditions:
12 | //
13 | // The above copyright notice and this permission notice shall be included in all
14 | // copies or substantial portions of the Software.
15 | //
16 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | // SOFTWARE.
23 |
24 | // Copyright 2015 The Chromium Authors. All rights reserved.
25 | // Use of this source code is governed by a BSD-style license that can be
26 | // found in the LICENSE file.
27 |
28 | package webdoc
29 |
// TagType enumerates the kinds of tag: TagStart (0) and TagEnd (1).
type TagType uint

const (
	// TagStart is the zero value of TagType.
	TagStart TagType = iota
	// TagEnd follows TagStart.
	TagEnd
)
36 |
// lazyImageAttrs maps an attribute name to another attribute name; the only
// entry maps "data-srcset" to "srcset".
// NOTE(review): presumably used to copy lazy-loading attributes onto their
// standard counterparts — confirm at the usage site.
var lazyImageAttrs = map[string]string{
	"data-srcset": "srcset",
}
40 |
// CanBeNested reports whether tagName is exactly one of "ul", "ol", "li",
// "blockquote" or "pre". The comparison is case-sensitive.
func CanBeNested(tagName string) bool {
	return tagName == "ul" ||
		tagName == "ol" ||
		tagName == "li" ||
		tagName == "blockquote" ||
		tagName == "pre"
}
50 |
// All inline elements except for impossible tags: br, object, and script.
// Please refer to DomConverter.visitElement() for skipped tags.
// Reference: https://developer.mozilla.org/en-US/docs/HTML/Inline_elements
//
// NOTE(review): the map literal below is empty, so no tag name is registered
// here; presumably the entries are added elsewhere or were dropped — confirm.
var inlineTagNames = map[string]struct{}{}
55 |
--------------------------------------------------------------------------------
/internal/filter/docfilter/scorer/image-ratio.go:
--------------------------------------------------------------------------------
1 | // ORIGINAL: java/webdocument/filters/images/DimensionsRatioScorer.java
2 |
3 | // Copyright (c) 2020 Markus Mobius
4 | //
5 | // Permission is hereby granted, free of charge, to any person obtaining a copy
6 | // of this software and associated documentation files (the "Software"), to deal
7 | // in the Software without restriction, including without limitation the rights
8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | // copies of the Software, and to permit persons to whom the Software is
10 | // furnished to do so, subject to the following conditions:
11 | //
12 | // The above copyright notice and this permission notice shall be included in all
13 | // copies or substantial portions of the Software.
14 | //
15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | // SOFTWARE.
22 |
23 | // Copyright 2014 The Chromium Authors. All rights reserved.
24 | // Use of this source code is governed by a BSD-style license that can be
25 | // found in the LICENSE file.
26 |
27 | package scorer
28 |
29 | import "golang.org/x/net/html"
30 |
31 | // ImageRatioScorer uses image ratio (length/width) as its heuristic.
32 | // Unfortunately to do that we need to compute CSS which is impossible
33 | // in Go, so this scorer do nothing. NEED-COMPUTE-CSS.
34 | type ImageRatioScorer struct {
35 | maxScore int
36 | }
37 |
38 | // NewImageRatioScorer returns and initiates the ImageRatioScorer.
39 | func NewImageRatioScorer(maxScore int) *ImageRatioScorer {
40 | return &ImageRatioScorer{
41 | maxScore: maxScore,
42 | }
43 | }
44 |
45 | func (s *ImageRatioScorer) GetImageScore(_ *html.Node) int {
46 | return 0
47 | }
48 |
49 | func (s *ImageRatioScorer) GetMaxScore() int {
50 | return s.maxScore
51 | }
52 |
--------------------------------------------------------------------------------
/internal/testutil/text-block-builder.go:
--------------------------------------------------------------------------------
1 | // ORIGINAL: javatest/TestTextBlockBuilder.java
2 |
3 | // Copyright (c) 2020 Markus Mobius
4 | //
5 | // Permission is hereby granted, free of charge, to any person obtaining a copy
6 | // of this software and associated documentation files (the "Software"), to deal
7 | // in the Software without restriction, including without limitation the rights
8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | // copies of the Software, and to permit persons to whom the Software is
10 | // furnished to do so, subject to the following conditions:
11 | //
12 | // The above copyright notice and this permission notice shall be included in all
13 | // copies or substantial portions of the Software.
14 | //
15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | // SOFTWARE.
22 |
23 | // Copyright 2014 The Chromium Authors. All rights reserved.
24 | // Use of this source code is governed by a BSD-style license that can be
25 | // found in the LICENSE file.
26 |
27 | package testutil
28 |
29 | import (
30 | "github.com/markusmobius/go-domdistiller/internal/stringutil"
31 | "github.com/markusmobius/go-domdistiller/internal/webdoc"
32 | )
33 |
34 | type TextBlockBuilder struct {
35 | textBuilder *TextBuilder
36 | }
37 |
38 | func NewTextBlockBuilder(wc stringutil.WordCounter) *TextBlockBuilder {
39 | return &TextBlockBuilder{
40 | textBuilder: NewTextBuilder(wc),
41 | }
42 | }
43 |
44 | func (tbb *TextBlockBuilder) CreateForText(text string) *webdoc.TextBlock {
45 | wt := tbb.textBuilder.CreateForText(text)
46 | return webdoc.NewTextBlock(wt)
47 | }
48 |
49 | func (tbb *TextBlockBuilder) CreateForAnchorText(text string) *webdoc.TextBlock {
50 | wt := tbb.textBuilder.CreateForAnchorText(text)
51 | return webdoc.NewTextBlock(wt)
52 | }
53 |
--------------------------------------------------------------------------------
/internal/webdoc/figure.go:
--------------------------------------------------------------------------------
1 | // ORIGINAL: java/webdocument/WebImage.java
2 |
3 | // Copyright (c) 2020 Markus Mobius
4 | //
5 | // Permission is hereby granted, free of charge, to any person obtaining a copy
6 | // of this software and associated documentation files (the "Software"), to deal
7 | // in the Software without restriction, including without limitation the rights
8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | // copies of the Software, and to permit persons to whom the Software is
10 | // furnished to do so, subject to the following conditions:
11 | //
12 | // The above copyright notice and this permission notice shall be included in all
13 | // copies or substantial portions of the Software.
14 | //
15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | // SOFTWARE.
22 |
23 | // Copyright 2015 The Chromium Authors. All rights reserved.
24 | // Use of this source code is governed by a BSD-style license that can be
25 | // found in the LICENSE file.
26 |
27 | package webdoc
28 |
29 | import (
30 | "github.com/go-shiori/dom"
31 | "github.com/markusmobius/go-domdistiller/internal/domutil"
32 | "golang.org/x/net/html"
33 | )
34 |
35 | type Figure struct {
36 | Image
37 | Caption *html.Node
38 | }
39 |
40 | func (f *Figure) ElementType() string {
41 | return "figure"
42 | }
43 |
44 | func (f *Figure) GenerateOutput(textOnly bool) string {
45 | figCaption := domutil.CloneAndProcessTree(f.Caption, f.PageURL)
46 | if textOnly {
47 | return domutil.InnerText(figCaption)
48 | }
49 |
50 | figure := dom.CreateElement("figure")
51 | dom.AppendChild(figure, f.getProcessedNode())
52 | if dom.InnerHTML(f.Caption) != "" {
53 | dom.AppendChild(figure, figCaption)
54 | }
55 |
56 | domutil.StripAttributes(figure)
57 | return dom.OuterHTML(figure)
58 | }
59 |
--------------------------------------------------------------------------------
/internal/webdoc/tag.go:
--------------------------------------------------------------------------------
1 | // ORIGINAL: java/webdocument/WebTag.java
2 |
3 | // Copyright (c) 2020 Markus Mobius
4 | //
5 | // Permission is hereby granted, free of charge, to any person obtaining a copy
6 | // of this software and associated documentation files (the "Software"), to deal
7 | // in the Software without restriction, including without limitation the rights
8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | // copies of the Software, and to permit persons to whom the Software is
10 | // furnished to do so, subject to the following conditions:
11 | //
12 | // The above copyright notice and this permission notice shall be included in all
13 | // copies or substantial portions of the Software.
14 | //
15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | // SOFTWARE.
22 |
23 | // Copyright 2015 The Chromium Authors. All rights reserved.
24 | // Use of this source code is governed by a BSD-style license that can be
25 | // found in the LICENSE file.
26 |
27 | package webdoc
28 |
29 | import "fmt"
30 |
31 | // Tag represents HTML tags that need to be preserved over.
32 | type Tag struct {
33 | BaseElement
34 | Name string
35 | Type TagType
36 | }
37 |
38 | func NewTag(name string, tagType TagType) *Tag {
39 | return &Tag{Name: name, Type: tagType}
40 | }
41 |
42 | func (t *Tag) ElementType() string {
43 | return "tag"
44 | }
45 |
46 | func (t *Tag) GenerateOutput(textOnly bool) string {
47 | if textOnly {
48 | return ""
49 | }
50 |
51 | if t.Type == TagStart {
52 | return "<" + t.Name + ">"
53 | }
54 | return "" + t.Name + ">"
55 | }
56 |
57 | func (t *Tag) String() string {
58 | tp := "tag_start"
59 | if t.Type == TagEnd {
60 | tp = "tag_end"
61 | }
62 |
63 | return fmt.Sprintf("ELEMENT %q: name=%q, type=%s, is_content=%v",
64 | t.ElementType(), t.Name, tp, t.isContent)
65 | }
66 |
--------------------------------------------------------------------------------
/internal/filter/docfilter/scorer/image-area.go:
--------------------------------------------------------------------------------
1 | // ORIGINAL: java/webdocument/filters/images/AreaScorer.java
2 |
3 | // Copyright (c) 2020 Markus Mobius
4 | //
5 | // Permission is hereby granted, free of charge, to any person obtaining a copy
6 | // of this software and associated documentation files (the "Software"), to deal
7 | // in the Software without restriction, including without limitation the rights
8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | // copies of the Software, and to permit persons to whom the Software is
10 | // furnished to do so, subject to the following conditions:
11 | //
12 | // The above copyright notice and this permission notice shall be included in all
13 | // copies or substantial portions of the Software.
14 | //
15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | // SOFTWARE.
22 |
23 | // Copyright 2014 The Chromium Authors. All rights reserved.
24 | // Use of this source code is governed by a BSD-style license that can be
25 | // found in the LICENSE file.
26 |
27 | package scorer
28 |
29 | import "golang.org/x/net/html"
30 |
31 | // ImageAreaScorer uses image area (length*width) as its heuristic.
32 | // Unfortunately to do that we need to compute CSS which is impossible
33 | // in Go, so this scorer do nothing. NEED-COMPUTE-CSS.
34 | type ImageAreaScorer struct {
35 | maxScore int
36 | minArea int
37 | maxArea int
38 | }
39 |
40 | // NewImageAreaScorer returns and initiates the ImageAreaScorer.
41 | func NewImageAreaScorer(maxScore, minArea, maxArea int) *ImageAreaScorer {
42 | return &ImageAreaScorer{
43 | maxScore: maxScore,
44 | minArea: minArea,
45 | maxArea: maxArea,
46 | }
47 | }
48 |
49 | func (s *ImageAreaScorer) GetImageScore(_ *html.Node) int {
50 | return 0
51 | }
52 |
53 | func (s *ImageAreaScorer) GetMaxScore() int {
54 | return s.maxScore
55 | }
56 |
--------------------------------------------------------------------------------
/internal/testutil/page-param-content-info.go:
--------------------------------------------------------------------------------
1 | // ORIGINAL: javatest/PageParamContentInfo.java
2 |
3 | // Copyright (c) 2020 Markus Mobius
4 | //
5 | // Permission is hereby granted, free of charge, to any person obtaining a copy
6 | // of this software and associated documentation files (the "Software"), to deal
7 | // in the Software without restriction, including without limitation the rights
8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | // copies of the Software, and to permit persons to whom the Software is
10 | // furnished to do so, subject to the following conditions:
11 | //
12 | // The above copyright notice and this permission notice shall be included in all
13 | // copies or substantial portions of the Software.
14 | //
15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | // SOFTWARE.
22 |
23 | // Copyright 2015 The Chromium Authors. All rights reserved.
24 | // Use of this source code is governed by a BSD-style license that can be
25 | // found in the LICENSE file.
26 |
27 | package testutil
28 |
// PageParamContentType identifies the kind of content described by a
// PageParamContentInfo.
type PageParamContentType uint

// Available content types, numbered from zero in declaration order.
const (
	UnrelatedTerms PageParamContentType = iota
	NumberInPlainText
	NumericOutlink
)

// PageParamContentInfo describes one piece of content: its type, plus the
// target URL and number when the type uses them.
type PageParamContentInfo struct {
	Type      PageParamContentType
	TargetURL string
	Number    int
}

// PPCIUnrelatedTerms returns an info of type UnrelatedTerms with no URL and
// a zero number.
func PPCIUnrelatedTerms() *PageParamContentInfo {
	info := new(PageParamContentInfo)
	info.Type = UnrelatedTerms
	return info
}

// PPCINumberInPlainText returns an info of type NumberInPlainText carrying
// the given number.
func PPCINumberInPlainText(number int) *PageParamContentInfo {
	info := new(PageParamContentInfo)
	info.Type = NumberInPlainText
	info.Number = number
	return info
}

// PPCINumericOutlink returns an info of type NumericOutlink carrying the
// given target URL and number.
func PPCINumericOutlink(targetURL string, number int) *PageParamContentInfo {
	info := new(PageParamContentInfo)
	info.Type = NumericOutlink
	info.TargetURL = targetURL
	info.Number = number
	return info
}
61 |
--------------------------------------------------------------------------------
/internal/markup/schemaorg/thing-item-person.go:
--------------------------------------------------------------------------------
1 | // ORIGINAL: java/SchemaOrgParser.java
2 |
3 | // Copyright (c) 2020 Markus Mobius
4 | //
5 | // Permission is hereby granted, free of charge, to any person obtaining a copy
6 | // of this software and associated documentation files (the "Software"), to deal
7 | // in the Software without restriction, including without limitation the rights
8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | // copies of the Software, and to permit persons to whom the Software is
10 | // furnished to do so, subject to the following conditions:
11 | //
12 | // The above copyright notice and this permission notice shall be included in all
13 | // copies or substantial portions of the Software.
14 | //
15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | // SOFTWARE.
22 |
23 | // Copyright 2014 The Chromium Authors. All rights reserved.
24 | // Use of this source code is governed by a BSD-style license that can be
25 | // found in the LICENSE file.
26 |
27 | package schemaorg
28 |
29 | import (
30 | "golang.org/x/net/html"
31 | )
32 |
33 | type PersonItem struct {
34 | BaseThingItem
35 | }
36 |
37 | func NewPersonItem(element *html.Node) *PersonItem {
38 | item := &PersonItem{}
39 | item.init(Person, element)
40 | item.addStringPropertyName(FamilyNameProp)
41 | item.addStringPropertyName(GivenNameProp)
42 | return item
43 | }
44 |
45 | func (pi *PersonItem) getName() string {
46 | // Returns either the value of NameProp, or concatenated values
47 | // of GivenNameProp and FamilyNameProp delimited by a whitespace.
48 | if name := pi.getStringProperty(NameProp); name != "" {
49 | return name
50 | }
51 |
52 | givenName := pi.getStringProperty(GivenNameProp)
53 | familyName := pi.getStringProperty(FamilyNameProp)
54 | if givenName != "" && familyName != "" {
55 | givenName += " "
56 | }
57 |
58 | return givenName + familyName
59 | }
60 |
--------------------------------------------------------------------------------
/internal/pagination/pattern/page-pattern.go:
--------------------------------------------------------------------------------
1 | // ORIGINAL: java/PageParameterDetector.java
2 |
3 | // Copyright (c) 2020 Markus Mobius
4 | //
5 | // Permission is hereby granted, free of charge, to any person obtaining a copy
6 | // of this software and associated documentation files (the "Software"), to deal
7 | // in the Software without restriction, including without limitation the rights
8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | // copies of the Software, and to permit persons to whom the Software is
10 | // furnished to do so, subject to the following conditions:
11 | //
12 | // The above copyright notice and this permission notice shall be included in all
13 | // copies or substantial portions of the Software.
14 | //
15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | // SOFTWARE.
22 |
23 | // Copyright 2015 The Chromium Authors. All rights reserved.
24 | // Use of this source code is governed by a BSD-style license that can be
25 | // found in the LICENSE file.
26 |
27 | package pattern
28 |
29 | import (
30 | nurl "net/url"
31 | )
32 |
// PagePattern is the interface that page pattern handlers must implement to detect
// the page parameter from potential pagination URLs.
type PagePattern interface {
	// String returns the string form of the URL page pattern.
	String() string

	// PageNumber returns the page number that was extracted from the URL when the
	// object implementing this interface was created.
	PageNumber() int

	// IsValidFor validates this page pattern against the current document URL
	// through a pipeline of rules. It returns true if the page pattern is valid.
	// docURL is the current document URL.
	IsValidFor(docURL *nurl.URL) bool

	// IsPagingURL returns true if a URL matches this page pattern based on a
	// pipeline of rules. url is the URL to evaluate.
	IsPagingURL(url string) bool
}
52 |
--------------------------------------------------------------------------------
/internal/webdoc/tag_test.go:
--------------------------------------------------------------------------------
1 | // ORIGINAL: javatest/webdocument/WebTagTest.java
2 |
3 | // Copyright (c) 2020 Markus Mobius
4 | //
5 | // Permission is hereby granted, free of charge, to any person obtaining a copy
6 | // of this software and associated documentation files (the "Software"), to deal
7 | // in the Software without restriction, including without limitation the rights
8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | // copies of the Software, and to permit persons to whom the Software is
10 | // furnished to do so, subject to the following conditions:
11 | //
12 | // The above copyright notice and this permission notice shall be included in all
13 | // copies or substantial portions of the Software.
14 | //
15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | // SOFTWARE.
22 |
23 | // Copyright 2015 The Chromium Authors. All rights reserved.
24 | // Use of this source code is governed by a BSD-style license that can be
25 | // found in the LICENSE file.
26 |
27 | package webdoc_test
28 |
29 | import (
30 | "testing"
31 |
32 | "github.com/markusmobius/go-domdistiller/internal/webdoc"
33 | "github.com/stretchr/testify/assert"
34 | )
35 |
36 | func Test_WebDoc_Tag_OLGenerateOutput(t *testing.T) {
37 | olStartTag := webdoc.Tag{Name: "ol", Type: webdoc.TagStart}
38 | olEndTag := webdoc.Tag{Name: "ol", Type: webdoc.TagEnd}
39 | startResult := olStartTag.GenerateOutput(false)
40 | endResult := olEndTag.GenerateOutput(false)
41 | assert.Equal(t, "", startResult)
42 | assert.Equal(t, "", endResult)
43 | }
44 |
45 | func Test_WebDoc_Tag_GenerateOutput(t *testing.T) {
46 | startTag := webdoc.Tag{Name: "anytext", Type: webdoc.TagStart}
47 | endTag := webdoc.Tag{Name: "anytext", Type: webdoc.TagEnd}
48 | startResult := startTag.GenerateOutput(false)
49 | endResult := endTag.GenerateOutput(false)
50 | assert.Equal(t, "", startResult)
51 | assert.Equal(t, "", endResult)
52 | }
53 |
--------------------------------------------------------------------------------
/internal/domutil/walker.go:
--------------------------------------------------------------------------------
1 | // ORIGINAL: java/DomWalker.java
2 |
3 | // Copyright (c) 2020 Markus Mobius
4 | //
5 | // Permission is hereby granted, free of charge, to any person obtaining a copy
6 | // of this software and associated documentation files (the "Software"), to deal
7 | // in the Software without restriction, including without limitation the rights
8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | // copies of the Software, and to permit persons to whom the Software is
10 | // furnished to do so, subject to the following conditions:
11 | //
12 | // The above copyright notice and this permission notice shall be included in all
13 | // copies or substantial portions of the Software.
14 | //
15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | // SOFTWARE.
22 |
23 | // Copyright 2014 The Chromium Authors. All rights reserved.
24 | // Use of this source code is governed by a BSD-style license that can be
25 | // found in the LICENSE file.
26 |
27 | package domutil
28 |
29 | import (
30 | "golang.org/x/net/html"
31 | )
32 |
33 | // WalkNodes used to walk the subtree of the DOM rooted at a particular root. It has two
34 | // function parameters, i.e. fnVisit and fnExit :
35 | // - fnVisit is called when we reach a node during the walk. If it returns false, children
36 | // of the node will be skipped and fnExit won't be called for this node.
37 | // - fnExit is called when exiting a node, after visiting all of its children.
38 | func WalkNodes(root *html.Node, fnVisit func(*html.Node) bool, fnExit func(*html.Node)) {
39 | if root == nil {
40 | return
41 | }
42 |
43 | visitChildren := false
44 | if fnVisit != nil {
45 | visitChildren = fnVisit(root)
46 | }
47 |
48 | if !visitChildren {
49 | return
50 | }
51 |
52 | for child := root.FirstChild; child != nil; child = child.NextSibling {
53 | WalkNodes(child, fnVisit, fnExit)
54 | }
55 |
56 | if fnExit != nil {
57 | fnExit(root)
58 | }
59 | }
60 |
--------------------------------------------------------------------------------
/internal/testutil/html_test.go:
--------------------------------------------------------------------------------
1 | // ORIGINAL: javatest/TestUtilTest.java
2 |
3 | // Copyright (c) 2020 Markus Mobius
4 | //
5 | // Permission is hereby granted, free of charge, to any person obtaining a copy
6 | // of this software and associated documentation files (the "Software"), to deal
7 | // in the Software without restriction, including without limitation the rights
8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | // copies of the Software, and to permit persons to whom the Software is
10 | // furnished to do so, subject to the following conditions:
11 | //
12 | // The above copyright notice and this permission notice shall be included in all
13 | // copies or substantial portions of the Software.
14 | //
15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | // SOFTWARE.
22 |
23 | // Copyright 2014 The Chromium Authors. All rights reserved.
24 | // Use of this source code is governed by a BSD-style license that can be
25 | // found in the LICENSE file.
26 |
27 | package testutil_test
28 |
29 | import (
30 | "regexp"
31 | "testing"
32 |
33 | "github.com/go-shiori/dom"
34 | "github.com/markusmobius/go-domdistiller/internal/testutil"
35 | "github.com/stretchr/testify/assert"
36 | )
37 |
var (
	// rxCleanWhitespaces matches a run of whitespace at the start of a line
	// (multi-line mode). Because \s also matches newlines, one match can
	// span several blank lines.
	rxCleanWhitespaces = regexp.MustCompile(`(?mi)^\s+`)
	// rxNewlines matches every newline character.
	rxNewlines = regexp.MustCompile(`(?i)\n`)
)
42 |
// Test_TestUtil_CreateDivTree compares the outer HTML of the first node
// returned by testutil.CreateDivTree with an expected HTML literal, after
// stripping leading whitespace and newlines from that literal.
//
// NOTE(review): in this copy the expected literal contains only blank lines,
// so after the two replacements expectedHTML is the empty string. Presumably
// the original markup was lost; restore it from the real output of
// CreateDivTree before relying on this test.
func Test_TestUtil_CreateDivTree(t *testing.T) {
	expectedHTML := `






















`

	// Collapse the literal into a single line without indentation so it can
	// be compared with the serialized DOM.
	expectedHTML = rxCleanWhitespaces.ReplaceAllString(expectedHTML, "")
	expectedHTML = rxNewlines.ReplaceAllString(expectedHTML, "")

	divs := testutil.CreateDivTree()
	assert.Equal(t, expectedHTML, dom.OuterHTML(divs[0]))
}
74 |
--------------------------------------------------------------------------------
/internal/filter/docfilter/scorer/image-has-figure_test.go:
--------------------------------------------------------------------------------
1 | // ORIGINAL: javatest/ImageHeuristicsTest.java
2 |
3 | // Copyright (c) 2020 Markus Mobius
4 | //
5 | // Permission is hereby granted, free of charge, to any person obtaining a copy
6 | // of this software and associated documentation files (the "Software"), to deal
7 | // in the Software without restriction, including without limitation the rights
8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | // copies of the Software, and to permit persons to whom the Software is
10 | // furnished to do so, subject to the following conditions:
11 | //
12 | // The above copyright notice and this permission notice shall be included in all
13 | // copies or substantial portions of the Software.
14 | //
15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | // SOFTWARE.
22 |
23 | // Copyright 2014 The Chromium Authors. All rights reserved.
24 | // Use of this source code is governed by a BSD-style license that can be
25 | // found in the LICENSE file.
26 |
27 | package scorer_test
28 |
29 | import (
30 | "testing"
31 |
32 | "github.com/go-shiori/dom"
33 | "github.com/markusmobius/go-domdistiller/internal/filter/docfilter/scorer"
34 | "github.com/markusmobius/go-domdistiller/internal/testutil"
35 | "github.com/stretchr/testify/assert"
36 | )
37 |
38 | func Test_Filter_DocFilter_Scorer_ImageHasFigureScorer(t *testing.T) {
39 | root := testutil.CreateDiv(0)
40 | fig := dom.CreateElement("figure")
41 |
42 | goodImage := dom.CreateElement("img")
43 | dom.SetAttribute(goodImage, "style", "width: 100px; height: 100px; display: block;")
44 |
45 | badImage := dom.CreateElement("img")
46 | dom.SetAttribute(badImage, "style", "width: 100px; height: 100px; display: block;")
47 |
48 | dom.AppendChild(fig, goodImage)
49 | dom.AppendChild(root, fig)
50 | dom.AppendChild(root, badImage)
51 |
52 | imgScorer := scorer.NewImageHasFigureScorer(50)
53 |
54 | assert.True(t, imgScorer.GetImageScore(goodImage) > 0)
55 | assert.Equal(t, 0, imgScorer.GetImageScore(badImage))
56 | assert.Equal(t, 0, imgScorer.GetImageScore(nil))
57 | }
58 |
--------------------------------------------------------------------------------
/internal/markup/opengraph/prefixes.go:
--------------------------------------------------------------------------------
1 | // ORIGINAL: java/OpenGraphProtocolParser.java
2 |
3 | // Copyright (c) 2020 Markus Mobius
4 | //
5 | // Permission is hereby granted, free of charge, to any person obtaining a copy
6 | // of this software and associated documentation files (the "Software"), to deal
7 | // in the Software without restriction, including without limitation the rights
8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | // copies of the Software, and to permit persons to whom the Software is
10 | // furnished to do so, subject to the following conditions:
11 | //
12 | // The above copyright notice and this permission notice shall be included in all
13 | // copies or substantial portions of the Software.
14 | //
15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | // SOFTWARE.
22 |
23 | // Copyright 2014 The Chromium Authors. All rights reserved.
24 | // Use of this source code is governed by a BSD-style license that can be
25 | // found in the LICENSE file.
26 |
27 | package opengraph
28 |
29 | import "strings"
30 |
31 | type Prefix uint
32 |
33 | const (
34 | OG Prefix = iota
35 | Profile
36 | Article
37 | )
38 |
39 | type PrefixNameList map[Prefix]string
40 |
41 | func (prefixes PrefixNameList) addObjectType(prefix, objType string) {
42 | if objType == "" {
43 | prefixes[OG] = prefix
44 | return
45 | }
46 |
47 | objType = strings.TrimPrefix(objType, "/")
48 | if objType == ProfileObjtype {
49 | prefixes[Profile] = prefix
50 | return
51 | }
52 |
53 | if objType == ArticleObjtype {
54 | prefixes[Article] = prefix
55 | }
56 | }
57 |
58 | func (prefixes PrefixNameList) setDefault() {
59 | // For any unspecified prefix, use common ones:
60 | // - "og": http://ogp.me/ns#
61 | // - "profile": http://ogp.me/ns/profile#
62 | // - "article": http://ogp.me/ns/article#.
63 | if _, exist := prefixes[OG]; !exist {
64 | prefixes[OG] = "og"
65 | }
66 |
67 | if _, exist := prefixes[Profile]; !exist {
68 | prefixes[Profile] = ProfileObjtype
69 | }
70 |
71 | if _, exist := prefixes[Article]; !exist {
72 | prefixes[Article] = ArticleObjtype
73 | }
74 | }
75 |
--------------------------------------------------------------------------------
/internal/filter/docfilter/scorer/image-has-figure.go:
--------------------------------------------------------------------------------
1 | // ORIGINAL: java/webdocument/filters/images/HasFigureScorer.java
2 |
3 | // Copyright (c) 2020 Markus Mobius
4 | //
5 | // Permission is hereby granted, free of charge, to any person obtaining a copy
6 | // of this software and associated documentation files (the "Software"), to deal
7 | // in the Software without restriction, including without limitation the rights
8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | // copies of the Software, and to permit persons to whom the Software is
10 | // furnished to do so, subject to the following conditions:
11 | //
12 | // The above copyright notice and this permission notice shall be included in all
13 | // copies or substantial portions of the Software.
14 | //
15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | // SOFTWARE.
22 |
23 | // Copyright 2014 The Chromium Authors. All rights reserved.
24 | // Use of this source code is governed by a BSD-style license that can be
25 | // found in the LICENSE file.
26 |
27 | package scorer
28 |
29 | import (
30 | "github.com/markusmobius/go-domdistiller/internal/domutil"
31 | "golang.org/x/net/html"
32 | )
33 |
34 | // ImageHasFigureScorer scores based on if the image has a "figure" node as an ancestor.
35 | type ImageHasFigureScorer struct {
36 | maxScore int
37 | }
38 |
39 | // NewImageHasFigureScorer returns and initiates the ImageHasFigureScorer.
40 | func NewImageHasFigureScorer(maxScore int) *ImageHasFigureScorer {
41 | return &ImageHasFigureScorer{
42 | maxScore: maxScore,
43 | }
44 | }
45 |
46 | func (s *ImageHasFigureScorer) GetImageScore(node *html.Node) int {
47 | var score int
48 | if node != nil {
49 | score = s.compute(node)
50 | }
51 |
52 | if score < s.maxScore {
53 | return score
54 | }
55 |
56 | return s.maxScore
57 | }
58 |
59 | func (s *ImageHasFigureScorer) GetMaxScore() int {
60 | return s.maxScore
61 | }
62 |
63 | func (s *ImageHasFigureScorer) compute(node *html.Node) int {
64 | parents := domutil.GetParentNodes(node)
65 | for _, n := range parents {
66 | if n.Type == html.ElementNode && n.Data == "figure" {
67 | return s.maxScore
68 | }
69 | }
70 | return 0
71 | }
72 |
--------------------------------------------------------------------------------
/internal/re2go/domutil.re:
--------------------------------------------------------------------------------
1 | /*!include:re2c "base.re" */
2 |
3 | import "strings"
4 |
// TidyUpPunctuation scans input for the pattern below and rewrites each match
// as the captured punctuation mark, one space, then the captured non-space
// run. The whitespace in front of the mark is dropped. Text outside the
// matches is copied to the result unchanged.
//
// This is a re2go template: the lexer body is generated by re2c from the
// rules inside the loop, using the "base_template" block that is included
// from base.re (not visible in this file).
//
// Original pattern: \s+([.?!,;])\s*(\S*)
func TidyUpPunctuation(input string) string {
	var cursor, marker int
	input += string(rune(0)) // add terminating null
	limit := len(input) - 1  // limit points at the terminating null
	_ = marker

	// Variable for capturing parentheses (twice the number of groups).
	/*!maxnmatch:re2c*/
	yypmatch := make([]int, YYMAXNMATCH*2)
	var yynmatch int
	_ = yynmatch

	// Autogenerated tag variables used by the lexer to track tag values.
	/*!stags:re2c format = 'var @@ int; _ = @@\n'; */

	// start is the offset of the first input byte that has not been written
	// to sb yet. In the punctuation rule below yypmatch[0] and yypmatch[1]
	// bound the whole match, yypmatch[2:4] the punctuation mark and
	// yypmatch[4:6] the trailing non-space run. The single space is written
	// even when that trailing run is empty.
	var start int
	var sb strings.Builder
	for { /*!use:re2c:base_template
		re2c:posix-captures = 1;

		space = [\t\n\f\r ];
		nonSpace = [^\t\n\f\r ];

		quant1 = {space}+;
		punctuation = {space}+([.?!,;]){space}*({nonSpace}*);

		{quant1} { continue }
		{punctuation} {
			before := input[start:yypmatch[0]]
			submatch1 := input[yypmatch[2]:yypmatch[3]]
			submatch2 := input[yypmatch[4]:yypmatch[5]]

			sb.WriteString(before)
			sb.WriteString(submatch1)
			sb.WriteString(" ")
			sb.WriteString(submatch2)

			start = yypmatch[1]
			continue
		}

		$ {
			sb.WriteString(input[start:limit])
			return sb.String()
		}

		* { continue }
		*/
	}
}
56 |
// FixTempNewline replaces every occurrence of the marker `|\/|`, together
// with the whitespace directly around it, with a single "\n". Text outside
// the matches is copied to the result unchanged.
//
// This is a re2go template: the lexer body is generated by re2c from the
// rules inside the loop, using the "base_template" block that is included
// from base.re (not visible in this file).
//
// Original pattern: \s*\|\\/\|\s*
func FixTempNewline(input string) string {
	var cursor, marker int
	input += string(rune(0)) // add terminating null
	limit := len(input) - 1  // limit points at the terminating null
	_ = marker

	// Variable for capturing parentheses (twice the number of groups).
	/*!maxnmatch:re2c*/
	yypmatch := make([]int, YYMAXNMATCH*2)
	var yynmatch int
	_ = yynmatch

	// Autogenerated tag variables used by the lexer to track tag values.
	/*!stags:re2c format = 'var @@ int; _ = @@\n'; */

	// start is the offset of the first input byte that has not been written
	// to sb yet; yypmatch[0] and yypmatch[1] bound the matched marker
	// (including its surrounding whitespace).
	var start int
	var sb strings.Builder
	for { /*!use:re2c:base_template
		re2c:posix-captures = 1;

		space = [\t\n\f\r ];
		quant1 = {space}*[^|];
		tmpNewline = {space}*[|][\\][/][|]{space}*;

		{quant1} { continue }

		{tmpNewline} {
			sb.WriteString(input[start:yypmatch[0]])
			sb.WriteString("\n")
			start = yypmatch[1]
			continue
		}

		$ {
			sb.WriteString(input[start:limit])
			return sb.String()
		}

		* { continue }
		*/
	}
}
--------------------------------------------------------------------------------
/internal/filter/docfilter/nested-element.go:
--------------------------------------------------------------------------------
1 | // ORIGINAL: java/webdocument/filters/NestedElementRetainer.java
2 |
3 | // Copyright (c) 2020 Markus Mobius
4 | //
5 | // Permission is hereby granted, free of charge, to any person obtaining a copy
6 | // of this software and associated documentation files (the "Software"), to deal
7 | // in the Software without restriction, including without limitation the rights
8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | // copies of the Software, and to permit persons to whom the Software is
10 | // furnished to do so, subject to the following conditions:
11 | //
12 | // The above copyright notice and this permission notice shall be included in all
13 | // copies or substantial portions of the Software.
14 | //
15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | // SOFTWARE.
22 |
23 | // Copyright 2015 The Chromium Authors. All rights reserved.
24 | // Use of this source code is governed by a BSD-style license that can be
25 | // found in the LICENSE file.
26 |
27 | package docfilter
28 |
29 | import (
30 | "github.com/markusmobius/go-domdistiller/internal/webdoc"
31 | )
32 |
33 | type NestedElementRetainer struct{}
34 |
35 | func NewNestedElementRetainer() *NestedElementRetainer {
36 | return &NestedElementRetainer{}
37 | }
38 |
39 | func (f *NestedElementRetainer) Process(doc *webdoc.Document) bool {
40 | isContent := false
41 | stackMark := -1
42 | stack := []*webdoc.Tag{}
43 |
44 | for _, e := range doc.Elements {
45 | if webTag, isTag := e.(*webdoc.Tag); !isTag {
46 | if !isContent {
47 | isContent = e.IsContent()
48 | }
49 | } else {
50 | if webTag.Type == webdoc.TagStart {
51 | webTag.SetIsContent(isContent)
52 | stack = append(stack, webTag)
53 | isContent = false
54 | } else {
55 | startWebTag := stack[len(stack)-1]
56 | stack = stack[:len(stack)-1]
57 |
58 | isContent = isContent || stackMark >= len(stack)
59 | if isContent {
60 | stackMark = len(stack) - 1
61 | }
62 |
63 | wasContent := startWebTag.IsContent()
64 | startWebTag.SetIsContent(isContent)
65 | webTag.SetIsContent(isContent)
66 | isContent = wasContent
67 | }
68 | }
69 | }
70 |
71 | return true
72 | }
73 |
--------------------------------------------------------------------------------
/internal/tableclass/constant.go:
--------------------------------------------------------------------------------
1 | // ORIGINAL: java/TableClassifier.java
2 |
3 | // Copyright (c) 2020 Markus Mobius
4 | //
5 | // Permission is hereby granted, free of charge, to any person obtaining a copy
6 | // of this software and associated documentation files (the "Software"), to deal
7 | // in the Software without restriction, including without limitation the rights
8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | // copies of the Software, and to permit persons to whom the Software is
10 | // furnished to do so, subject to the following conditions:
11 | //
12 | // The above copyright notice and this permission notice shall be included in all
13 | // copies or substantial portions of the Software.
14 | //
15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | // SOFTWARE.
22 |
23 | // Copyright 2015 The Chromium Authors. All rights reserved.
24 | // Use of this source code is governed by a BSD-style license that can be
25 | // found in the LICENSE file.
26 |
27 | package tableclass
28 |
// headerTags is the set of header-related table tag names.
//
// Every member maps to true so that both a comma-ok lookup and a plain
// headerTags[name] lookup report membership; with false values a plain lookup
// could not tell a member from an absent key.
var headerTags = map[string]bool{
	"colgroup": true,
	"col":      true,
	"th":       true,
}

// objectTags is the set of tag names for embedded objects.
//
// As with headerTags, every member maps to true so that a plain
// objectTags[name] lookup reports membership.
var objectTags = map[string]bool{
	"embed":  true,
	"object": true,
	"applet": true,
	"iframe": true,
}
41 |
// ariaTableRoles is the set of ARIA roles for a table itself, see
// http://www.w3.org/TR/wai-aria/roles#widget_roles_header.
var ariaTableRoles = map[string]struct{}{
	"grid":     {},
	"treegrid": {},
}

// ariaTableDescendantRoles is the set of ARIA roles for descendants of a
// table, see:
// - http://www.w3.org/TR/wai-aria/roles#widget_roles_header.
// - http://www.w3.org/TR/wai-aria/roles#document_structure_roles_header.
var ariaTableDescendantRoles = map[string]struct{}{
	"gridcell":     {},
	"columnheader": {},
	"row":          {},
	"rowgroup":     {},
	"rowheader":    {},
}

// ariaRoles is the set of ARIA landmark roles, applicable to both a table and
// its descendants, see:
// - http://www.w3.org/TR/wai-aria/roles#landmark_roles_header.
var ariaRoles = map[string]struct{}{
	"application":   {},
	"banner":        {},
	"complementary": {},
	"contentinfo":   {},
	"form":          {},
	"main":          {},
	"navigation":    {},
	"search":        {},
}
71 |
--------------------------------------------------------------------------------
/internal/filter/filter.go:
--------------------------------------------------------------------------------
1 | // ORIGINAL: java/BoilerpipeFilter.java
2 |
3 | // Copyright (c) 2020 Markus Mobius
4 | //
5 | // Permission is hereby granted, free of charge, to any person obtaining a copy
6 | // of this software and associated documentation files (the "Software"), to deal
7 | // in the Software without restriction, including without limitation the rights
8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | // copies of the Software, and to permit persons to whom the Software is
10 | // furnished to do so, subject to the following conditions:
11 | //
12 | // The above copyright notice and this permission notice shall be included in all
13 | // copies or substantial portions of the Software.
14 | //
15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | // SOFTWARE.
22 |
23 | // Copyright 2015 The Chromium Authors. All rights reserved.
24 | // Use of this source code is governed by a BSD-style license that can be
25 | // found in the LICENSE file.
26 |
27 | // boilerpipe
28 | //
29 | // Copyright (c) 2009 Christian Kohlschütter
30 | //
31 | // The author licenses this file to You under the Apache License, Version 2.0
32 | // (the "License"); you may not use this file except in compliance with
33 | // the License. You may obtain a copy of the License at
34 | //
35 | // http://www.apache.org/licenses/LICENSE-2.0
36 | //
37 | // Unless required by applicable law or agreed to in writing, software
38 | // distributed under the License is distributed on an "AS IS" BASIS,
39 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
40 | // See the License for the specific language governing permissions and
41 | // limitations under the License.
42 |
43 | package filter
44 |
45 | import "github.com/markusmobius/go-domdistiller/internal/webdoc"
46 |
// TextDocumentFilter is the interface for filters that process a TextDocument.
type TextDocumentFilter interface {
	// Process processes the given document.
	// Returns true if changes have been made to the document.
	Process(doc *webdoc.TextDocument) bool
}

// DocumentFilter is the interface for filters that process a Document.
type DocumentFilter interface {
	// Process processes the given document.
	// NOTE(review): the meaning of the returned bool is not stated here;
	// presumably it mirrors TextDocumentFilter.Process - confirm.
	Process(doc *webdoc.Document) bool
}
59 |
--------------------------------------------------------------------------------
/internal/filter/docfilter/scorer/image-dom-distance_test.go:
--------------------------------------------------------------------------------
1 | // ORIGINAL: javatest/ImageHeuristicsTest.java
2 |
3 | // Copyright (c) 2020 Markus Mobius
4 | //
5 | // Permission is hereby granted, free of charge, to any person obtaining a copy
6 | // of this software and associated documentation files (the "Software"), to deal
7 | // in the Software without restriction, including without limitation the rights
8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | // copies of the Software, and to permit persons to whom the Software is
10 | // furnished to do so, subject to the following conditions:
11 | //
12 | // The above copyright notice and this permission notice shall be included in all
13 | // copies or substantial portions of the Software.
14 | //
15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | // SOFTWARE.
22 |
23 | // Copyright 2014 The Chromium Authors. All rights reserved.
24 | // Use of this source code is governed by a BSD-style license that can be
25 | // found in the LICENSE file.
26 |
27 | package scorer_test
28 |
29 | import (
30 | "testing"
31 |
32 | "github.com/go-shiori/dom"
33 | "github.com/markusmobius/go-domdistiller/internal/filter/docfilter/scorer"
34 | "github.com/markusmobius/go-domdistiller/internal/testutil"
35 | "github.com/stretchr/testify/assert"
36 | )
37 |
38 | func Test_Filter_DocFilter_Scorer_ImageDomDistanceScorer(t *testing.T) {
39 | root := testutil.CreateDiv(0)
40 | content := testutil.CreateDiv(1)
41 | image := dom.CreateElement("img")
42 | dom.SetAttribute(image, "style", "width: 100px; height: 100px; display: block;")
43 |
44 | dom.AppendChild(content, image)
45 | dom.AppendChild(root, content)
46 |
47 | // Build long chain of divs to separate image from content.
48 | currentDiv := testutil.CreateDiv(3)
49 | dom.AppendChild(root, currentDiv)
50 | for i := 0; i < 7; i++ {
51 | child := testutil.CreateDiv(i + 4)
52 | dom.AppendChild(currentDiv, child)
53 | currentDiv = child
54 | }
55 |
56 | normalScorer := scorer.NewImageDomDistanceScorer(50, content)
57 | farContentScorer := scorer.NewImageDomDistanceScorer(50, currentDiv)
58 |
59 | assert.True(t, normalScorer.GetImageScore(image) > 0)
60 | assert.Equal(t, 0, farContentScorer.GetImageScore(image))
61 | assert.Equal(t, 0, normalScorer.GetImageScore(nil))
62 | }
63 |
--------------------------------------------------------------------------------
/internal/pagination/constant.go:
--------------------------------------------------------------------------------
1 | // ORIGINAL: java/PageParameterParser.java
2 |
3 | // Copyright (c) 2020 Markus Mobius
4 | //
5 | // Permission is hereby granted, free of charge, to any person obtaining a copy
6 | // of this software and associated documentation files (the "Software"), to deal
7 | // in the Software without restriction, including without limitation the rights
8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | // copies of the Software, and to permit persons to whom the Software is
10 | // furnished to do so, subject to the following conditions:
11 | //
12 | // The above copyright notice and this permission notice shall be included in all
13 | // copies or substantial portions of the Software.
14 | //
15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | // SOFTWARE.
22 |
23 | // Copyright 2015 The Chromium Authors. All rights reserved.
24 | // Use of this source code is governed by a BSD-style license that can be
25 | // found in the LICENSE file.
26 |
27 | package pagination
28 |
29 | import (
30 | "regexp"
31 | "strconv"
32 | "strings"
33 | "unicode"
34 | )
35 |
const (
	// MaxNumForPageParam is the upper bound for a page number: if the numeric
	// value of a link's anchor text is greater than this number, we don't
	// think it represents the page number of the link.
	MaxNumForPageParam = 100
)

var (
	// Regular expressions for the page number finder. The expressions for the
	// prev/next finder are not here: they are compiled with re2go instead,
	// because they are quite slow as regular expressions.

	// rxLinkNumberCleaner matches a single bracket character: ( ) [ ] { }.
	rxLinkNumberCleaner = regexp.MustCompile(`[()\[\]{}]`)
	// rxInvalidParentWrapper matches "body" or "html" anywhere in the input,
	// ignoring case.
	rxInvalidParentWrapper = regexp.MustCompile(`(?i)(body)|(html)`)
	// rxTerms matches a run of non-space characters that contains at least one
	// word character or one character in the listed Unicode ranges.
	rxTerms = regexp.MustCompile(`(?i)(\S*[\w\x{00C0}-\x{1FFF}\x{2C00}-\x{D7FF}]\S*)`)
	// rxSurroundingDigits matches an input that is a single run of digits,
	// optionally surrounded by non-word characters or underscores, and
	// captures the digits.
	rxSurroundingDigits = regexp.MustCompile(`(?i)^[\W_]*(\d+)[\W_]*$`)
)
50 |
// containsNumber reports whether s holds at least one rune that
// unicode.IsDigit classifies as a digit.
func containsNumber(s string) bool {
	found := false
	for _, r := range s {
		if unicode.IsDigit(r) {
			found = true
			break
		}
	}
	return found
}
59 |
// getStartingNumber parses the run of ASCII digits at the very beginning of s.
//
// It returns the parsed value and true on success. It returns 0 and false
// when s does not start with an ASCII digit, or when the digit run does not
// fit in an int.
func getStartingNumber(s string) (int, bool) {
	// Only ASCII digits are accepted. strconv.Atoi rejects every other
	// Unicode digit, so letting one into the prefix would throw away an
	// otherwise valid number (e.g. the "12" in "12\u0663").
	end := strings.IndexFunc(s, func(r rune) bool {
		return r < '0' || r > '9'
	})
	if end < 0 {
		// No non-digit found: the whole string is the digit run.
		end = len(s)
	}
	if end == 0 {
		return 0, false
	}

	n, err := strconv.Atoi(s[:end])
	if err != nil {
		// Out of range for int; don't leak Atoi's clamped value.
		return 0, false
	}
	return n, true
}
77 |
--------------------------------------------------------------------------------
/internal/pagination/pattern/constant.go:
--------------------------------------------------------------------------------
1 | // ORIGINAL: java/PageParameterDetector.java
2 |
3 | // Copyright (c) 2020 Markus Mobius
4 | //
5 | // Permission is hereby granted, free of charge, to any person obtaining a copy
6 | // of this software and associated documentation files (the "Software"), to deal
7 | // in the Software without restriction, including without limitation the rights
8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | // copies of the Software, and to permit persons to whom the Software is
10 | // furnished to do so, subject to the following conditions:
11 | //
12 | // The above copyright notice and this permission notice shall be included in all
13 | // copies or substantial portions of the Software.
14 | //
15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | // SOFTWARE.
22 |
23 | // Copyright 2015 The Chromium Authors. All rights reserved.
24 | // Use of this source code is governed by a BSD-style license that can be
25 | // found in the LICENSE file.
26 |
27 | package pattern
28 |
29 | import "regexp"
30 |
const (
	// PageParamPlaceholder is the placeholder text used for a page parameter.
	// NOTE(review): presumably it marks where the page number goes in a URL
	// pattern - confirm against the callers.
	PageParamPlaceholder = "[*!]"
)

var (
	// rxNumber matches and captures a run of digits.
	rxNumber = regexp.MustCompile(`(?i)(\d+)`)
	// rxEndOrHasSHTML matches at the end of the input, optionally preceded by
	// any one character followed by "htm", "html", "shtm" or "shtml".
	// NOTE(review): the dot is unescaped and the whole group is optional, so
	// this also matches the empty string at the end of any input.
	rxEndOrHasSHTML = regexp.MustCompile(`(?i)(.s?html?)?$`)
	// rxLastPathComponent captures the text between the last two slashes of
	// an input that ends with a slash.
	rxLastPathComponent = regexp.MustCompile(`(?i)([^/]*)/$`)
	// rxTrailingSlashHTML matches a trailing slash, or any one character
	// followed by "htm" or "html" at the end of the input (the dot is
	// unescaped here as well).
	rxTrailingSlashHTML = regexp.MustCompile(`(?i)(?:/|(.html?))$`)
	// rxPageParamSeparator matches one separator character: - _ ; or ,
	rxPageParamSeparator = regexp.MustCompile(`[-_;,]`)
)

// badPageParamNames is a set of parameter names.
// NOTE(review): presumably these are names that must not be treated as a page
// parameter - confirm against the callers.
var badPageParamNames = map[string]struct{}{
	"baixar-gratis":  {},
	"category":       {},
	"content":        {},
	"day":            {},
	"date":           {},
	"definition":     {},
	"etiket":         {},
	"film-seyret":    {},
	"key":            {},
	"keys":           {},
	"keyword":        {},
	"label":          {},
	"news":           {},
	"q":              {},
	"query":          {},
	"rating":         {},
	"s":              {},
	"search":         {},
	"seasons":        {},
	"search_keyword": {},
	"search_query":   {},
	"sortby":         {},
	"subscriptions":  {},
	"tag":            {},
	"tags":           {},
	"video":          {},
	"videos":         {},
	"w":              {},
	"wiki":           {},
}
74 |
--------------------------------------------------------------------------------
/internal/extractor/embed/constant.go:
--------------------------------------------------------------------------------
1 | // ORIGINAL: java/extractors/embeds/*.java
2 |
3 | // Copyright (c) 2020 Markus Mobius
4 | //
5 | // Permission is hereby granted, free of charge, to any person obtaining a copy
6 | // of this software and associated documentation files (the "Software"), to deal
7 | // in the Software without restriction, including without limitation the rights
8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | // copies of the Software, and to permit persons to whom the Software is
10 | // furnished to do so, subject to the following conditions:
11 | //
12 | // The above copyright notice and this permission notice shall be included in all
13 | // copies or substantial portions of the Software.
14 | //
15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | // SOFTWARE.
22 |
23 | // Copyright 2015 The Chromium Authors. All rights reserved.
24 | // Use of this source code is governed by a BSD-style license that can be
25 | // found in the LICENSE file.
26 |
27 | package embed
28 |
29 | import "regexp"
30 |
var (
	// rxB64DataURL matches the beginning of a base64 "data:" URL and captures
	// the token between "data:" and ";base64".
	rxB64DataURL = regexp.MustCompile(`(?i)^data:\s*([^\s;,]+)\s*;\s*base64\s*`)
	// rxSrcsetURL matches one srcset candidate: a URL, an optional width or
	// density descriptor, and the terminating comma or end of input.
	rxSrcsetURL = regexp.MustCompile(`(?i)(\S+)(\s+[\d.]+[xw])?(\s*(?:,|$))`)
	// rxImgExtensions matches a .jpg, .jpeg, .png or .webp extension anywhere
	// in the input.
	rxImgExtensions = regexp.MustCompile(`(?i)\.(jpg|jpeg|png|webp)`)
	// rxLazyImageSrcset matches an image extension that is followed by
	// whitespace and a digit.
	rxLazyImageSrcset = regexp.MustCompile(`(?i)\.(jpg|jpeg|png|webp)\s+\d`)
	// rxLazyImageSrc matches an input that is a single whitespace-free token
	// containing an image extension, ignoring surrounding whitespace.
	rxLazyImageSrc = regexp.MustCompile(`(?i)^\s*\S+\.(jpg|jpeg|png|webp)\S*\s*$`)

	// figureImageSelectors lists CSS selectors for image elements.
	// NOTE(review): presumably tried in this order inside a figure - confirm.
	figureImageSelectors = []string{
		"noscript picture",
		"noscript img",
		"picture",
		"img",
	}

	// lazyImageSrcAttrs lists attribute names that may hold the source URL of
	// a lazy-loaded image.
	lazyImageSrcAttrs = []string{
		"data-src",
		"data-original",
		"datasrc",
		"data-url",
	}

	// lazyImageSrcsetAttrs lists attribute names that may hold the srcset of
	// a lazy-loaded image.
	lazyImageSrcsetAttrs = []string{
		"data-srcset",
		"datasrcset",
	}

	// relevantImageTags is the set of tag names relevant for images.
	relevantImageTags = map[string]struct{}{
		// TODO: Add "div" to this list for css images and possibly captions.
		"img":     {},
		"picture": {},
		"figure":  {},
		"span":    {},
	}

	// relevantTwitterTags is the set of tag names relevant for Twitter embeds.
	relevantTwitterTags = map[string]struct{}{
		"blockquote": {},
		"iframe":     {},
	}

	// relevantVimeoTags is the set of tag names relevant for Vimeo embeds.
	relevantVimeoTags = map[string]struct{}{
		"iframe": {},
	}

	// relevantYouTubeTags is the set of tag names relevant for YouTube embeds.
	relevantYouTubeTags = map[string]struct{}{
		"iframe": {},
		"object": {},
	}
)
79 |
--------------------------------------------------------------------------------
/internal/markup/accessor.go:
--------------------------------------------------------------------------------
1 | // ORIGINAL: java/MarkupParser.java
2 |
3 | // Copyright (c) 2020 Markus Mobius
4 | //
5 | // Permission is hereby granted, free of charge, to any person obtaining a copy
6 | // of this software and associated documentation files (the "Software"), to deal
7 | // in the Software without restriction, including without limitation the rights
8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | // copies of the Software, and to permit persons to whom the Software is
10 | // furnished to do so, subject to the following conditions:
11 | //
12 | // The above copyright notice and this permission notice shall be included in all
13 | // copies or substantial portions of the Software.
14 | //
15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | // SOFTWARE.
22 |
23 | // Copyright 2014 The Chromium Authors. All rights reserved.
24 | // Use of this source code is governed by a BSD-style license that can be
25 | // found in the LICENSE file.
26 |
27 | package markup
28 |
29 | import "github.com/markusmobius/go-domdistiller/data"
30 |
// Accessor is the interface that all markup parsers must implement so that
// Parser can retrieve their properties.
type Accessor interface {
	// Title returns the markup title of the document, empty if none.
	Title() string

	// Type returns the markup type of the document, empty if none.
	Type() string

	// URL returns the markup URL of the document, empty if none.
	URL() string

	// Images returns the properties of all markup images in the document.
	// The first image is the dominant (i.e. top or salient) one.
	Images() []data.MarkupImage

	// Description returns the markup description of the document, empty if none.
	Description() string

	// Publisher returns the markup publisher of the document, empty if none.
	Publisher() string

	// Copyright returns the markup copyright of the document, empty if none.
	Copyright() string

	// Author returns the full name of the markup author, empty if none.
	Author() string

	// Article returns the properties of the markup "article" object, nil if none.
	Article() *data.MarkupArticle

	// OptOut returns true if the page owner has opted out of distillation.
	OptOut() bool
}
65 |
--------------------------------------------------------------------------------
/internal/markup/schemaorg/thing-item-image.go:
--------------------------------------------------------------------------------
1 | // ORIGINAL: java/SchemaOrgParser.java
2 |
3 | // Copyright (c) 2020 Markus Mobius
4 | //
5 | // Permission is hereby granted, free of charge, to any person obtaining a copy
6 | // of this software and associated documentation files (the "Software"), to deal
7 | // in the Software without restriction, including without limitation the rights
8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | // copies of the Software, and to permit persons to whom the Software is
10 | // furnished to do so, subject to the following conditions:
11 | //
12 | // The above copyright notice and this permission notice shall be included in all
13 | // copies or substantial portions of the Software.
14 | //
15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | // SOFTWARE.
22 |
23 | // Copyright 2014 The Chromium Authors. All rights reserved.
24 | // Use of this source code is governed by a BSD-style license that can be
25 | // found in the LICENSE file.
26 |
27 | package schemaorg
28 |
29 | import (
30 | "strconv"
31 | "strings"
32 |
33 | "github.com/markusmobius/go-domdistiller/data"
34 | "golang.org/x/net/html"
35 | )
36 |
37 | type ImageItem struct {
38 | BaseThingItem
39 | }
40 |
41 | func NewImageItem(element *html.Node) *ImageItem {
42 | item := &ImageItem{}
43 | item.init(Image, element)
44 | item.addStringPropertyName(ContentURLProp)
45 | item.addStringPropertyName(EncodingFormatProp)
46 | item.addStringPropertyName(CaptionProp)
47 | item.addStringPropertyName(RepresentativeProp)
48 | item.addStringPropertyName(WidthProp)
49 | item.addStringPropertyName(HeightProp)
50 | return item
51 | }
52 |
53 | func (ii *ImageItem) isRepresentativeOfPage() bool {
54 | propValue := ii.getStringProperty(RepresentativeProp)
55 | return strings.ToLower(propValue) == "true"
56 | }
57 |
58 | func (ii *ImageItem) getImage() *data.MarkupImage {
59 | width, _ := strconv.Atoi(ii.getStringProperty(WidthProp))
60 | height, _ := strconv.Atoi(ii.getStringProperty(HeightProp))
61 | imageURL := ii.getStringProperty(ContentURLProp)
62 | if imageURL == "" {
63 | imageURL = ii.getStringProperty(URLProp)
64 | }
65 |
66 | return &data.MarkupImage{
67 | URL: imageURL,
68 | Type: ii.getStringProperty(EncodingFormatProp),
69 | Caption: ii.getStringProperty(CaptionProp),
70 | Width: width,
71 | Height: height,
72 | }
73 | }
74 |
--------------------------------------------------------------------------------
/internal/webdoc/table.go:
--------------------------------------------------------------------------------
1 | // ORIGINAL: java/webdocument/WebTable.java
2 |
3 | // Copyright (c) 2020 Markus Mobius
4 | //
5 | // Permission is hereby granted, free of charge, to any person obtaining a copy
6 | // of this software and associated documentation files (the "Software"), to deal
7 | // in the Software without restriction, including without limitation the rights
8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | // copies of the Software, and to permit persons to whom the Software is
10 | // furnished to do so, subject to the following conditions:
11 | //
12 | // The above copyright notice and this permission notice shall be included in all
13 | // copies or substantial portions of the Software.
14 | //
15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | // SOFTWARE.
22 |
23 | // Copyright 2014 The Chromium Authors. All rights reserved.
24 | // Use of this source code is governed by a BSD-style license that can be
25 | // found in the LICENSE file.
26 |
27 | package webdoc
28 |
29 | import (
30 | "fmt"
31 | nurl "net/url"
32 |
33 | "github.com/go-shiori/dom"
34 | "github.com/markusmobius/go-domdistiller/internal/domutil"
35 | "golang.org/x/net/html"
36 | )
37 |
38 | type Table struct {
39 | BaseElement
40 |
41 | Element *html.Node
42 | PageURL *nurl.URL
43 |
44 | cloned *html.Node
45 | }
46 |
47 | func (t *Table) ElementType() string {
48 | return "table"
49 | }
50 |
51 | func (t *Table) GenerateOutput(textOnly bool) string {
52 | if t.cloned == nil {
53 | t.cloned = domutil.CloneAndProcessTree(t.Element, t.PageURL)
54 | }
55 |
56 | if textOnly {
57 | return domutil.InnerText(t.cloned)
58 | }
59 |
60 | return dom.OuterHTML(t.cloned)
61 | }
62 |
63 | // GetImageURLs returns list of source URLs of all image inside the table.
64 | func (t *Table) GetImageURLs() []string {
65 | if t.cloned == nil {
66 | t.cloned = domutil.CloneAndProcessTree(t.Element, t.PageURL)
67 | }
68 |
69 | imgURLs := []string{}
70 | for _, img := range dom.QuerySelectorAll(t.cloned, "img,source") {
71 | src := dom.GetAttribute(img, "src")
72 | if src != "" {
73 | imgURLs = append(imgURLs, src)
74 | }
75 |
76 | imgURLs = append(imgURLs, domutil.GetAllSrcSetURLs(img)...)
77 | }
78 |
79 | return imgURLs
80 | }
81 |
82 | func (t *Table) String() string {
83 | return fmt.Sprintf("ELEMENT %q: html=%q, is_content=%v",
84 | t.ElementType(), dom.OuterHTML(t.Element), t.isContent)
85 | }
86 |
--------------------------------------------------------------------------------
/internal/webdoc/text-document_test.go:
--------------------------------------------------------------------------------
1 | // ORIGINAL: javatest/TextDocumentStatisticsTest.java
2 |
3 | // Copyright (c) 2020 Markus Mobius
4 | //
5 | // Permission is hereby granted, free of charge, to any person obtaining a copy
6 | // of this software and associated documentation files (the "Software"), to deal
7 | // in the Software without restriction, including without limitation the rights
8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | // copies of the Software, and to permit persons to whom the Software is
10 | // furnished to do so, subject to the following conditions:
11 | //
12 | // The above copyright notice and this permission notice shall be included in all
13 | // copies or substantial portions of the Software.
14 | //
15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | // SOFTWARE.
22 |
23 | // Copyright 2014 The Chromium Authors. All rights reserved.
24 | // Use of this source code is governed by a BSD-style license that can be
25 | // found in the LICENSE file.
26 |
27 | package webdoc_test
28 |
29 | import (
30 | "testing"
31 |
32 | "github.com/markusmobius/go-domdistiller/internal/stringutil"
33 | "github.com/markusmobius/go-domdistiller/internal/testutil"
34 | "github.com/stretchr/testify/assert"
35 | )
36 |
// ThreeWords is the three-word sample sentence used as the text of every
// block added by the tests in this file.
const ThreeWords = "I love statistics"
38 |
39 | func Test_WebDoc_TextDocument_OnlyContent(t *testing.T) {
40 | builder := testutil.NewTextDocumentBuilder(stringutil.FastWordCounter{})
41 | builder.AddContentBlock(ThreeWords)
42 | builder.AddContentBlock(ThreeWords)
43 | builder.AddContentBlock(ThreeWords)
44 |
45 | doc := builder.Build()
46 | assert.Equal(t, 9, doc.CountWordsInContent())
47 | }
48 |
49 | func Test_WebDoc_TextDocument_OnlyNonContent(t *testing.T) {
50 | builder := testutil.NewTextDocumentBuilder(stringutil.FastWordCounter{})
51 | builder.AddNonContentBlock(ThreeWords)
52 | builder.AddNonContentBlock(ThreeWords)
53 | builder.AddNonContentBlock(ThreeWords)
54 |
55 | doc := builder.Build()
56 | assert.Equal(t, 0, doc.CountWordsInContent())
57 | }
58 |
59 | func Test_WebDoc_TextDocument_MixedContent(t *testing.T) {
60 | builder := testutil.NewTextDocumentBuilder(stringutil.FastWordCounter{})
61 | builder.AddContentBlock(ThreeWords)
62 | builder.AddNonContentBlock(ThreeWords)
63 | builder.AddContentBlock(ThreeWords)
64 | builder.AddNonContentBlock(ThreeWords)
65 |
66 | doc := builder.Build()
67 | assert.Equal(t, 6, doc.CountWordsInContent())
68 | }
69 |
--------------------------------------------------------------------------------
/internal/domutil/tree-clone.go:
--------------------------------------------------------------------------------
1 | // ORIGINAL: java/TreeCloneBuilder.java
2 |
3 | // Copyright (c) 2020 Markus Mobius
4 | //
5 | // Permission is hereby granted, free of charge, to any person obtaining a copy
6 | // of this software and associated documentation files (the "Software"), to deal
7 | // in the Software without restriction, including without limitation the rights
8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | // copies of the Software, and to permit persons to whom the Software is
10 | // furnished to do so, subject to the following conditions:
11 | //
12 | // The above copyright notice and this permission notice shall be included in all
13 | // copies or substantial portions of the Software.
14 | //
15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | // SOFTWARE.
22 |
23 | // Copyright 2015 The Chromium Authors. All rights reserved.
24 | // Use of this source code is governed by a BSD-style license that can be
25 | // found in the LICENSE file.
26 |
27 | package domutil
28 |
29 | import (
30 | "golang.org/x/net/html"
31 | )
32 |
33 | // TreeClone takes a list of nodes and returns a clone of the minimum tree in the
34 | // DOM that contains all of them. This is done by going through each node, cloning its
35 | // parent and adding children to that parent until the next node is not contained in
36 | // that parent (originally). The list cannot contain a parent of any of the other nodes.
37 | // Children of the nodes in the provided list are excluded.
38 | //
39 | // This implementation doesn't come from the original dom-distiller code. Instead I
40 | // created it from scratch to make it simpler and more Go idiomatic.
41 | func TreeClone(nodes []*html.Node) *html.Node {
42 | // Get the nearest ancestor
43 | allAncestors, nearestAncestor := GetAncestors(nodes...)
44 | if nearestAncestor == nil {
45 | return nil
46 | }
47 |
48 | // Clone the ancestor and childrens that required to reach specified nodes
49 | var fnClone func(src *html.Node) *html.Node
50 | fnClone = func(src *html.Node) *html.Node {
51 | clone := &html.Node{
52 | Type: src.Type,
53 | DataAtom: src.DataAtom,
54 | Data: src.Data,
55 | Attr: append([]html.Attribute{}, src.Attr...),
56 | }
57 |
58 | for child := src.FirstChild; child != nil; child = child.NextSibling {
59 | if _, exist := allAncestors[child]; exist {
60 | clone.AppendChild(fnClone(child))
61 | }
62 | }
63 |
64 | return clone
65 | }
66 |
67 | return fnClone(nearestAncestor)
68 | }
69 |
--------------------------------------------------------------------------------
/internal/filter/simple/label-to-boilerplate.go:
--------------------------------------------------------------------------------
1 | // ORIGINAL: java/filters/simple/LabelToBoilerplateFilter.java
2 |
3 | // Copyright (c) 2020 Markus Mobius
4 | //
5 | // Permission is hereby granted, free of charge, to any person obtaining a copy
6 | // of this software and associated documentation files (the "Software"), to deal
7 | // in the Software without restriction, including without limitation the rights
8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | // copies of the Software, and to permit persons to whom the Software is
10 | // furnished to do so, subject to the following conditions:
11 | //
12 | // The above copyright notice and this permission notice shall be included in all
13 | // copies or substantial portions of the Software.
14 | //
15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | // SOFTWARE.
22 |
23 | // Copyright 2015 The Chromium Authors. All rights reserved.
24 | // Use of this source code is governed by a BSD-style license that can be
25 | // found in the LICENSE file.
26 |
27 | // boilerpipe
28 | //
29 | // Copyright (c) 2009 Christian Kohlschütter
30 | //
31 | // The author licenses this file to You under the Apache License, Version 2.0
32 | // (the "License"); you may not use this file except in compliance with
33 | // the License. You may obtain a copy of the License at
34 | //
35 | // http://www.apache.org/licenses/LICENSE-2.0
36 | //
37 | // Unless required by applicable law or agreed to in writing, software
38 | // distributed under the License is distributed on an "AS IS" BASIS,
39 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
40 | // See the License for the specific language governing permissions and
41 | // limitations under the License.
42 |
43 | package simple
44 |
45 | import (
46 | "github.com/markusmobius/go-domdistiller/internal/webdoc"
47 | )
48 |
// LabelToBoilerplate is a filter that marks every content block carrying at
// least one of the configured labels as "boilerplate" (i.e. not content).
type LabelToBoilerplate struct {
	// labels is the list of labels that turn a content block into boilerplate.
	labels []string
}
53 |
54 | func NewLabelToBoilerplate(labels ...string) *LabelToBoilerplate {
55 | return &LabelToBoilerplate{labels: labels}
56 | }
57 |
58 | func (f *LabelToBoilerplate) Process(doc *webdoc.TextDocument) bool {
59 | changes := false
60 |
61 | blockLoop:
62 | for _, tb := range doc.TextBlocks {
63 | if tb.IsContent() {
64 | for _, label := range f.labels {
65 | if tb.HasLabel(label) {
66 | tb.SetIsContent(false)
67 | changes = true
68 | continue blockLoop
69 | }
70 | }
71 | }
72 | }
73 |
74 | return changes
75 | }
76 |
--------------------------------------------------------------------------------
/internal/filter/docfilter/scorer/image-dom-distance.go:
--------------------------------------------------------------------------------
1 | // ORIGINAL: java/webdocument/filters/images/DomDistanceScorer.java
2 |
3 | // Copyright (c) 2020 Markus Mobius
4 | //
5 | // Permission is hereby granted, free of charge, to any person obtaining a copy
6 | // of this software and associated documentation files (the "Software"), to deal
7 | // in the Software without restriction, including without limitation the rights
8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | // copies of the Software, and to permit persons to whom the Software is
10 | // furnished to do so, subject to the following conditions:
11 | //
12 | // The above copyright notice and this permission notice shall be included in all
13 | // copies or substantial portions of the Software.
14 | //
15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | // SOFTWARE.
22 |
23 | // Copyright 2014 The Chromium Authors. All rights reserved.
24 | // Use of this source code is governed by a BSD-style license that can be
25 | // found in the LICENSE file.
26 |
27 | package scorer
28 |
29 | import (
30 | "github.com/markusmobius/go-domdistiller/internal/domutil"
31 | "golang.org/x/net/html"
32 | )
33 |
// ImageDomDistanceScorer scores an image using DOM distance as its heuristic:
// the score depends on how far up the tree from the first content node the
// common ancestor of that node and the image is found.
type ImageDomDistanceScorer struct {
	// maxScore is the upper bound of the score returned by GetImageScore.
	maxScore int
	// firstContentNode is the reference node that images are compared against.
	// When it is nil every image gets a score of zero.
	firstContentNode *html.Node
}
39 |
40 | // NewImageDomDistanceScorer returns and initiates the ImageDomDistanceScorer.
41 | func NewImageDomDistanceScorer(maxScore int, firstContent *html.Node) *ImageDomDistanceScorer {
42 | return &ImageDomDistanceScorer{
43 | maxScore: maxScore,
44 | firstContentNode: firstContent,
45 | }
46 | }
47 |
48 | func (s *ImageDomDistanceScorer) GetImageScore(node *html.Node) int {
49 | var score int
50 | if node != nil {
51 | score = s.compute(node)
52 | }
53 |
54 | if score < s.maxScore {
55 | return score
56 | }
57 |
58 | return s.maxScore
59 | }
60 |
// GetMaxScore returns the maximum score this scorer was created with.
func (s *ImageDomDistanceScorer) GetMaxScore() int {
	return s.maxScore
}
64 |
65 | func (s *ImageDomDistanceScorer) compute(node *html.Node) int {
66 | if s.firstContentNode == nil {
67 | return 0
68 | }
69 |
70 | depthDiff := domutil.GetNodeDepth(s.firstContentNode) -
71 | domutil.GetNodeDepth(domutil.GetNearestCommonAncestor(s.firstContentNode, node))
72 |
73 | var multiplier float64
74 | if depthDiff < 4 {
75 | multiplier = 1
76 | } else if depthDiff < 6 {
77 | multiplier = 0.6
78 | } else if depthDiff < 8 {
79 | multiplier = 0.2
80 | }
81 |
82 | return int(float64(s.maxScore) * multiplier)
83 | }
84 |
--------------------------------------------------------------------------------
/internal/domutil/tree-clone_test.go:
--------------------------------------------------------------------------------
1 | // ORIGINAL: javatest/TreeCloneBuilderTest.java
2 |
3 | // Copyright (c) 2020 Markus Mobius
4 | //
5 | // Permission is hereby granted, free of charge, to any person obtaining a copy
6 | // of this software and associated documentation files (the "Software"), to deal
7 | // in the Software without restriction, including without limitation the rights
8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | // copies of the Software, and to permit persons to whom the Software is
10 | // furnished to do so, subject to the following conditions:
11 | //
12 | // The above copyright notice and this permission notice shall be included in all
13 | // copies or substantial portions of the Software.
14 | //
15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | // SOFTWARE.
22 |
23 | // Copyright 2015 The Chromium Authors. All rights reserved.
24 | // Use of this source code is governed by a BSD-style license that can be
25 | // found in the LICENSE file.
26 |
27 | package domutil_test
28 |
29 | import (
30 | "regexp"
31 | "testing"
32 |
33 | "github.com/go-shiori/dom"
34 | "github.com/markusmobius/go-domdistiller/internal/domutil"
35 | "github.com/markusmobius/go-domdistiller/internal/testutil"
36 | "github.com/stretchr/testify/assert"
37 | "golang.org/x/net/html"
38 | )
39 |
40 | func Test_DomUtil_TreeClone_FullTreeBuilder(t *testing.T) {
41 | expectedHTML := `
42 |
")
69 |
70 | table := dom.QuerySelector(div, "table")
71 | wt := &webdoc.Table{Element: table}
72 | db.document.AddElements(wt)
73 | return wt
74 | }
75 |
76 | func (db *WebDocumentBuilder) AddImage() *webdoc.Image {
77 | image := dom.CreateElement("img")
78 | dom.SetAttribute(image, "src", "http://www.example.com/foo.jpg")
79 |
80 | wi := &webdoc.Image{Element: image}
81 | db.document.AddElements(wi)
82 | return wi
83 | }
84 |
85 | func (db *WebDocumentBuilder) AddLeadImage() *webdoc.Image {
86 | image := dom.CreateElement("img")
87 | dom.SetAttribute(image, "width", "600")
88 | dom.SetAttribute(image, "height", "400")
89 | dom.SetAttribute(image, "src", "http://www.example.com/lead.bmp")
90 |
91 | wi := &webdoc.Image{Element: image}
92 | db.document.AddElements(wi)
93 | return wi
94 | }
95 |
96 | func (db *WebDocumentBuilder) AddTagStart(tagName string) *webdoc.Tag {
97 | wt := webdoc.NewTag(tagName, webdoc.TagStart)
98 | db.document.AddElements(wt)
99 | return wt
100 | }
101 |
102 | func (db *WebDocumentBuilder) AddTagEnd(tagName string) *webdoc.Tag {
103 | wt := webdoc.NewTag(tagName, webdoc.TagEnd)
104 | db.document.AddElements(wt)
105 | return wt
106 | }
107 |
// Build returns the document assembled so far. The builder's own document is
// returned, not a copy.
func (db *WebDocumentBuilder) Build() *webdoc.Document {
	return db.document
}
111 |
--------------------------------------------------------------------------------
/internal/filter/english/num-words.go:
--------------------------------------------------------------------------------
1 | // ORIGINAL: java/filters/english/NumWordsRulesClassifier.java
2 |
3 | // Copyright (c) 2020 Markus Mobius
4 | //
5 | // Permission is hereby granted, free of charge, to any person obtaining a copy
6 | // of this software and associated documentation files (the "Software"), to deal
7 | // in the Software without restriction, including without limitation the rights
8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | // copies of the Software, and to permit persons to whom the Software is
10 | // furnished to do so, subject to the following conditions:
11 | //
12 | // The above copyright notice and this permission notice shall be included in all
13 | // copies or substantial portions of the Software.
14 | //
15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | // SOFTWARE.
22 |
23 | // Copyright 2015 The Chromium Authors. All rights reserved.
24 | // Use of this source code is governed by a BSD-style license that can be
25 | // found in the LICENSE file.
26 |
27 | // boilerpipe
28 | //
29 | // Copyright (c) 2009 Christian Kohlschütter
30 | //
31 | // The author licenses this file to You under the Apache License, Version 2.0
32 | // (the "License"); you may not use this file except in compliance with
33 | // the License. You may obtain a copy of the License at
34 | //
35 | // http://www.apache.org/licenses/LICENSE-2.0
36 | //
37 | // Unless required by applicable law or agreed to in writing, software
38 | // distributed under the License is distributed on an "AS IS" BASIS,
39 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
40 | // See the License for the specific language governing permissions and
41 | // limitations under the License.
42 |
43 | package english
44 |
45 | import (
46 | "github.com/markusmobius/go-domdistiller/internal/webdoc"
47 | )
48 |
// NumWordsRulesClassifier classifies each TextBlock as content or not-content
// using rules that have been determined with the C4.8 machine learning
// algorithm, as described in the paper "Boilerplate Detection using Shallow
// Text Features" (WSDM 2010). The rules look at the number of words and the
// link density of a block and of its direct neighbours. The type is stateless.
type NumWordsRulesClassifier struct{}
54 |
55 | func NewNumWordsRulesClassifier() *NumWordsRulesClassifier {
56 | return &NumWordsRulesClassifier{}
57 | }
58 |
59 | func (f *NumWordsRulesClassifier) Process(doc *webdoc.TextDocument) bool {
60 | textBlocks := doc.TextBlocks
61 | if len(textBlocks) == 0 {
62 | return false
63 | }
64 |
65 | hasChanges := false
66 | for i, block := range textBlocks {
67 | var prevBlock, nextBlock *webdoc.TextBlock
68 | if i > 0 {
69 | prevBlock = textBlocks[i-1]
70 | }
71 | if i+1 < len(textBlocks) {
72 | nextBlock = textBlocks[i+1]
73 | }
74 |
75 | changed := f.classify(prevBlock, block, nextBlock)
76 | hasChanges = hasChanges || changed
77 | }
78 |
79 | return hasChanges
80 | }
81 |
82 | func (f *NumWordsRulesClassifier) classify(prev, current, next *webdoc.TextBlock) bool {
83 | isContent := false
84 |
85 | if current.LinkDensity <= 0.333333 {
86 | if prev == nil || prev.LinkDensity <= 0.555556 {
87 | if current.NumWords <= 16 {
88 | if next == nil || next.NumWords <= 15 {
89 | isContent = prev != nil && prev.NumWords > 4
90 | } else {
91 | isContent = true
92 | }
93 | } else {
94 | isContent = true
95 | }
96 | } else {
97 | if current.NumWords <= 40 {
98 | isContent = next != nil && next.NumWords > 17
99 | } else {
100 | isContent = true
101 | }
102 | }
103 | } else {
104 | isContent = false
105 | }
106 |
107 | return current.SetIsContent(isContent)
108 | }
109 |
--------------------------------------------------------------------------------
/internal/webdoc/element-action.go:
--------------------------------------------------------------------------------
1 | // ORIGINAL: java/webdocument/ElementAction.java
2 |
3 | // Copyright (c) 2020 Markus Mobius
4 | //
5 | // Permission is hereby granted, free of charge, to any person obtaining a copy
6 | // of this software and associated documentation files (the "Software"), to deal
7 | // in the Software without restriction, including without limitation the rights
8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | // copies of the Software, and to permit persons to whom the Software is
10 | // furnished to do so, subject to the following conditions:
11 | //
12 | // The above copyright notice and this permission notice shall be included in all
13 | // copies or substantial portions of the Software.
14 | //
15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | // SOFTWARE.
22 |
23 | // Copyright 2014 The Chromium Authors. All rights reserved.
24 | // Use of this source code is governed by a BSD-style license that can be
25 | // found in the LICENSE file.
26 |
27 | package webdoc
28 |
29 | import (
30 | "regexp"
31 | "strings"
32 |
33 | "github.com/go-shiori/dom"
34 | "github.com/markusmobius/go-domdistiller/internal/domutil"
35 | "github.com/markusmobius/go-domdistiller/internal/label"
36 | "golang.org/x/net/html"
37 | )
38 |
// maxClassCount is the largest number of class names an element may have for
// a "comment" match on its id or class to label it as strictly not content.
const maxClassCount = 2
40 |
// rxComment matches the whole word "comment" or "comments", case-insensitively.
var rxComment = regexp.MustCompile(`(?i)\bcomments?\b`)
42 |
// ElementAction describes how an element should be handled, as decided by
// GetActionForElement.
type ElementAction struct {
	// Flush is set for elements whose display style is neither "none", an
	// inline variant, nor overridden by an `li`/`summary` ancestor.
	// NOTE(review): presumably tells the caller to flush pending text; confirm at the call site.
	Flush bool
	// IsAnchor is true for an `a` element that has an `href` attribute.
	IsAnchor bool
	// ChangesTagLevel reports whether the element changes the tag level.
	ChangesTagLevel bool
	// Labels holds the labels to attach because of this element, e.g. heading,
	// list item or strictly-not-content labels.
	Labels []string
}
49 |
50 | func GetActionForElement(element *html.Node) ElementAction {
51 | tagName := dom.TagName(element)
52 |
53 | // NEED-COMPUTE-CSS
54 | // In original dom-distiller, the `flush` and `changesTagLevel` values are decided depending
55 | // on element display syle. For example, inline element shouldn't change tag level. Unfortunately,
56 | // this is not possible since we can't compute stylesheet. As fallback, here we simply use the
57 | // default display for the tag name
58 | action := ElementAction{}
59 | display := domutil.GetDisplayStyle(element)
60 | switch display {
61 | case "none", "inline": // do nothing
62 | case "inline-block", "inline-flex":
63 | action.ChangesTagLevel = true
64 | default:
65 | action.Flush = true
66 | action.ChangesTagLevel = true
67 | }
68 |
69 | // Check if item is inside
70 | if domutil.HasAncestor(element, "li", "summary") {
71 | action.Flush = false
72 | action.ChangesTagLevel = false
73 | }
74 |
75 | if tagName != "html" && tagName != "body" && tagName != "article" {
76 | id := dom.GetAttribute(element, "id")
77 | className := dom.GetAttribute(element, "class")
78 | classCount := len(strings.Fields(className))
79 | if (rxComment.MatchString(id) || rxComment.MatchString(className)) && classCount <= maxClassCount {
80 | action.Labels = append(action.Labels, label.StrictlyNotContent)
81 | }
82 |
83 | switch tagName {
84 | case "aside", "nav":
85 | action.Labels = append(action.Labels, label.StrictlyNotContent)
86 | case "li":
87 | action.Labels = append(action.Labels, label.Li)
88 | case "h1":
89 | action.Labels = append(action.Labels, label.H1, label.Heading)
90 | case "h2":
91 | action.Labels = append(action.Labels, label.H2, label.Heading)
92 | case "h3":
93 | action.Labels = append(action.Labels, label.H3, label.Heading)
94 | case "h4", "h5", "h6":
95 | action.Labels = append(action.Labels, label.Heading)
96 | case "a":
97 | // TODO: Anchors probably shouldn't unconditionally change the tag level.
98 | action.ChangesTagLevel = true
99 | action.IsAnchor = dom.HasAttribute(element, "href")
100 | }
101 | }
102 |
103 | return action
104 | }
105 |
--------------------------------------------------------------------------------
/internal/webdoc/image_test.go:
--------------------------------------------------------------------------------
1 | // ORIGINAL: javatest/webdocument/WebImageTest.java
2 |
3 | // Copyright (c) 2020 Markus Mobius
4 | //
5 | // Permission is hereby granted, free of charge, to any person obtaining a copy
6 | // of this software and associated documentation files (the "Software"), to deal
7 | // in the Software without restriction, including without limitation the rights
8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | // copies of the Software, and to permit persons to whom the Software is
10 | // furnished to do so, subject to the following conditions:
11 | //
12 | // The above copyright notice and this permission notice shall be included in all
13 | // copies or substantial portions of the Software.
14 | //
15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | // SOFTWARE.
22 |
23 | // Copyright 2016 The Chromium Authors. All rights reserved.
24 | // Use of this source code is governed by a BSD-style license that can be
25 | // found in the LICENSE file.
26 |
27 | package webdoc_test
28 |
29 | import (
30 | nurl "net/url"
31 | "testing"
32 |
33 | "github.com/go-shiori/dom"
34 | "github.com/markusmobius/go-domdistiller/internal/webdoc"
35 | "github.com/stretchr/testify/assert"
36 | )
37 |
// Test_WebDoc_Image_GenerateOutput checks the HTML produced by GenerateOutput
// for a webdoc.Image that wraps the `picture` element found in the fixture.
//
// NOTE(review): the HTML fixture and the expected output are empty string
// literals in this copy of the file; presumably the markup was stripped during
// extraction. Restore them from upstream before relying on this test.
func Test_WebDoc_Image_GenerateOutput(t *testing.T) {
	html := `` +
		`` +
		`` +
		``

	div := dom.CreateElement("div")
	dom.SetInnerHTML(div, html)

	picture := dom.QuerySelector(div, "picture")
	// The parse error is ignored: the URL is a fixed, well-formed literal.
	baseURL, _ := nurl.ParseRequestURI("http://example.com/")
	webImage := webdoc.Image{Element: picture, PageURL: baseURL}

	expected := ``
	assert.Equal(t, expected, webImage.GenerateOutput(false))
}
54 |
55 | func Test_WebDoc_Image_GetSrcList(t *testing.T) {
56 | img := dom.CreateElement("img")
57 | dom.SetAttribute(img, "src", "image")
58 | dom.SetAttribute(img, "srcset", "image200 200w, image400 400w")
59 |
60 | baseURL, _ := nurl.ParseRequestURI("http://example.com/")
61 | webImage := webdoc.Image{
62 | Element: img,
63 | PageURL: baseURL,
64 | }
65 |
66 | urls := webImage.GetURLs()
67 | assert.Equal(t, 3, len(urls))
68 | assert.Equal(t, "http://example.com/image", urls[0])
69 | assert.Equal(t, "http://example.com/image200", urls[1])
70 | assert.Equal(t, "http://example.com/image400", urls[2])
71 | }
72 |
// Test_WebDoc_Image_GetSrcListInPicture checks that GetURLs on a webdoc.Image
// wrapping a `picture` element returns exactly two absolute URLs, in order.
//
// NOTE(review): the HTML fixture consists of empty string literals in this
// copy of the file; presumably the markup was stripped during extraction.
// Restore it from upstream before relying on this test.
func Test_WebDoc_Image_GetSrcListInPicture(t *testing.T) {
	html := `` +
		`` +
		`` +
		``

	div := dom.CreateElement("div")
	dom.SetInnerHTML(div, html)

	picture := dom.QuerySelector(div, "picture")
	// The parse error is ignored: the URL is a fixed, well-formed literal.
	baseURL, _ := nurl.ParseRequestURI("http://example.com/")
	webImage := webdoc.Image{Element: picture, PageURL: baseURL}

	urls := webImage.GetURLs()
	assert.Equal(t, 2, len(urls))
	assert.Equal(t, "http://example.com/image100", urls[0])
	assert.Equal(t, "http://example.org/image300", urls[1])
}
91 |
// Test_WebDoc_Image_PictureWithoutImg checks the HTML produced by
// GenerateOutput for a `picture` element; going by the test name, one that
// contains no `img` child.
//
// NOTE(review): the HTML fixture and the expected output are empty string
// literals in this copy of the file; presumably the markup was stripped during
// extraction. Restore them from upstream before relying on this test.
func Test_WebDoc_Image_PictureWithoutImg(t *testing.T) {
	html := `` +
		`` +
		``

	div := dom.CreateElement("div")
	dom.SetInnerHTML(div, html)

	picture := dom.QuerySelector(div, "picture")
	// The parse error is ignored: the URL is a fixed, well-formed literal.
	baseURL, _ := nurl.ParseRequestURI("http://example.com/")
	webImage := webdoc.Image{Element: picture, PageURL: baseURL}

	expected := ``
	assert.Equal(t, expected, webImage.GenerateOutput(false))
}
107 |
--------------------------------------------------------------------------------
/internal/markup/schemaorg/constant.go:
--------------------------------------------------------------------------------
1 | // ORIGINAL: java/SchemaOrgParser.java
2 |
3 | // Copyright (c) 2020 Markus Mobius
4 | //
5 | // Permission is hereby granted, free of charge, to any person obtaining a copy
6 | // of this software and associated documentation files (the "Software"), to deal
7 | // in the Software without restriction, including without limitation the rights
8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | // copies of the Software, and to permit persons to whom the Software is
10 | // furnished to do so, subject to the following conditions:
11 | //
12 | // The above copyright notice and this permission notice shall be included in all
13 | // copies or substantial portions of the Software.
14 | //
15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | // SOFTWARE.
22 |
23 | // Copyright 2014 The Chromium Authors. All rights reserved.
24 | // Use of this source code is governed by a BSD-style license that can be
25 | // found in the LICENSE file.
26 |
27 | package schemaorg
28 |
// Names of the properties recognised by this package. All constants ending in
// `Prop` are property names; AuthorRel shares its value with AuthorProp.
// NOTE(review): presumably these are schema.org `itemprop` names and AuthorRel
// is a `rel` attribute value; confirm against the parser that uses them.
const (
	NameProp            = "name"
	URLProp             = "url"
	DescriptionProp     = "description"
	ImageProp           = "image"
	HeadlineProp        = "headline"
	PublisherProp       = "publisher"
	CopyrightHolderProp = "copyrightHolder"
	CopyrightYearProp   = "copyrightYear"
	ContentURLProp      = "contentUrl"
	EncodingFormatProp  = "encodingFormat"
	CaptionProp         = "caption"
	RepresentativeProp  = "representativeOfPage"
	WidthProp           = "width"
	HeightProp          = "height"
	DatePublishedProp   = "datePublished"
	DateModifiedProp    = "dateModified"
	AuthorProp          = "author"
	CreatorProp         = "creator"
	SectionProp         = "articleSection"
	AssociatedMediaProp = "associatedMedia"
	EncodingProp        = "encoding"
	FamilyNameProp      = "familyName"
	GivenNameProp       = "givenName"
	LegalNameProp       = "legalName"
	AuthorRel           = "author"
)
56 |
// SchemaType identifies the kind of schema.org item a type URL maps to.
type SchemaType uint
58 |
// Supported schema types. Unsupported is the zero value, so it is also what a
// lookup of an unknown type URL in schemaTypeURLs yields.
const (
	Unsupported SchemaType = iota
	Image
	Article
	Person
	Organization
)
66 |
// schemaTypeURLs maps a schema.org type URL to the SchemaType it is handled
// as. Several specific types share one SchemaType (e.g. every article variant
// maps to Article). Only `http://` URLs are listed.
var schemaTypeURLs = map[string]SchemaType{
	"http://schema.org/ImageObject":             Image,
	"http://schema.org/Article":                 Article,
	"http://schema.org/BlogPosting":             Article,
	"http://schema.org/NewsArticle":             Article,
	"http://schema.org/ScholarlyArticle":        Article,
	"http://schema.org/TechArticle":             Article,
	"http://schema.org/Person":                  Person,
	"http://schema.org/Organization":            Organization,
	"http://schema.org/Corporation":             Organization,
	"http://schema.org/EducationalOrganization": Organization,
	"http://schema.org/GovernmentOrganization":  Organization,
	"http://schema.org/NGO":                     Organization,
}
81 |
// The key for `tagAttributeMap` is the tag name, while the entry value is the
// name of the single attribute of that tag from which to extract information.
// NOTE(review): an earlier version of this comment described the value as an
// array of attributes (0th for the itemprop value, 1st for the author
// property); the map now holds exactly one attribute name per tag.
var tagAttributeMap = map[string]string{
	"img":    "src",
	"audio":  "src",
	"embed":  "src",
	"iframe": "src",
	"source": "src",
	"track":  "src",
	"video":  "src",
	"a":      "href",
	"link":   "href",
	"area":   "href",
	"meta":   "content",
	"time":   "datetime",
	"object": "data",
	"data":   "value",
	"meter":  "value",
}
103 |
--------------------------------------------------------------------------------