├── .gitignore ├── LICENSE ├── README.md ├── examples └── main.go ├── go.mod ├── go.sum ├── internal ├── archiver │ ├── archiver.go │ └── http-client.go └── processor │ ├── css.go │ ├── general.go │ ├── html.go │ ├── js.go │ └── processor.go ├── reader.go └── writer.go /.gitignore: -------------------------------------------------------------------------------- 1 | # Exclude config 2 | /.vscode -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018-present Radhi Fadlillah 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | WARC 2 | === 3 | 4 | [![GoDoc](https://godoc.org/github.com/go-shiori/warc?status.png)](https://godoc.org/github.com/go-shiori/warc) 5 | [![Go Report Card](https://goreportcard.com/badge/github.com/go-shiori/warc)](https://goreportcard.com/report/github.com/go-shiori/warc) 6 | 7 | **This project is now archived**. If you want to archive web pages, consider checking out [`obelisk`](https://github.com/go-shiori/obelisk). It has a better output format (plain HTML) and is IMHO better written than this. 8 | 9 | WARC is a Go package that archives a web page and its resources into a single [`bolt`](https://github.com/etcd-io/bbolt) database file. Developed as part of the [Shiori](https://github.com/go-shiori/shiori) bookmarks manager. 10 | 11 | It is still in the development phase but should be stable enough to use. The `bolt` database that is used by this project is also stable, both in API and file format. Unfortunately, right now WARC disables Javascript when archiving a page, so it still doesn't work on SPA sites like Twitter or Reddit. 12 | 13 | ## Installation 14 | 15 | To install this package, just run `go get` : 16 | 17 | ``` 18 | go get -u -v github.com/go-shiori/warc 19 | ``` 20 | 21 | ## Licenses 22 | 23 | WARC is distributed under the [MIT license](https://choosealicense.com/licenses/mit/), which means you can use and modify it however you want. However, if you make an enhancement for it, if possible, please send a pull request.
24 | -------------------------------------------------------------------------------- /examples/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "log" 5 | 6 | "github.com/go-shiori/warc" 7 | ) 8 | 9 | func main() { 10 | // Define variables 11 | url := "https://apnews.com/6e151296fb194f85ba69a8babd972e4b" 12 | userAgent := "Shiori/2.0.0 (+https://github.com/go-shiori/shiori)" 13 | 14 | // Ceate archival request 15 | req := warc.ArchivalRequest{ 16 | URL: url, 17 | UserAgent: userAgent, 18 | LogEnabled: true, 19 | } 20 | 21 | // Start archival 22 | err := warc.NewArchive(req, "ap-news") 23 | if err != nil { 24 | log.Fatalln(err) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/go-shiori/warc 2 | 3 | go 1.12 4 | 5 | require ( 6 | github.com/go-shiori/dom v0.0.0-20190930082056-9d974a4f8b25 7 | github.com/konsorten/go-windows-terminal-sequences v1.0.2 // indirect 8 | github.com/sirupsen/logrus v1.4.2 9 | github.com/stretchr/testify v1.3.0 // indirect 10 | github.com/tdewolff/parse v2.3.4+incompatible 11 | github.com/tdewolff/test v1.0.0 // indirect 12 | go.etcd.io/bbolt v1.3.3 13 | golang.org/x/net v0.0.0-20190926025831-c00fd9afed17 14 | golang.org/x/sys v0.0.0-20190927073244-c990c680b611 // indirect 15 | ) 16 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 2 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 3 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 4 | github.com/go-shiori/dom v0.0.0-20190930082056-9d974a4f8b25 
h1:1ZfeL7TG+z4cjtC6XT+drfe23JxaVMwdqyGBh4O4foo= 5 | github.com/go-shiori/dom v0.0.0-20190930082056-9d974a4f8b25/go.mod h1:360KoNl36ftFYhjLHuEty78kWUGw8i1opEicvIDLfRk= 6 | github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= 7 | github.com/konsorten/go-windows-terminal-sequences v1.0.2 h1:DB17ag19krx9CFsz4o3enTrPXyIXCl+2iCXH/aMAp9s= 8 | github.com/konsorten/go-windows-terminal-sequences v1.0.2/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= 9 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 10 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 11 | github.com/sirupsen/logrus v1.4.2 h1:SPIRibHv4MatM3XXNO2BJeFLZwZ2LvZgfQ5+UNI2im4= 12 | github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE= 13 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 14 | github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 15 | github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= 16 | github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q= 17 | github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= 18 | github.com/tdewolff/parse v2.3.4+incompatible h1:x05/cnGwIMf4ceLuDMBOdQ1qGniMoxpP46ghf0Qzh38= 19 | github.com/tdewolff/parse v2.3.4+incompatible/go.mod h1:8oBwCsVmUkgHO8M5iCzSIDtpzXOT0WXX9cWhz+bIzJQ= 20 | github.com/tdewolff/test v1.0.0 h1:jOwzqCXr5ePXEPGJaq2ivoR6HOCi+D5TPfpoyg8yvmU= 21 | github.com/tdewolff/test v1.0.0/go.mod h1:DiQUlutnqlEvdvhSn2LPGy4TFwRauAaYDsL+683RNX4= 22 | go.etcd.io/bbolt v1.3.3 h1:MUGmc65QhB3pIlaQ5bB4LwqSj6GIonVJXpZiaKNyaKk= 23 | go.etcd.io/bbolt v1.3.3/go.mod h1:IbVyRI1SCnLcuJnV2u8VeU0CEYM7e686BmAb1XKL+uU= 24 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= 
25 | golang.org/x/net v0.0.0-20190926025831-c00fd9afed17 h1:qPnAdmjNA41t3QBTx2mFGf/SD1IoslhYu7AmdsVzCcs= 26 | golang.org/x/net v0.0.0-20190926025831-c00fd9afed17/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= 27 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 28 | golang.org/x/sys v0.0.0-20190422165155-953cdadca894/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 29 | golang.org/x/sys v0.0.0-20190927073244-c990c680b611 h1:q9u40nxWT5zRClI/uU9dHCiYGottAg6Nzz4YUQyHxdA= 30 | golang.org/x/sys v0.0.0-20190927073244-c990c680b611/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 31 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= 32 | -------------------------------------------------------------------------------- /internal/archiver/archiver.go: -------------------------------------------------------------------------------- 1 | package archiver 2 | 3 | import ( 4 | "bytes" 5 | "compress/gzip" 6 | "fmt" 7 | "io" 8 | "net/http" 9 | "strings" 10 | "sync" 11 | 12 | "github.com/go-shiori/warc/internal/processor" 13 | "github.com/sirupsen/logrus" 14 | "go.etcd.io/bbolt" 15 | ) 16 | 17 | // Request is struct that contains page data that want to be archived. 18 | type Request struct { 19 | Reader io.Reader 20 | URL string 21 | ContentType string 22 | } 23 | 24 | // Archiver is struct that do the archival. 
25 | type Archiver struct { 26 | sync.RWMutex 27 | 28 | DB *bbolt.DB 29 | UserAgent string 30 | LogEnabled bool 31 | 32 | resourceMap map[string]struct{} 33 | } 34 | 35 | // Start starts the archival process 36 | func (arc *Archiver) Start(req Request) error { 37 | if arc.resourceMap == nil { 38 | arc.resourceMap = make(map[string]struct{}) 39 | } 40 | 41 | return arc.archive(req, true) 42 | } 43 | 44 | func (arc *Archiver) archive(req Request, root bool) error { 45 | // Check if this request already processed before 46 | arc.RLock() 47 | _, processed := arc.resourceMap[req.URL] 48 | arc.RUnlock() 49 | 50 | if processed { 51 | return nil 52 | } 53 | 54 | // Download page if needed 55 | if req.Reader == nil || req.ContentType == "" { 56 | arc.logInfo("Downloading %s\n", req.URL) 57 | 58 | resp, err := arc.downloadPage(req.URL) 59 | if err != nil { 60 | return fmt.Errorf("failed to download %s: %v", req.URL, err) 61 | } 62 | defer resp.Body.Close() 63 | 64 | req.Reader = resp.Body 65 | req.ContentType = resp.Header.Get("Content-Type") 66 | } 67 | 68 | // Process input 69 | var err error 70 | resource := processor.Resource{} 71 | subResources := []processor.Resource{} 72 | processorRequest := processor.Request{ 73 | Reader: req.Reader, 74 | URL: req.URL, 75 | } 76 | 77 | switch { 78 | case strings.Contains(req.ContentType, "text/html"): 79 | resource, subResources, err = processor.ProcessHTMLFile(processorRequest) 80 | if !root && !resource.IsEmbed { 81 | subResources = []processor.Resource{} 82 | } 83 | case strings.Contains(req.ContentType, "text/css") && !root: 84 | resource, subResources, err = processor.ProcessCSSFile(processorRequest) 85 | default: 86 | resource, err = processor.ProcessGeneralFile(processorRequest) 87 | } 88 | 89 | if err != nil { 90 | return fmt.Errorf("failed to archive %s: %v", req.URL, err) 91 | } 92 | 93 | // Save resource to storage 94 | if root { 95 | resource.Name = "archive-root" 96 | } 97 | 98 | err = arc.saveResource(resource, 
req.ContentType) 99 | if err != nil { 100 | return fmt.Errorf("failed to save %s: %v", req.URL, err) 101 | } 102 | 103 | // Save this resource to map 104 | arc.Lock() 105 | arc.resourceMap[req.URL] = struct{}{} 106 | arc.Unlock() 107 | 108 | arc.logInfo("Saved %s (%d)\n", resource.URL, len(resource.Content)) 109 | 110 | // Archive the sub resources 111 | wg := sync.WaitGroup{} 112 | wg.Add(len(subResources)) 113 | 114 | semaphore := make(chan struct{}, 5) 115 | defer close(semaphore) 116 | 117 | for _, subResource := range subResources { 118 | go func(subResource processor.Resource) { 119 | // Make sure to finish the WG 120 | defer wg.Done() 121 | 122 | // Register goroutine to semaphore 123 | semaphore <- struct{}{} 124 | defer func() { 125 | <-semaphore 126 | }() 127 | 128 | // Archive the sub resource 129 | var subResContent io.Reader 130 | if len(subResource.Content) > 0 { 131 | subResContent = bytes.NewBuffer(subResource.Content) 132 | } 133 | 134 | subResRequest := Request{ 135 | Reader: subResContent, 136 | URL: subResource.URL, 137 | } 138 | 139 | err := arc.archive(subResRequest, false) 140 | if err != nil { 141 | arc.logWarning("Failed to save %s: %v\n", subResource.URL, err) 142 | } 143 | }(subResource) 144 | } 145 | 146 | wg.Wait() 147 | 148 | return nil 149 | } 150 | 151 | // DownloadData downloads data from the specified URL. 
152 | func (arc *Archiver) downloadPage(url string) (*http.Response, error) { 153 | // Prepare request 154 | req, err := http.NewRequest("GET", url, nil) 155 | if err != nil { 156 | return nil, err 157 | } 158 | 159 | // Send request 160 | req.Header.Set("User-Agent", arc.UserAgent) 161 | return httpClient.Do(req) 162 | } 163 | 164 | func (arc *Archiver) saveResource(resource processor.Resource, contentType string) error { 165 | // Compress content 166 | buffer := bytes.NewBuffer(nil) 167 | gzipper := gzip.NewWriter(buffer) 168 | 169 | _, err := gzipper.Write(resource.Content) 170 | if err != nil { 171 | return fmt.Errorf("compress failed: %v", err) 172 | } 173 | 174 | err = gzipper.Close() 175 | if err != nil { 176 | return fmt.Errorf("compress failed: %v", err) 177 | } 178 | 179 | err = arc.DB.Batch(func(tx *bbolt.Tx) error { 180 | bucket := tx.Bucket([]byte(resource.Name)) 181 | if bucket != nil { 182 | return nil 183 | } 184 | 185 | bucket, err := tx.CreateBucketIfNotExists([]byte(resource.Name)) 186 | if err != nil { 187 | return err 188 | } 189 | 190 | err = bucket.Put([]byte("content"), buffer.Bytes()) 191 | if err != nil { 192 | return err 193 | } 194 | 195 | err = bucket.Put([]byte("type"), []byte(contentType)) 196 | if err != nil { 197 | return err 198 | } 199 | 200 | return nil 201 | }) 202 | 203 | return err 204 | } 205 | 206 | func (arc *Archiver) logInfo(format string, args ...interface{}) { 207 | if arc.LogEnabled { 208 | logrus.Infof(format, args...) 209 | } 210 | } 211 | 212 | func (arc *Archiver) logWarning(format string, args ...interface{}) { 213 | if arc.LogEnabled { 214 | logrus.Warnf(format, args...) 
215 | } 216 | } 217 | -------------------------------------------------------------------------------- /internal/archiver/http-client.go: -------------------------------------------------------------------------------- 1 | package archiver 2 | 3 | import ( 4 | "crypto/tls" 5 | "net/http" 6 | "net/http/cookiejar" 7 | "time" 8 | ) 9 | 10 | var httpClient *http.Client 11 | 12 | func init() { 13 | jar, _ := cookiejar.New(nil) 14 | httpClient = &http.Client{ 15 | Timeout: time.Minute, 16 | Transport: &http.Transport{ 17 | TLSClientConfig: &tls.Config{ 18 | InsecureSkipVerify: true, 19 | }, 20 | }, 21 | Jar: jar, 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /internal/processor/css.go: -------------------------------------------------------------------------------- 1 | package processor 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "io" 7 | nurl "net/url" 8 | "regexp" 9 | "strings" 10 | 11 | "github.com/tdewolff/parse/css" 12 | ) 13 | 14 | var ( 15 | rxStyleURL = regexp.MustCompile(`(?i)^url\((.+)\)$`) 16 | ) 17 | 18 | // ProcessCSSFile process CSS file. 19 | func ProcessCSSFile(req Request) (Resource, []Resource, error) { 20 | // Parse URL, then use it to extract CSS rules 21 | parsedURL, err := nurl.ParseRequestURI(req.URL) 22 | if err != nil || parsedURL.Scheme == "" || parsedURL.Hostname() == "" { 23 | return Resource{}, nil, fmt.Errorf("url %s is not valid", req.URL) 24 | } 25 | 26 | cssRules, subResources := processCSS(req.Reader, parsedURL) 27 | resource, err := createResource([]byte(cssRules), req.URL, nil) 28 | 29 | return resource, subResources, err 30 | } 31 | 32 | // processCSSRules extract resource URLs from the specified CSS input. 33 | // Returns the new rules with all CSS URLs updated to the archival link. 
34 | func processCSS(input io.Reader, baseURL *nurl.URL) (string, []Resource) { 35 | // Prepare buffers 36 | buffer := bytes.NewBuffer(nil) 37 | 38 | // Scan CSS file and process the resource's URL 39 | lexer := css.NewLexer(input) 40 | subResources := []Resource{} 41 | 42 | for { 43 | token, bt := lexer.Next() 44 | 45 | // Check for error 46 | if token == css.ErrorToken { 47 | break 48 | } 49 | 50 | // If it's not an URL, just write it to buffer as it is 51 | if token != css.URLToken { 52 | buffer.Write(bt) 53 | continue 54 | } 55 | 56 | // Sanitize the URL by removing `url()`, quotation mark and trailing slash 57 | cssURL := string(bt) 58 | cssURL = rxStyleURL.ReplaceAllString(cssURL, "$1") 59 | cssURL = strings.TrimSpace(cssURL) 60 | cssURL = strings.Trim(cssURL, `'`) 61 | cssURL = strings.Trim(cssURL, `"`) 62 | 63 | // Create subresource from CSS URL 64 | subResource, err := createResource(nil, cssURL, baseURL) 65 | if err != nil { 66 | buffer.Write(bt) 67 | continue 68 | } 69 | 70 | // Write resource name instead of CSS URL 71 | buffer.WriteString(`url("` + subResource.Name + `")`) 72 | 73 | // Save sub resource 74 | subResources = append(subResources, subResource) 75 | } 76 | 77 | // Return the new rule after all URL has been processed 78 | return buffer.String(), subResources 79 | } 80 | -------------------------------------------------------------------------------- /internal/processor/general.go: -------------------------------------------------------------------------------- 1 | package processor 2 | 3 | import ( 4 | "io/ioutil" 5 | ) 6 | 7 | // ProcessGeneralFile process files that not HTML, JS or CSS. 
8 | func ProcessGeneralFile(req Request) (Resource, error) { 9 | // Read content from request input 10 | content, err := ioutil.ReadAll(req.Reader) 11 | if err != nil { 12 | return Resource{}, err 13 | } 14 | 15 | return createResource(content, req.URL, nil) 16 | } 17 | -------------------------------------------------------------------------------- /internal/processor/html.go: -------------------------------------------------------------------------------- 1 | package processor 2 | 3 | import ( 4 | "fmt" 5 | nurl "net/url" 6 | "regexp" 7 | "strings" 8 | 9 | "github.com/go-shiori/dom" 10 | "golang.org/x/net/html" 11 | ) 12 | 13 | var ( 14 | rxLazyImageSrcset = regexp.MustCompile(`(?i)\.(jpg|jpeg|png|webp)\s+\d`) 15 | rxLazyImageSrc = regexp.MustCompile(`(?i)^\s*\S+\.(jpg|jpeg|png|webp)\S*\s*$`) 16 | rxImageMeta = regexp.MustCompile(`(?i)image|thumbnail`) 17 | ) 18 | 19 | // ProcessHTMLFile process HTML file. 20 | func ProcessHTMLFile(req Request) (Resource, []Resource, error) { 21 | // Parse URL 22 | pageURL, err := nurl.ParseRequestURI(req.URL) 23 | if err != nil || pageURL.Scheme == "" || pageURL.Hostname() == "" { 24 | return Resource{}, nil, fmt.Errorf("url %s is not valid", req.URL) 25 | } 26 | 27 | // Parse HTML document 28 | doc, err := html.Parse(req.Reader) 29 | if err != nil { 30 | return Resource{}, nil, fmt.Errorf("failed to parse HTML for %s: %v", req.URL, err) 31 | } 32 | 33 | // TODO: I'm still not really sure, but IMHO it's safer to 34 | // disable Javascript. Ideally, we only want to remove XHR request 35 | // using disableXHR(). Unfortunately, the result is not that good for now. 
36 | dom.RemoveNodes(dom.GetElementsByTagName(doc, "script"), nil) 37 | 38 | // Convert lazy loaded image to normal 39 | fixLazyImages(doc) 40 | 41 | // Convert hyperlinks with relative URL 42 | fixRelativeURIs(doc, pageURL) 43 | 44 | // Extract subresources from each nodes 45 | subResources := []Resource{} 46 | for _, node := range dom.GetElementsByTagName(doc, "*") { 47 | // First extract resources from inline style 48 | cssResources := processInlineCSS(node, pageURL) 49 | subResources = append(subResources, cssResources...) 50 | 51 | // Next extract resources from tag's specific attribute 52 | nodeResources := []Resource{} 53 | switch dom.TagName(node) { 54 | case "style": 55 | nodeResources = processStyleTag(node, pageURL) 56 | case "script": 57 | nodeResources = processScriptTag(node, pageURL) 58 | case "meta": 59 | nodeResources = processMetaTag(node, pageURL) 60 | case "img", "picture", "figure", "video", "audio", "source": 61 | nodeResources = processMediaTag(node, pageURL) 62 | case "link": 63 | nodeResources = processGenericTag(node, "href", pageURL) 64 | case "iframe": 65 | nodeResources = processGenericTag(node, "src", pageURL) 66 | case "object": 67 | nodeResources = processGenericTag(node, "data", pageURL) 68 | default: 69 | continue 70 | } 71 | subResources = append(subResources, nodeResources...) 
72 | } 73 | 74 | // Return outer HTML of the doc 75 | outerHTML := dom.OuterHTML(doc) 76 | resource, err := createResource([]byte(outerHTML), req.URL, nil) 77 | 78 | return resource, subResources, err 79 | } 80 | 81 | func disableXHR(doc *html.Node) { 82 | var head *html.Node 83 | heads := dom.GetElementsByTagName(doc, "head") 84 | if len(heads) > 0 { 85 | head = heads[0] 86 | } else { 87 | head = dom.CreateElement("head") 88 | dom.PrependChild(doc, head) 89 | } 90 | 91 | xhrDisabler := ` 92 | fetch = new Promise(); 93 | 94 | XMLHttpRequest = function() {}; 95 | XMLHttpRequest.prototype = { 96 | open: function(){}, 97 | send: function(){}, 98 | abort: function(){}, 99 | setRequestHeader: function(){}, 100 | overrideMimeType: function(){}, 101 | getResponseHeaders(): function(){}, 102 | getAllResponseHeaders(): function(){}, 103 | };` 104 | 105 | script := dom.CreateElement("script") 106 | scriptContent := dom.CreateTextNode(xhrDisabler) 107 | dom.PrependChild(script, scriptContent) 108 | dom.PrependChild(head, script) 109 | } 110 | 111 | // fixRelativeURIs converts each in the given element 112 | // to an absolute URI, ignoring #ref URIs. 113 | func fixRelativeURIs(doc *html.Node, pageURL *nurl.URL) { 114 | links := dom.GetAllNodesWithTag(doc, "a") 115 | dom.ForEachNode(links, func(link *html.Node, _ int) { 116 | href := dom.GetAttribute(link, "href") 117 | if href == "" { 118 | return 119 | } 120 | 121 | // Replace links with javascript: URIs with text content, 122 | // since they won't work after scripts have been removed 123 | // from the page. 
124 | if strings.HasPrefix(href, "javascript:") { 125 | text := dom.CreateTextNode(dom.TextContent(link)) 126 | dom.ReplaceChild(link.Parent, text, link) 127 | } else { 128 | newHref := createAbsoluteURL(href, pageURL) 129 | if newHref == "" { 130 | dom.RemoveAttribute(link, "href") 131 | } else { 132 | dom.SetAttribute(link, "href", newHref) 133 | } 134 | } 135 | }) 136 | } 137 | 138 | // fixLazyImages convert images and figures that have properties like 139 | // data-src into images that can be loaded without JS. 140 | func fixLazyImages(root *html.Node) { 141 | imageNodes := dom.GetAllNodesWithTag(root, "img", "picture", "figure") 142 | dom.ForEachNode(imageNodes, func(elem *html.Node, _ int) { 143 | src := dom.GetAttribute(elem, "src") 144 | srcset := dom.GetAttribute(elem, "srcset") 145 | nodeTag := dom.TagName(elem) 146 | nodeClass := dom.ClassName(elem) 147 | 148 | if (src == "" && srcset == "") || strings.Contains(strings.ToLower(nodeClass), "lazy") { 149 | for i := 0; i < len(elem.Attr); i++ { 150 | attr := elem.Attr[i] 151 | if attr.Key == "src" || attr.Key == "srcset" { 152 | continue 153 | } 154 | 155 | copyTo := "" 156 | if rxLazyImageSrcset.MatchString(attr.Val) { 157 | copyTo = "srcset" 158 | } else if rxLazyImageSrc.MatchString(attr.Val) { 159 | copyTo = "src" 160 | } 161 | 162 | if copyTo == "" { 163 | continue 164 | } 165 | 166 | if nodeTag == "img" || nodeTag == "picture" { 167 | // if this is an img or picture, set the attribute directly 168 | dom.SetAttribute(elem, copyTo, attr.Val) 169 | } else if nodeTag == "figure" && len(dom.GetAllNodesWithTag(elem, "img", "picture")) == 0 { 170 | // if the item is a
that does not contain an image or picture, 171 | // create one and place it inside the figure see the nytimes-3 172 | // testcase for an example 173 | img := dom.CreateElement("img") 174 | dom.SetAttribute(img, copyTo, attr.Val) 175 | dom.AppendChild(elem, img) 176 | } 177 | } 178 | } 179 | }) 180 | } 181 | 182 | // processInlineCSS extract subresources from the CSS rules inside 183 | // style attribute. Once finished, all CSS URLs in the style attribute 184 | // will be updated to use the resource name. 185 | func processInlineCSS(node *html.Node, pageURL *nurl.URL) []Resource { 186 | // Make sure this node has inline style 187 | styleAttr := dom.GetAttribute(node, "style") 188 | styleAttr = strings.TrimSpace(styleAttr) 189 | if styleAttr == "" { 190 | return nil 191 | } 192 | 193 | // Extract resource URLs from the inline style 194 | // and update the CSS rules accordingly. 195 | reader := strings.NewReader(styleAttr) 196 | newStyleAttr, subResources := processCSS(reader, pageURL) 197 | dom.SetAttribute(node, "style", newStyleAttr) 198 | 199 | return subResources 200 | } 201 | 202 | // processStyleTag extract subresources from inside a