├── .gitignore ├── LICENSE ├── README.md ├── examples └── simple.go └── htmlinfo └── htmlinfo.go /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files, Static and Dynamic libs (Shared Objects) 2 | *.o 3 | *.a 4 | *.so 5 | 6 | # Folders 7 | _obj 8 | _test 9 | 10 | # Architecture specific extensions/prefixes 11 | *.[568vq] 12 | [568vq].out 13 | 14 | *.cgo1.go 15 | *.cgo2.c 16 | _cgo_defun.c 17 | _cgo_gotypes.go 18 | _cgo_export.* 19 | 20 | _testmain.go 21 | 22 | *.exe 23 | *.test 24 | *.prof 25 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Vitaly Dyatlov 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Go HTML Info 2 | === 3 | 4 | Go HTML Info provides a simple interface to extract meaningful information from an html page. 5 | 6 | source docs: http://godoc.org/github.com/dyatlov/go-htmlinfo/htmlinfo 7 | 8 | Install: `go get github.com/dyatlov/go-htmlinfo/htmlinfo` 9 | 10 | Use: `import "github.com/dyatlov/go-htmlinfo/htmlinfo"` 11 | 12 | `Parse` method parses all html content into structured data. 13 | `GenerateOembedFor` method generates oembed info from that structured data. It generates that info based on available data, even if no oembed information is found on the page. 14 | 15 | Example: 16 | 17 | ```go 18 | package main 19 | 20 | import ( 21 | "fmt" 22 | "net/http" 23 | 24 | "github.com/dyatlov/go-htmlinfo/htmlinfo" 25 | ) 26 | 27 | func main() { 28 | u := "http://techcrunch.com/2015/09/09/ipad-pro-coming-in-november-pricing-starts-at-799/" 29 | 30 | resp, err := http.Get(u) 31 | 32 | if err != nil { 33 | panic(err) 34 | } 35 | 36 | defer resp.Body.Close() 37 | 38 | info := htmlinfo.NewHTMLInfo() 39 | 40 | // url can be nil too, but then we won't be able to fetch (and generate) oembed information 41 | err = info.Parse(resp.Body, &u, nil) 42 | 43 | if err != nil { 44 | panic(err) 45 | } 46 | 47 | fmt.Printf("Info:\n%s\n", info) 48 | 49 | fmt.Printf("Oembed information: %s\n", info.GenerateOembedFor(u)) 50 | } 51 | ``` 52 | 53 | Result would be: 54 | 55 | _Info:_ 56 | ```javascript 57 | { 58 | "title": "iPad Pro Coming In November, Pricing Starts At $799 | TechCrunch", 59 | "description": "Apple unveiled its new iPad Pro today. 
If you're wondering when you can get your hands on it, and how much it will cost, here you go: Apple says the iPad Pro..", 60 | "author_name": "Anthony Ha", 61 | "canonical_url": "http://techcrunch.com/2015/09/09/ipad-pro-coming-in-november-pricing-starts-at-799/", 62 | "oembed_json_url": "https://public-api.wordpress.com/oembed/1.0/?format=json\u0026url=http%3A%2F%2Ftechcrunch.com%2F2015%2F09%2F09%2Fipad-pro-coming-in-november-pricing-starts-at-799%2F\u0026for=wpcom-auto-discovery", 63 | "oembed_xml_url": "https://public-api.wordpress.com/oembed/1.0/?format=xml\u0026url=http%3A%2F%2Ftechcrunch.com%2F2015%2F09%2F09%2Fipad-pro-coming-in-november-pricing-starts-at-799%2F\u0026for=wpcom-auto-discovery", 64 | "favicon_url": "https://s0.wp.com/wp-content/themes/vip/techcrunch-2013/assets/images/favicon.ico", 65 | "touch_icons": [{ 66 | { url: 'https://s0.wp.com/wp-content/themes/vip/techcrunch-2013/assets/images/favicon.ico', 67 | type: 'icon', 68 | width: 0, 69 | height: 0, 70 | is_scalable: false }, 71 | { url: 'https://s0.wp.com/wp-content/themes/vip/techcrunch-2013/assets/images/homescreen_TCIcon.png', 72 | type: 'apple-touch-icon-precomposed', 73 | width: 0, 74 | height: 0, 75 | is_scalable: false }, 76 | // ... 77 | ], 78 | "image_src_url": "", 79 | "main_content": "Apple unveiled its new iPad Pro today. If you’re wondering when you can get your hands on it, and how much it will cost, here you go: Apple says the iPad Pro and related accessories will be available in November.\nPricing will start at $799 with 32 gigabytes of memory and WiFi-only connectivity, with a $949 price tag for 128 GB, and $1,079 for 128 GB and a cellular connection. If you want the company’s new stylus, dubbed the Apple Pencil, that’ll cost you $99, and the Smart Keyboard will cost $169.\nThat may seem pretty pricey compared to other iPads — in fact, Apple said today that it’s dropping pricing on its iPad Mini 2, which its considers to be the entry-level iPad, to $269. 
What you’re paying for (among other things) is a 12.9-inch screen with resolution of 2,732 x 2,048 pixels, Apple’s A9X chip and four speakers.\nAnd, as the name and on-stage demos suggest, Apple seems to be pitching this for enterprise use and productivity, not for casual use.\n\t\t\t\n\t\t\t\t\n\t\t\t\tSay Hello To The Brand-New iPad Pro\n\t\t\t\n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t", 80 | "opengraph": { 81 | "type": "article", 82 | "url": "http://social.techcrunch.com/2015/09/09/ipad-pro-coming-in-november-pricing-starts-at-799/", 83 | "title": "iPad Pro Coming In November, Pricing Starts At $799", 84 | "description": "Apple unveiled its new iPad Pro today. If you're wondering when you can get your hands on it, and how much it will cost, here you go: Apple says the iPad Pro..", 85 | "determiner": "", 86 | "site_name": "TechCrunch", 87 | "locale": "", 88 | "locales_alternate": null, 89 | "images": [{ 90 | "url": "https://tctechcrunch2011.files.wordpress.com/2015/09/screen-shot-2015-09-09-at-1-49-10-pm.png?w=560\u0026h=292\u0026crop=1", 91 | "secure_url": "", 92 | "type": "", 93 | "width": 0, 94 | "height": 0 95 | }], 96 | "audios": null, 97 | "videos": null, 98 | "article": { 99 | "published_time": null, 100 | "modified_time": null, 101 | "expiration_time": null, 102 | "section": "", 103 | "tags": null, 104 | "authors": null 105 | } 106 | }, 107 | "oembed": { 108 | "type": "link", 109 | "url": "http://techcrunch.com/2015/09/09/ipad-pro-coming-in-november-pricing-starts-at-799/", 110 | "provider_url": "http://techcrunch.com", 111 | "provider_name": "TechCrunch", 112 | "title": "iPad Pro Coming In November, Pricing Starts At\u0026nbsp;$799", 113 | "description": "", 114 | "width": 0, 115 | "height": 0, 116 | "thumbnail_url": "https://i1.wp.com/tctechcrunch2011.files.wordpress.com/2015/09/screen-shot-2015-09-09-at-1-49-10-pm.png?fit=440%2C330", 117 | "thumbnail_width": 440, 118 | "thumbnail_height": 218, 119 | "author_name": "\u003ca href=\"/author/anthony-ha/\" 
title=\"Posts by Anthony Ha\" onclick=\"s_objectID='river_author';\" rel=\"author\"\u003eAnthony Ha\u003c/a\u003e", 120 | "author_url": "/author/anthony-ha/", 121 | "html": "Apple \u003ca href=\"http://techcrunch.com/2015/09/09/apple-unveils-the-ipad-pro/\"\u003eunveiled its new iPad Pro today\u003c/a\u003e. If you're wondering when you can get your hands on it, and how much it will cost, here you go: Apple says the iPad Pro and related accessories will be available in November.\r\n\r\nPricing will start at $799 with 32 gigabytes of memory and WiFi-only connectivity, with a $949 price tag for 128 GB, and $1,079 for 128 GB and a cellular connection. If you want the company's new stylus, \u003ca href=\"http://techcrunch.com/2015/09/09/the-apple-pencil-is-the-ipad-pros-secret-weapon/#.91issd:LNXD\"\u003edubbed the Apple Pencil\u003c/a\u003e, that'll cost you $99, and the Smart Keyboard will cost $169.\r\n" 122 | } 123 | } 124 | ``` 125 | 126 | _Oembed information:_ 127 | ```javascript 128 | { 129 | "type": "link", 130 | "url": "http://techcrunch.com/2015/09/09/ipad-pro-coming-in-november-pricing-starts-at-799/", 131 | "provider_url": "http://techcrunch.com", 132 | "provider_name": "TechCrunch", 133 | "title": "iPad Pro Coming In November, Pricing Starts At\u0026nbsp;$799", 134 | "description": "Apple unveiled its new iPad Pro today. 
If you're wondering when you can get your hands on it, and how much it will cost, here you go: Apple says the iPad Pro..", 135 | "width": 0, 136 | "height": 0, 137 | "thumbnail_url": "https://i1.wp.com/tctechcrunch2011.files.wordpress.com/2015/09/screen-shot-2015-09-09-at-1-49-10-pm.png?fit=440%2C330", 138 | "thumbnail_width": 440, 139 | "thumbnail_height": 218, 140 | "author_name": "\u003ca href=\"/author/anthony-ha/\" title=\"Posts by Anthony Ha\" onclick=\"s_objectID='river_author';\" rel=\"author\"\u003eAnthony Ha\u003c/a\u003e", 141 | "author_url": "/author/anthony-ha/", 142 | "html": "Apple \u003ca href=\"http://techcrunch.com/2015/09/09/apple-unveils-the-ipad-pro/\"\u003eunveiled its new iPad Pro today\u003c/a\u003e. If you're wondering when you can get your hands on it, and how much it will cost, here you go: Apple says the iPad Pro and related accessories will be available in November.\r\n\r\nPricing will start at $799 with 32 gigabytes of memory and WiFi-only connectivity, with a $949 price tag for 128 GB, and $1,079 for 128 GB and a cellular connection. 
If you want the company's new stylus, \u003ca href=\"http://techcrunch.com/2015/09/09/the-apple-pencil-is-the-ipad-pros-secret-weapon/#.91issd:LNXD\"\u003edubbed the Apple Pencil\u003c/a\u003e, that'll cost you $99, and the Smart Keyboard will cost $169.\r\n" 143 | } 144 | ``` 145 | -------------------------------------------------------------------------------- /examples/simple.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "net/http" 6 | 7 | "github.com/dyatlov/go-htmlinfo/htmlinfo" 8 | ) 9 | 10 | func main() { 11 | u := "http://techcrunch.com/2010/11/02/365-days-10-million-3-rounds-2-companies-all-with-5-magic-slides/" 12 | 13 | resp, err := http.Get(u) 14 | 15 | if err != nil { 16 | panic(err) 17 | } 18 | 19 | defer resp.Body.Close() 20 | 21 | info := htmlinfo.NewHTMLInfo() 22 | info.AllowOembedFetching = true 23 | 24 | ct := resp.Header.Get("Content-Type") 25 | 26 | // if url and contentType are not provided it's fine too, just then we wont be able to fetch (and generate) oembed information 27 | err = info.Parse(resp.Body, &u, &ct) 28 | 29 | if err != nil { 30 | panic(err) 31 | } 32 | 33 | fmt.Printf("Info:\n%s\n", info) 34 | 35 | fmt.Printf("Oembed information: %s\n", info.GenerateOembedFor(u)) 36 | } 37 | -------------------------------------------------------------------------------- /htmlinfo/htmlinfo.go: -------------------------------------------------------------------------------- 1 | package htmlinfo 2 | 3 | import ( 4 | "bytes" 5 | "encoding/json" 6 | "io" 7 | "net/http" 8 | "net/url" 9 | "regexp" 10 | "strconv" 11 | "strings" 12 | 13 | "golang.org/x/net/html/charset" 14 | 15 | "github.com/dyatlov/go-oembed/oembed" 16 | "github.com/dyatlov/go-opengraph/opengraph" 17 | "golang.org/x/net/html" 18 | 19 | "github.com/dyatlov/go-readability" 20 | ) 21 | 22 | // TouchIcon contains all icons parsed from page header, including Apple touch icons 23 | type TouchIcon struct { 
24 | URL string `json:"url"` 25 | Type string `json:"type"` 26 | Width uint64 `json:"width"` 27 | Height uint64 `json:"height"` 28 | IsScalable bool `json:"is_scalable"` 29 | } 30 | 31 | // HTMLInfo contains information extracted from HTML page 32 | type HTMLInfo struct { 33 | url *url.URL 34 | // http.Client instance to use, if nil then will be used default client 35 | Client *http.Client `json:"-"` 36 | // If it's true then parser will fetch oembed data from oembed url if possible 37 | AllowOembedFetching bool `json:"-"` 38 | // If it's true parser will extract main page content from html 39 | AllowMainContentExtraction bool `json:"-"` 40 | // We'll forward it to Oembed' fetchOembed method 41 | AcceptLanguage string `json:"-"` 42 | 43 | Title string `json:"title"` 44 | Description string `json:"description"` 45 | AuthorName string `json:"author_name"` 46 | CanonicalURL string `json:"canonical_url"` 47 | OembedJSONURL string `json:"oembed_json_url"` 48 | OembedXMLURL string `json:"oembed_xml_url"` 49 | FaviconURL string `json:"favicon_url"` 50 | TouchIcons []*TouchIcon `json:"touch_icons"` 51 | ImageSrcURL string `json:"image_src_url"` 52 | // Readability package is being used inside 53 | MainContent string `json:"main_content"` 54 | OGInfo *opengraph.OpenGraph `json:"opengraph"` 55 | OembedInfo *oembed.Info `json:"oembed"` 56 | } 57 | 58 | var ( 59 | cleanHTMLTagsRegex = regexp.MustCompile(`<.*?>`) 60 | replaceNewLinesRegex = regexp.MustCompile(`[\r\n]+`) 61 | clearWhitespacesRegex = regexp.MustCompile(`\s+`) 62 | getImageRegex = regexp.MustCompile(`(?i)]+?src=("|')?(.*?)("|'|\s|>)`) 63 | linkWithIconsRegex = regexp.MustCompile(`\b(icon|image_src)\b`) 64 | sizesRegex = regexp.MustCompile(`(\d+)[^\d]+(\d+)`) // some websites use crazy unicode chars between height and width 65 | ) 66 | 67 | // NewHTMLInfo return new instance of HTMLInfo 68 | func NewHTMLInfo() *HTMLInfo { 69 | info := &HTMLInfo{AllowOembedFetching: true, AllowMainContentExtraction: true, OGInfo: 
opengraph.NewOpenGraph(), AcceptLanguage: "en-us"} 70 | return info 71 | } 72 | 73 | func (info *HTMLInfo) toAbsoluteURL(u string) string { 74 | if info.url == nil { 75 | return u 76 | } 77 | 78 | tu, _ := url.Parse(u) 79 | 80 | if tu != nil { 81 | if tu.Host == "" { 82 | tu.Scheme = info.url.Scheme 83 | tu.Host = info.url.Host 84 | tu.User = info.url.User 85 | tu.Opaque = info.url.Opaque 86 | if len(tu.Path) == 0 || tu.Path[0] != '/' { 87 | tu.Path = info.url.Path + tu.Path 88 | } 89 | } else if tu.Scheme == "" { 90 | tu.Scheme = info.url.Scheme 91 | } 92 | 93 | return tu.String() 94 | } 95 | 96 | return u 97 | } 98 | 99 | func (info *HTMLInfo) appendTouchIcons(url string, rel string, sizes []string) { 100 | for _, size := range sizes { 101 | icon := &TouchIcon{URL: url, Type: rel, IsScalable: (size == "any")} 102 | matches := sizesRegex.FindStringSubmatch(size) 103 | if len(matches) >= 3 { 104 | icon.Height, _ = strconv.ParseUint(matches[1], 10, 64) 105 | icon.Width, _ = strconv.ParseUint(matches[2], 10, 64) 106 | } 107 | info.TouchIcons = append(info.TouchIcons, icon) 108 | } 109 | } 110 | 111 | func (info *HTMLInfo) parseLinkIcon(attrs map[string]string) { 112 | rels := strings.Split(attrs["rel"], " ") 113 | url := info.toAbsoluteURL(attrs["href"]) 114 | sizesString, present := attrs["sizes"] 115 | if !present { 116 | sizesString = "0x0" 117 | } 118 | sizes := strings.Split(sizesString, " ") 119 | 120 | for _, rel := range rels { 121 | if rel == "image_src" { 122 | info.ImageSrcURL = url 123 | } else if rel == "icon" { 124 | info.FaviconURL = url 125 | info.appendTouchIcons(url, rel, sizes) 126 | } else if rel == "apple-touch-icon" || rel == "apple-touch-icon-precomposed" { 127 | info.appendTouchIcons(url, rel, sizes) 128 | } 129 | } 130 | } 131 | 132 | func (info *HTMLInfo) parseHead(n *html.Node) { 133 | for c := n.FirstChild; c != nil; c = c.NextSibling { 134 | if c.Type == html.ElementNode && c.Data == "title" { 135 | if c.FirstChild != nil { 136 | 
info.Title = c.FirstChild.Data 137 | } 138 | } else if c.Type == html.ElementNode && c.Data == "link" { 139 | m := make(map[string]string) 140 | for _, a := range c.Attr { 141 | m[a.Key] = a.Val 142 | } 143 | if m["rel"] == "canonical" { 144 | info.CanonicalURL = info.toAbsoluteURL(m["href"]) 145 | } else if m["rel"] == "alternate" && m["type"] == "application/json+oembed" { 146 | info.OembedJSONURL = info.toAbsoluteURL(m["href"]) 147 | } else if m["rel"] == "alternate" && m["type"] == "application/xml+oembed" { 148 | info.OembedXMLURL = info.toAbsoluteURL(m["href"]) 149 | } else if linkWithIconsRegex.MatchString(m["rel"]) { 150 | info.parseLinkIcon(m) 151 | } 152 | } else if c.Type == html.ElementNode && c.Data == "meta" { 153 | m := make(map[string]string) 154 | for _, a := range c.Attr { 155 | m[a.Key] = a.Val 156 | } 157 | 158 | if m["name"] == "description" { 159 | info.Description = m["content"] 160 | } else if m["name"] == "author" { 161 | info.AuthorName = m["content"] 162 | } 163 | 164 | info.OGInfo.ProcessMeta(m) 165 | } 166 | } 167 | } 168 | 169 | func (info *HTMLInfo) parseBody(n *html.Node) { 170 | if !info.AllowMainContentExtraction { 171 | return 172 | } 173 | 174 | buf := new(bytes.Buffer) 175 | err := html.Render(buf, n) 176 | if err != nil { 177 | return 178 | } 179 | bufStr := buf.String() 180 | doc, err := readability.NewDocument(bufStr) 181 | if err != nil { 182 | return 183 | } 184 | 185 | doc.WhitelistTags = []string{"div", "p", "img"} 186 | doc.WhitelistAttrs["img"] = []string{"src", "title", "alt"} 187 | 188 | content := doc.Content() 189 | content = html.UnescapeString(content) 190 | 191 | info.MainContent = strings.Trim(content, "\r\n\t ") 192 | } 193 | 194 | // Parse return information about page 195 | // @param s - contains page source 196 | // @params pageURL - contains URL from where the data was taken [optional] 197 | // @params contentType - contains Content-Type header value [optional] 198 | // if no url is given then parser won't 
attempt to parse oembed info 199 | func (info *HTMLInfo) Parse(s io.Reader, pageURL *string, contentType *string) error { 200 | contentTypeStr := "text/html" 201 | if contentType != nil && len(*contentType) > 0 { 202 | contentTypeStr = *contentType 203 | } 204 | utf8s, err := charset.NewReader(s, contentTypeStr) 205 | if err != nil { 206 | return err 207 | } 208 | 209 | if pageURL != nil { 210 | tu, _ := url.Parse(*pageURL) 211 | info.url = tu 212 | } 213 | 214 | doc, err := html.Parse(utf8s) 215 | if err != nil { 216 | return err 217 | } 218 | 219 | var f func(*html.Node) 220 | f = func(n *html.Node) { 221 | for c := n.FirstChild; c != nil; c = c.NextSibling { 222 | if c.Type == html.ElementNode { 223 | if c.Data == "head" { 224 | info.parseHead(c) 225 | continue 226 | } else if c.Data == "body" { 227 | info.parseBody(c) 228 | continue 229 | } 230 | } 231 | f(c) 232 | } 233 | } 234 | f(doc) 235 | 236 | if info.AllowOembedFetching && pageURL != nil && len(info.OembedJSONURL) > 0 { 237 | pu, _ := url.Parse(info.OembedJSONURL) 238 | siteName := info.OGInfo.SiteName 239 | siteURL := strings.ToLower(pu.Scheme) + "://" + pu.Host 240 | 241 | if len(siteName) == 0 { 242 | siteName = pu.Host 243 | } 244 | 245 | oiItem := &oembed.Item{EndpointURL: info.OembedJSONURL, ProviderName: siteName, ProviderURL: siteURL, IsEndpointURLComplete: true} 246 | oi, _ := oiItem.FetchOembed(oembed.Options{URL: *pageURL, Client: info.Client, AcceptLanguage: info.AcceptLanguage}) 247 | if oi != nil && oi.Status < 300 { 248 | info.OembedInfo = oi 249 | } 250 | } 251 | 252 | return nil 253 | } 254 | 255 | func (info *HTMLInfo) trimText(text string, maxLen int) string { 256 | var numRunes = 0 257 | runes := []rune(text) 258 | for index := range runes { 259 | numRunes++ 260 | if numRunes > maxLen { 261 | return string(runes[:index-3]) + "..." 
262 | } 263 | } 264 | return text 265 | } 266 | 267 | // GenerateOembedFor return Oembed Info for given url based on previously parsed data 268 | // The returned oembed data is also updated in info.OembedInfo 269 | // Example: 270 | // 271 | // info := NewHTMLInfo() 272 | // info.Parse(dataReader, &sourceURL) 273 | // oembed := info.GenerateOembedFor(sourceURL) 274 | func (info *HTMLInfo) GenerateOembedFor(pageURL string) *oembed.Info { 275 | pu, _ := url.Parse(pageURL) 276 | 277 | if pu == nil { 278 | return nil 279 | } 280 | 281 | siteName := info.OGInfo.SiteName 282 | siteURL := strings.ToLower(pu.Scheme) + "://" + pu.Host 283 | 284 | if len(siteName) == 0 { 285 | siteName = pu.Host 286 | } 287 | 288 | title := info.OGInfo.Title 289 | if len(title) == 0 { 290 | title = info.Title 291 | } 292 | 293 | description := info.OGInfo.Description 294 | if len(description) == 0 { 295 | description = info.Description 296 | if len(description) == 0 { 297 | if len(info.MainContent) > 0 { 298 | description = cleanHTMLTagsRegex.ReplaceAllString(info.MainContent, " ") 299 | description = replaceNewLinesRegex.ReplaceAllString(description, " ") 300 | description = clearWhitespacesRegex.ReplaceAllString(description, " ") 301 | description = strings.Trim(description, " ") 302 | description = info.trimText(description, 200) 303 | } 304 | } 305 | } 306 | 307 | baseInfo := &oembed.Info{} 308 | 309 | baseInfo.Type = "link" 310 | baseInfo.URL = pageURL 311 | baseInfo.ProviderURL = siteURL 312 | baseInfo.ProviderName = siteName 313 | baseInfo.Title = title 314 | baseInfo.Description = description 315 | 316 | if len(info.ImageSrcURL) > 0 { 317 | baseInfo.ThumbnailURL = info.toAbsoluteURL(info.ImageSrcURL) 318 | } 319 | 320 | if len(info.OGInfo.Images) > 0 { 321 | baseInfo.ThumbnailURL = info.toAbsoluteURL(info.OGInfo.Images[0].URL) 322 | baseInfo.ThumbnailWidth = info.OGInfo.Images[0].Width 323 | baseInfo.ThumbnailHeight = info.OGInfo.Images[0].Height 324 | } 325 | 326 | if 
len(baseInfo.ThumbnailURL) == 0 && len(info.MainContent) > 0 { 327 | // get first image from body 328 | matches := getImageRegex.FindStringSubmatch(info.MainContent) 329 | if len(matches) > 0 { 330 | baseInfo.ThumbnailURL = info.toAbsoluteURL(matches[2]) 331 | } 332 | } 333 | 334 | // first we check if there is link to oembed resource 335 | if info.OembedInfo != nil { 336 | info.OembedInfo.MergeWith(baseInfo) 337 | return info.OembedInfo 338 | } 339 | 340 | return baseInfo 341 | } 342 | 343 | // ToJSON return json represenation of structure, simple wrapper around json package 344 | func (info *HTMLInfo) ToJSON() ([]byte, error) { 345 | return json.Marshal(info) 346 | } 347 | 348 | func (info *HTMLInfo) String() string { 349 | data, err := info.ToJSON() 350 | if err != nil { 351 | return err.Error() 352 | } 353 | return string(data[:]) 354 | } 355 | --------------------------------------------------------------------------------