├── .gitignore
├── LICENSE
├── README.md
├── examples
    └── simple.go
└── htmlinfo
    └── htmlinfo.go


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Compiled Object files, Static and Dynamic libs (Shared Objects)
 2 | *.o
 3 | *.a
 4 | *.so
 5 | 
 6 | # Folders
 7 | _obj
 8 | _test
 9 | 
10 | # Architecture specific extensions/prefixes
11 | *.[568vq]
12 | [568vq].out
13 | 
14 | *.cgo1.go
15 | *.cgo2.c
16 | _cgo_defun.c
17 | _cgo_gotypes.go
18 | _cgo_export.*
19 | 
20 | _testmain.go
21 | 
22 | *.exe
23 | *.test
24 | *.prof
25 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2015 Vitaly Dyatlov
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
23 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | Go HTML Info
  2 | ===
  3 | 
  4 | Go HTML Info provides a simple interface to extract meaningful information from an html page.
  5 | 
  6 | source docs: http://godoc.org/github.com/dyatlov/go-htmlinfo/htmlinfo
  7 | 
  8 | Install: `go get github.com/dyatlov/go-htmlinfo/htmlinfo`
  9 | 
 10 | Use: `import "github.com/dyatlov/go-htmlinfo/htmlinfo"`
 11 | 
 12 | The `Parse` method parses the HTML content into structured data.
 13 | The `GenerateOembedFor` method generates oembed info from that structured data. It builds that info from whatever data is available, even if no oembed information is found on the page.
 14 | 
 15 | Example:
 16 | 
 17 | ```go
 18 | package main
 19 | 
 20 | import (
 21 | 	"fmt"
 22 | 	"net/http"
 23 | 
 24 | 	"github.com/dyatlov/go-htmlinfo/htmlinfo"
 25 | )
 26 | 
 27 | func main() {
 28 | 	u := "http://techcrunch.com/2015/09/09/ipad-pro-coming-in-november-pricing-starts-at-799/"
 29 | 
 30 | 	resp, err := http.Get(u)
 31 | 
 32 | 	if err != nil {
 33 | 		panic(err)
 34 | 	}
 35 | 
 36 | 	defer resp.Body.Close()
 37 | 
 38 | 	info := htmlinfo.NewHTMLInfo()
 39 | 
 40 | 	// the url can be nil too, but then we won't be able to fetch (and generate) oembed information
 41 | 	err = info.Parse(resp.Body, &u, nil)
 42 | 
 43 | 	if err != nil {
 44 | 		panic(err)
 45 | 	}
 46 | 
 47 | 	fmt.Printf("Info:\n%s\n", info)
 48 | 
 49 | 	fmt.Printf("Oembed information: %s\n", info.GenerateOembedFor(u))
 50 | }
 51 | ```
 52 | 
 53 | Result would be:
 54 | 
 55 | _Info:_
 56 | ```javascript
 57 | {
 58 |     "title": "iPad Pro Coming In November, Pricing Starts At $799  |  TechCrunch",
 59 |     "description": "Apple unveiled its new iPad Pro today. If you're wondering when you can get your hands on it, and how much it will cost, here you go: Apple says the iPad Pro..",
 60 |     "author_name": "Anthony Ha",
 61 |     "canonical_url": "http://techcrunch.com/2015/09/09/ipad-pro-coming-in-november-pricing-starts-at-799/",
 62 |     "oembed_json_url": "https://public-api.wordpress.com/oembed/1.0/?format=json\u0026url=http%3A%2F%2Ftechcrunch.com%2F2015%2F09%2F09%2Fipad-pro-coming-in-november-pricing-starts-at-799%2F\u0026for=wpcom-auto-discovery",
 63 |     "oembed_xml_url": "https://public-api.wordpress.com/oembed/1.0/?format=xml\u0026url=http%3A%2F%2Ftechcrunch.com%2F2015%2F09%2F09%2Fipad-pro-coming-in-november-pricing-starts-at-799%2F\u0026for=wpcom-auto-discovery",
 64 |     "favicon_url": "https://s0.wp.com/wp-content/themes/vip/techcrunch-2013/assets/images/favicon.ico",
 65 |     "touch_icons": [
 66 |       { url: 'https://s0.wp.com/wp-content/themes/vip/techcrunch-2013/assets/images/favicon.ico',
 67 |        type: 'icon',
 68 |        width: 0,
 69 |        height: 0,
 70 |        is_scalable: false },
 71 |      { url: 'https://s0.wp.com/wp-content/themes/vip/techcrunch-2013/assets/images/homescreen_TCIcon.png',
 72 |        type: 'apple-touch-icon-precomposed',
 73 |        width: 0,
 74 |        height: 0,
 75 |        is_scalable: false },
 76 |        // ...
 77 |     ],
 78 |     "image_src_url": "",
 79 |     "main_content": "Apple unveiled its new iPad Pro today. If you’re wondering when you can get your hands on it, and how much it will cost, here you go: Apple says the iPad Pro and related accessories will be available in November.\nPricing will start at $799 with 32 gigabytes of memory and WiFi-only connectivity, with a $949 price tag for 128 GB, and $1,079 for 128 GB and a cellular connection. If you want the company’s new stylus, dubbed the Apple Pencil, that’ll cost you $99, and the Smart Keyboard will cost $169.\nThat may seem pretty pricey compared to other iPads — in fact, Apple said today that it’s dropping pricing on its iPad Mini 2, which its considers to be the entry-level iPad, to $269. What you’re paying for (among other things) is a 12.9-inch screen with resolution of 2,732 x 2,048 pixels, Apple’s A9X chip and four speakers.\nAnd, as the name and on-stage demos suggest, Apple seems to be pitching this for enterprise use and productivity, not for casual use.\n\t\t\t\n\t\t\t\t\n\t\t\t\tSay Hello To The Brand-New iPad Pro\n\t\t\t\n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t",
 80 |     "opengraph": {
 81 |         "type": "article",
 82 |         "url": "http://social.techcrunch.com/2015/09/09/ipad-pro-coming-in-november-pricing-starts-at-799/",
 83 |         "title": "iPad Pro Coming In November, Pricing Starts At $799",
 84 |         "description": "Apple unveiled its new iPad Pro today. If you're wondering when you can get your hands on it, and how much it will cost, here you go: Apple says the iPad Pro..",
 85 |         "determiner": "",
 86 |         "site_name": "TechCrunch",
 87 |         "locale": "",
 88 |         "locales_alternate": null,
 89 |         "images": [{
 90 |             "url": "https://tctechcrunch2011.files.wordpress.com/2015/09/screen-shot-2015-09-09-at-1-49-10-pm.png?w=560\u0026h=292\u0026crop=1",
 91 |             "secure_url": "",
 92 |             "type": "",
 93 |             "width": 0,
 94 |             "height": 0
 95 |         }],
 96 |         "audios": null,
 97 |         "videos": null,
 98 |         "article": {
 99 |             "published_time": null,
100 |             "modified_time": null,
101 |             "expiration_time": null,
102 |             "section": "",
103 |             "tags": null,
104 |             "authors": null
105 |         }
106 |     },
107 |     "oembed": {
108 |         "type": "link",
109 |         "url": "http://techcrunch.com/2015/09/09/ipad-pro-coming-in-november-pricing-starts-at-799/",
110 |         "provider_url": "http://techcrunch.com",
111 |         "provider_name": "TechCrunch",
112 |         "title": "iPad Pro Coming In November, Pricing Starts At\u0026nbsp;$799",
113 |         "description": "",
114 |         "width": 0,
115 |         "height": 0,
116 |         "thumbnail_url": "https://i1.wp.com/tctechcrunch2011.files.wordpress.com/2015/09/screen-shot-2015-09-09-at-1-49-10-pm.png?fit=440%2C330",
117 |         "thumbnail_width": 440,
118 |         "thumbnail_height": 218,
119 |         "author_name": "\u003ca href=\"/author/anthony-ha/\" title=\"Posts by Anthony Ha\" onclick=\"s_objectID='river_author';\" rel=\"author\"\u003eAnthony Ha\u003c/a\u003e",
120 |         "author_url": "/author/anthony-ha/",
121 |         "html": "Apple \u003ca href=\"http://techcrunch.com/2015/09/09/apple-unveils-the-ipad-pro/\"\u003eunveiled its new iPad Pro today\u003c/a\u003e. If you're wondering when you can get your hands on it, and how much it will cost, here you go: Apple says the iPad Pro and related accessories will be available in November.\r\n\r\nPricing will start at $799 with 32 gigabytes of memory and WiFi-only connectivity, with a $949 price tag for 128 GB, and $1,079 for 128 GB and a cellular connection. If you want the company's new stylus, \u003ca href=\"http://techcrunch.com/2015/09/09/the-apple-pencil-is-the-ipad-pros-secret-weapon/#.91issd:LNXD\"\u003edubbed the Apple Pencil\u003c/a\u003e, that'll cost you $99, and the Smart Keyboard will cost $169.\r\n"
122 |     }
123 | }
124 | ```
125 | 
126 | _Oembed information:_
127 | ```javascript
128 | {
129 |     "type": "link",
130 |     "url": "http://techcrunch.com/2015/09/09/ipad-pro-coming-in-november-pricing-starts-at-799/",
131 |     "provider_url": "http://techcrunch.com",
132 |     "provider_name": "TechCrunch",
133 |     "title": "iPad Pro Coming In November, Pricing Starts At\u0026nbsp;$799",
134 |     "description": "Apple unveiled its new iPad Pro today. If you're wondering when you can get your hands on it, and how much it will cost, here you go: Apple says the iPad Pro..",
135 |     "width": 0,
136 |     "height": 0,
137 |     "thumbnail_url": "https://i1.wp.com/tctechcrunch2011.files.wordpress.com/2015/09/screen-shot-2015-09-09-at-1-49-10-pm.png?fit=440%2C330",
138 |     "thumbnail_width": 440,
139 |     "thumbnail_height": 218,
140 |     "author_name": "\u003ca href=\"/author/anthony-ha/\" title=\"Posts by Anthony Ha\" onclick=\"s_objectID='river_author';\" rel=\"author\"\u003eAnthony Ha\u003c/a\u003e",
141 |     "author_url": "/author/anthony-ha/",
142 |     "html": "Apple \u003ca href=\"http://techcrunch.com/2015/09/09/apple-unveils-the-ipad-pro/\"\u003eunveiled its new iPad Pro today\u003c/a\u003e. If you're wondering when you can get your hands on it, and how much it will cost, here you go: Apple says the iPad Pro and related accessories will be available in November.\r\n\r\nPricing will start at $799 with 32 gigabytes of memory and WiFi-only connectivity, with a $949 price tag for 128 GB, and $1,079 for 128 GB and a cellular connection. If you want the company's new stylus, \u003ca href=\"http://techcrunch.com/2015/09/09/the-apple-pencil-is-the-ipad-pros-secret-weapon/#.91issd:LNXD\"\u003edubbed the Apple Pencil\u003c/a\u003e, that'll cost you $99, and the Smart Keyboard will cost $169.\r\n"
143 | }
144 | ```
145 | 


--------------------------------------------------------------------------------
/examples/simple.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"net/http"
 6 | 
 7 | 	"github.com/dyatlov/go-htmlinfo/htmlinfo"
 8 | )
 9 | 
10 | func main() {
11 | 	u := "http://techcrunch.com/2010/11/02/365-days-10-million-3-rounds-2-companies-all-with-5-magic-slides/"
12 | 
13 | 	resp, err := http.Get(u)
14 | 
15 | 	if err != nil {
16 | 		panic(err)
17 | 	}
18 | 
19 | 	defer resp.Body.Close()
20 | 
21 | 	info := htmlinfo.NewHTMLInfo()
22 | 	info.AllowOembedFetching = true
23 | 
24 | 	ct := resp.Header.Get("Content-Type")
25 | 
26 | 	// if url and contentType are not provided it's fine too, just then we wont be able to fetch (and generate) oembed information
27 | 	err = info.Parse(resp.Body, &u, &ct)
28 | 
29 | 	if err != nil {
30 | 		panic(err)
31 | 	}
32 | 
33 | 	fmt.Printf("Info:\n%s\n", info)
34 | 
35 | 	fmt.Printf("Oembed information: %s\n", info.GenerateOembedFor(u))
36 | }
37 | 


--------------------------------------------------------------------------------
/htmlinfo/htmlinfo.go:
--------------------------------------------------------------------------------
  1 | package htmlinfo
  2 | 
  3 | import (
  4 | 	"bytes"
  5 | 	"encoding/json"
  6 | 	"io"
  7 | 	"net/http"
  8 | 	"net/url"
  9 | 	"regexp"
 10 | 	"strconv"
 11 | 	"strings"
 12 | 
 13 | 	"golang.org/x/net/html/charset"
 14 | 
 15 | 	"github.com/dyatlov/go-oembed/oembed"
 16 | 	"github.com/dyatlov/go-opengraph/opengraph"
 17 | 	"golang.org/x/net/html"
 18 | 
 19 | 	"github.com/dyatlov/go-readability"
 20 | )
 21 | 
// TouchIcon describes one icon declared by a <link> element in the page head:
// a link whose rel contains "icon", "apple-touch-icon" or
// "apple-touch-icon-precomposed". One TouchIcon is recorded per token of the
// link's sizes attribute.
type TouchIcon struct {
	// URL is the icon's href, made absolute when the page URL is known.
	URL        string `json:"url"`
	// Type is the rel token that declared the icon.
	Type       string `json:"type"`
	// Width and Height are the two numbers parsed from the sizes token; both
	// stay 0 when the token carries no dimensions.
	Width      uint64 `json:"width"`
	Height     uint64 `json:"height"`
	// IsScalable is set when the sizes token is "any".
	IsScalable bool   `json:"is_scalable"`
}
 30 | 
// HTMLInfo carries the parser settings together with the information that
// Parse extracts from an HTML page. Settings are excluded from the JSON
// representation; extracted data is included.
type HTMLInfo struct {
	// url is the parsed page URL handed to Parse. It is the base used to turn
	// relative links into absolute ones and is nil when no page URL is known.
	url *url.URL
	// http.Client instance passed on to the oembed fetcher; if nil the default
	// client is used.
	Client *http.Client `json:"-"`
	// If it's true then parser will fetch oembed data from oembed url if possible
	AllowOembedFetching bool `json:"-"`
	// If it's true parser will extract main page content from html
	AllowMainContentExtraction bool `json:"-"`
	// Accept-Language value forwarded to the oembed fetcher through
	// oembed.Options.
	AcceptLanguage string `json:"-"`

	// Title is the data of the first child node of the <title> element.
	Title         string       `json:"title"`
	// Description is the content of <meta name="description">.
	Description   string       `json:"description"`
	// AuthorName is the content of <meta name="author">.
	AuthorName    string       `json:"author_name"`
	// CanonicalURL is the href of <link rel="canonical">.
	CanonicalURL  string       `json:"canonical_url"`
	// OembedJSONURL and OembedXMLURL are the hrefs of the rel="alternate" links
	// typed application/json+oembed and application/xml+oembed respectively.
	OembedJSONURL string       `json:"oembed_json_url"`
	OembedXMLURL  string       `json:"oembed_xml_url"`
	// FaviconURL is the href of the most recently seen link with rel token "icon".
	FaviconURL    string       `json:"favicon_url"`
	// TouchIcons lists every icon / apple-touch-icon link found in the head.
	TouchIcons    []*TouchIcon `json:"touch_icons"`
	// ImageSrcURL is the href of the link with rel token "image_src".
	ImageSrcURL   string       `json:"image_src_url"`
	// Readability package is being used inside
	MainContent string               `json:"main_content"`
	// OGInfo receives the attributes of every <meta> element in the head.
	OGInfo      *opengraph.OpenGraph `json:"opengraph"`
	// OembedInfo is the fetched oembed response; it is only stored when the
	// fetch returned data with a status below 300.
	OembedInfo  *oembed.Info         `json:"oembed"`
}
 57 | 
// Regular expressions shared by the parser; compiled once at package load.
var (
	// cleanHTMLTagsRegex matches a single tag-like run from "<" to the nearest ">".
	cleanHTMLTagsRegex    = regexp.MustCompile(`<.*?>`)
	// replaceNewLinesRegex matches one or more consecutive CR/LF characters.
	replaceNewLinesRegex  = regexp.MustCompile(`[\r\n]+`)
	// clearWhitespacesRegex matches any run of whitespace.
	clearWhitespacesRegex = regexp.MustCompile(`\s+`)
	// getImageRegex finds an <img> tag (case-insensitive); capture group 2 holds
	// the value of its src attribute.
	getImageRegex         = regexp.MustCompile(`(?i)<img[^>]+?src=("|')?(.*?)("|'|\s|>)`)
	// linkWithIconsRegex matches rel values containing the word "icon" or
	// "image_src"; because "-" is a word boundary it also matches
	// "apple-touch-icon".
	linkWithIconsRegex    = regexp.MustCompile(`\b(icon|image_src)\b`)
	// sizesRegex captures the two numbers of a sizes token such as "16x16"; the
	// separator may be any run of non-digits.
	sizesRegex            = regexp.MustCompile(`(\d+)[^\d]+(\d+)`) // some websites use crazy unicode chars between the two dimensions
)
 66 | 
 67 | // NewHTMLInfo return new instance of HTMLInfo
 68 | func NewHTMLInfo() *HTMLInfo {
 69 | 	info := &HTMLInfo{AllowOembedFetching: true, AllowMainContentExtraction: true, OGInfo: opengraph.NewOpenGraph(), AcceptLanguage: "en-us"}
 70 | 	return info
 71 | }
 72 | 
 73 | func (info *HTMLInfo) toAbsoluteURL(u string) string {
 74 | 	if info.url == nil {
 75 | 		return u
 76 | 	}
 77 | 
 78 | 	tu, _ := url.Parse(u)
 79 | 
 80 | 	if tu != nil {
 81 | 		if tu.Host == "" {
 82 | 			tu.Scheme = info.url.Scheme
 83 | 			tu.Host = info.url.Host
 84 | 			tu.User = info.url.User
 85 | 			tu.Opaque = info.url.Opaque
 86 | 			if len(tu.Path) == 0 || tu.Path[0] != '/' {
 87 | 				tu.Path = info.url.Path + tu.Path
 88 | 			}
 89 | 		} else if tu.Scheme == "" {
 90 | 			tu.Scheme = info.url.Scheme
 91 | 		}
 92 | 
 93 | 		return tu.String()
 94 | 	}
 95 | 
 96 | 	return u
 97 | }
 98 | 
 99 | func (info *HTMLInfo) appendTouchIcons(url string, rel string, sizes []string) {
100 | 	for _, size := range sizes {
101 | 		icon := &TouchIcon{URL: url, Type: rel, IsScalable: (size == "any")}
102 | 		matches := sizesRegex.FindStringSubmatch(size)
103 | 		if len(matches) >= 3 {
104 | 			icon.Height, _ = strconv.ParseUint(matches[1], 10, 64)
105 | 			icon.Width, _ = strconv.ParseUint(matches[2], 10, 64)
106 | 		}
107 | 		info.TouchIcons = append(info.TouchIcons, icon)
108 | 	}
109 | }
110 | 
111 | func (info *HTMLInfo) parseLinkIcon(attrs map[string]string) {
112 | 	rels := strings.Split(attrs["rel"], " ")
113 | 	url := info.toAbsoluteURL(attrs["href"])
114 | 	sizesString, present := attrs["sizes"]
115 | 	if !present {
116 | 		sizesString = "0x0"
117 | 	}
118 | 	sizes := strings.Split(sizesString, " ")
119 | 
120 | 	for _, rel := range rels {
121 | 		if rel == "image_src" {
122 | 			info.ImageSrcURL = url
123 | 		} else if rel == "icon" {
124 | 			info.FaviconURL = url
125 | 			info.appendTouchIcons(url, rel, sizes)
126 | 		} else if rel == "apple-touch-icon" || rel == "apple-touch-icon-precomposed" {
127 | 			info.appendTouchIcons(url, rel, sizes)
128 | 		}
129 | 	}
130 | }
131 | 
132 | func (info *HTMLInfo) parseHead(n *html.Node) {
133 | 	for c := n.FirstChild; c != nil; c = c.NextSibling {
134 | 		if c.Type == html.ElementNode && c.Data == "title" {
135 | 			if c.FirstChild != nil {
136 | 				info.Title = c.FirstChild.Data
137 | 			}
138 | 		} else if c.Type == html.ElementNode && c.Data == "link" {
139 | 			m := make(map[string]string)
140 | 			for _, a := range c.Attr {
141 | 				m[a.Key] = a.Val
142 | 			}
143 | 			if m["rel"] == "canonical" {
144 | 				info.CanonicalURL = info.toAbsoluteURL(m["href"])
145 | 			} else if m["rel"] == "alternate" && m["type"] == "application/json+oembed" {
146 | 				info.OembedJSONURL = info.toAbsoluteURL(m["href"])
147 | 			} else if m["rel"] == "alternate" && m["type"] == "application/xml+oembed" {
148 | 				info.OembedXMLURL = info.toAbsoluteURL(m["href"])
149 | 			} else if linkWithIconsRegex.MatchString(m["rel"]) {
150 | 				info.parseLinkIcon(m)
151 | 			}
152 | 		} else if c.Type == html.ElementNode && c.Data == "meta" {
153 | 			m := make(map[string]string)
154 | 			for _, a := range c.Attr {
155 | 				m[a.Key] = a.Val
156 | 			}
157 | 
158 | 			if m["name"] == "description" {
159 | 				info.Description = m["content"]
160 | 			} else if m["name"] == "author" {
161 | 				info.AuthorName = m["content"]
162 | 			}
163 | 
164 | 			info.OGInfo.ProcessMeta(m)
165 | 		}
166 | 	}
167 | }
168 | 
169 | func (info *HTMLInfo) parseBody(n *html.Node) {
170 | 	if !info.AllowMainContentExtraction {
171 | 		return
172 | 	}
173 | 
174 | 	buf := new(bytes.Buffer)
175 | 	err := html.Render(buf, n)
176 | 	if err != nil {
177 | 		return
178 | 	}
179 | 	bufStr := buf.String()
180 | 	doc, err := readability.NewDocument(bufStr)
181 | 	if err != nil {
182 | 		return
183 | 	}
184 | 
185 | 	doc.WhitelistTags = []string{"div", "p", "img"}
186 | 	doc.WhitelistAttrs["img"] = []string{"src", "title", "alt"}
187 | 
188 | 	content := doc.Content()
189 | 	content = html.UnescapeString(content)
190 | 
191 | 	info.MainContent = strings.Trim(content, "\r\n\t ")
192 | }
193 | 
194 | // Parse return information about page
195 | // @param s - contains page source
196 | // @params pageURL - contains URL from where the data was taken [optional]
197 | // @params contentType - contains Content-Type header value [optional]
198 | // if no url is given then parser won't attempt to parse oembed info
199 | func (info *HTMLInfo) Parse(s io.Reader, pageURL *string, contentType *string) error {
200 | 	contentTypeStr := "text/html"
201 | 	if contentType != nil && len(*contentType) > 0 {
202 | 		contentTypeStr = *contentType
203 | 	}
204 | 	utf8s, err := charset.NewReader(s, contentTypeStr)
205 | 	if err != nil {
206 | 		return err
207 | 	}
208 | 
209 | 	if pageURL != nil {
210 | 		tu, _ := url.Parse(*pageURL)
211 | 		info.url = tu
212 | 	}
213 | 
214 | 	doc, err := html.Parse(utf8s)
215 | 	if err != nil {
216 | 		return err
217 | 	}
218 | 
219 | 	var f func(*html.Node)
220 | 	f = func(n *html.Node) {
221 | 		for c := n.FirstChild; c != nil; c = c.NextSibling {
222 | 			if c.Type == html.ElementNode {
223 | 				if c.Data == "head" {
224 | 					info.parseHead(c)
225 | 					continue
226 | 				} else if c.Data == "body" {
227 | 					info.parseBody(c)
228 | 					continue
229 | 				}
230 | 			}
231 | 			f(c)
232 | 		}
233 | 	}
234 | 	f(doc)
235 | 
236 | 	if info.AllowOembedFetching && pageURL != nil && len(info.OembedJSONURL) > 0 {
237 | 		pu, _ := url.Parse(info.OembedJSONURL)
238 | 		siteName := info.OGInfo.SiteName
239 | 		siteURL := strings.ToLower(pu.Scheme) + "://" + pu.Host
240 | 
241 | 		if len(siteName) == 0 {
242 | 			siteName = pu.Host
243 | 		}
244 | 
245 | 		oiItem := &oembed.Item{EndpointURL: info.OembedJSONURL, ProviderName: siteName, ProviderURL: siteURL, IsEndpointURLComplete: true}
246 | 		oi, _ := oiItem.FetchOembed(oembed.Options{URL: *pageURL, Client: info.Client, AcceptLanguage: info.AcceptLanguage})
247 | 		if oi != nil && oi.Status < 300 {
248 | 			info.OembedInfo = oi
249 | 		}
250 | 	}
251 | 
252 | 	return nil
253 | }
254 | 
255 | func (info *HTMLInfo) trimText(text string, maxLen int) string {
256 | 	var numRunes = 0
257 | 	runes := []rune(text)
258 | 	for index := range runes {
259 | 		numRunes++
260 | 		if numRunes > maxLen {
261 | 			return string(runes[:index-3]) + "..."
262 | 		}
263 | 	}
264 | 	return text
265 | }
266 | 
267 | // GenerateOembedFor return Oembed Info for given url based on previously parsed data
268 | // The returned oembed data is also updated in info.OembedInfo
269 | // Example:
270 | //
271 | // info := NewHTMLInfo()
272 | // info.Parse(dataReader, &sourceURL)
273 | // oembed := info.GenerateOembedFor(sourceURL)
274 | func (info *HTMLInfo) GenerateOembedFor(pageURL string) *oembed.Info {
275 | 	pu, _ := url.Parse(pageURL)
276 | 
277 | 	if pu == nil {
278 | 		return nil
279 | 	}
280 | 
281 | 	siteName := info.OGInfo.SiteName
282 | 	siteURL := strings.ToLower(pu.Scheme) + "://" + pu.Host
283 | 
284 | 	if len(siteName) == 0 {
285 | 		siteName = pu.Host
286 | 	}
287 | 
288 | 	title := info.OGInfo.Title
289 | 	if len(title) == 0 {
290 | 		title = info.Title
291 | 	}
292 | 
293 | 	description := info.OGInfo.Description
294 | 	if len(description) == 0 {
295 | 		description = info.Description
296 | 		if len(description) == 0 {
297 | 			if len(info.MainContent) > 0 {
298 | 				description = cleanHTMLTagsRegex.ReplaceAllString(info.MainContent, " ")
299 | 				description = replaceNewLinesRegex.ReplaceAllString(description, " ")
300 | 				description = clearWhitespacesRegex.ReplaceAllString(description, " ")
301 | 				description = strings.Trim(description, " ")
302 | 				description = info.trimText(description, 200)
303 | 			}
304 | 		}
305 | 	}
306 | 
307 | 	baseInfo := &oembed.Info{}
308 | 
309 | 	baseInfo.Type = "link"
310 | 	baseInfo.URL = pageURL
311 | 	baseInfo.ProviderURL = siteURL
312 | 	baseInfo.ProviderName = siteName
313 | 	baseInfo.Title = title
314 | 	baseInfo.Description = description
315 | 
316 | 	if len(info.ImageSrcURL) > 0 {
317 | 		baseInfo.ThumbnailURL = info.toAbsoluteURL(info.ImageSrcURL)
318 | 	}
319 | 
320 | 	if len(info.OGInfo.Images) > 0 {
321 | 		baseInfo.ThumbnailURL = info.toAbsoluteURL(info.OGInfo.Images[0].URL)
322 | 		baseInfo.ThumbnailWidth = info.OGInfo.Images[0].Width
323 | 		baseInfo.ThumbnailHeight = info.OGInfo.Images[0].Height
324 | 	}
325 | 
326 | 	if len(baseInfo.ThumbnailURL) == 0 && len(info.MainContent) > 0 {
327 | 		// get first image from body
328 | 		matches := getImageRegex.FindStringSubmatch(info.MainContent)
329 | 		if len(matches) > 0 {
330 | 			baseInfo.ThumbnailURL = info.toAbsoluteURL(matches[2])
331 | 		}
332 | 	}
333 | 
334 | 	// first we check if there is link to oembed resource
335 | 	if info.OembedInfo != nil {
336 | 		info.OembedInfo.MergeWith(baseInfo)
337 | 		return info.OembedInfo
338 | 	}
339 | 
340 | 	return baseInfo
341 | }
342 | 
343 | // ToJSON return json represenation of structure, simple wrapper around json package
344 | func (info *HTMLInfo) ToJSON() ([]byte, error) {
345 | 	return json.Marshal(info)
346 | }
347 | 
348 | func (info *HTMLInfo) String() string {
349 | 	data, err := info.ToJSON()
350 | 	if err != nil {
351 | 		return err.Error()
352 | 	}
353 | 	return string(data[:])
354 | }
355 | 


--------------------------------------------------------------------------------