├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.rst ├── opengraph ├── __init__.py ├── opengraph.py └── test.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.egg-info 2 | *.pyc 3 | *.pyo 4 | build 5 | dist 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2011-2017 Erik Rivera 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.rst 2 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | OpenGraph is a Python module for parsing the Open Graph Protocol; you can read more about the specification at http://ogp.me/ 2 | 3 | Installation 4 | ============= 5 | 6 | .. code-block:: console 7 | 8 | $ pip install opengraph 9 | 10 | Features 11 | ============= 12 | 13 | * Use it as a python dict 14 | * Input and parsing from a specific URL 15 | * Input and parsing from previously extracted HTML 16 | * HTML output 17 | * JSON output 18 | 19 | Usage 20 | ============== 21 | 22 | **From a URL** 23 | 24 | .. code-block:: pycon 25 | 26 | >>> import opengraph 27 | >>> video = opengraph.OpenGraph(url="http://www.youtube.com/watch?v=q3ixBmDzylQ") 28 | >>> video.is_valid() 29 | True 30 | >>> for x,y in video.items(): 31 | ... print "%-15s => %s" % (x, y) 32 | ... 33 | site_name => YouTube 34 | description => Eric Clapton and Paul McCartney perform George Harrison's "While My Guitar Gently Weeps" at the... 35 | title => While My Guitar Gently Weeps 36 | url => http://www.youtube.com/watch?v=q3ixBmDzylQ 37 | image => http://i2.ytimg.com/vi/q3ixBmDzylQ/default.jpg 38 | video:type => application/x-shockwave-flash 39 | video:height => 224 40 | video => http://www.youtube.com/v/q3ixBmDzylQ?version=3&autohide=1 41 | video:width => 398 42 | type => video 43 | 44 | **From HTML** 45 | 46 | .. code-block:: pycon 47 | 48 | >>> HTML = """ 49 | ... 50 | ... 51 | ... The Rock (1996) 52 | ... 53 | ... 54 | ... 55 | ... 56 | ... 57 | ... 58 | ... 
""" 59 | >>> movie = opengraph.OpenGraph() # or you can instantiate as follows: opengraph.OpenGraph(html=HTML) 60 | >>> movie.parser(HTML) 61 | >>> video.is_valid() 62 | True 63 | 64 | **Generate JSON or HTML** 65 | 66 | .. code-block:: pycon 67 | 68 | >>> ogp = opengraph.OpenGraph("http://ogp.me/") 69 | >>> print ogp.to_json() 70 | {"image:type": "image/png", "title": "Open Graph protocol", "url": "http://ogp.me/", "image": "http://ogp.me/logo.png", "scrape": false, "_url": "http://ogp.me/", "image:height": "300", "type": "website", "image:width": "300", "description": "The Open Graph protocol enables any web page to become a rich object in a social graph."} 71 | >>> print ogp.to_html() 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /opengraph/__init__.py: -------------------------------------------------------------------------------- 1 | from .opengraph import OpenGraph 2 | -------------------------------------------------------------------------------- /opengraph/opengraph.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | import re 4 | 5 | try: 6 | import urllib2 7 | except ImportError: 8 | from urllib import request as urllib2 9 | 10 | try: 11 | from bs4 import BeautifulSoup 12 | except ImportError: 13 | from BeautifulSoup import BeautifulSoup 14 | 15 | global import_json 16 | try: 17 | import json 18 | import_json = True 19 | except ImportError: 20 | import_json = False 21 | 22 | class OpenGraph(dict): 23 | """ 24 | """ 25 | 26 | required_attrs = ['title', 'type', 'image', 'url', 'description'] 27 | 28 | def __init__(self, url=None, html=None, scrape=False, **kwargs): 29 | # If scrape == True, then will try to fetch missing attribtues 30 | # from the page's body 31 | 32 | self.scrape = scrape 33 | self._url = url 34 | 35 | for k in kwargs.keys(): 36 | self[k] = kwargs[k] 37 | 38 | dict.__init__(self) 39 
| 40 | if url is not None: 41 | self.fetch(url) 42 | 43 | if html is not None: 44 | self.parser(html) 45 | 46 | def __setattr__(self, name, val): 47 | self[name] = val 48 | 49 | def __getattr__(self, name): 50 | return self[name] 51 | 52 | def fetch(self, url): 53 | """ 54 | """ 55 | raw = urllib2.urlopen(url) 56 | html = raw.read() 57 | return self.parser(html) 58 | 59 | def parser(self, html): 60 | """ 61 | """ 62 | if not isinstance(html,BeautifulSoup): 63 | doc = BeautifulSoup(html) 64 | else: 65 | doc = html 66 | ogs = doc.html.head.findAll(property=re.compile(r'^og')) 67 | for og in ogs: 68 | if og.has_attr(u'content'): 69 | self[og[u'property'][3:]]=og[u'content'] 70 | # Couldn't fetch all attrs from og tags, try scraping body 71 | if not self.is_valid() and self.scrape: 72 | for attr in self.required_attrs: 73 | if not self.valid_attr(attr): 74 | try: 75 | self[attr] = getattr(self, 'scrape_%s' % attr)(doc) 76 | except AttributeError: 77 | pass 78 | 79 | def valid_attr(self, attr): 80 | return self.get(attr) and len(self[attr]) > 0 81 | 82 | def is_valid(self): 83 | return all([self.valid_attr(attr) for attr in self.required_attrs]) 84 | 85 | def to_html(self): 86 | if not self.is_valid(): 87 | return u"" 88 | 89 | meta = u"" 90 | for key,value in self.iteritems(): 91 | meta += u"\n" %(key, value) 92 | meta += u"\n" 93 | 94 | return meta 95 | 96 | def to_json(self): 97 | # TODO: force unicode 98 | global import_json 99 | if not import_json: 100 | return "{'error':'there isn't json module'}" 101 | 102 | if not self.is_valid(): 103 | return json.dumps({'error':'og metadata is not valid'}) 104 | 105 | return json.dumps(self) 106 | 107 | def to_xml(self): 108 | pass 109 | 110 | def scrape_image(self, doc): 111 | images = [dict(img.attrs)['src'] 112 | for img in doc.html.body.findAll('img')] 113 | 114 | if images: 115 | return images[0] 116 | 117 | return u'' 118 | 119 | def scrape_title(self, doc): 120 | return doc.html.head.title.text 121 | 122 | def 
scrape_type(self, doc): 123 | return 'other' 124 | 125 | def scrape_url(self, doc): 126 | return self._url 127 | 128 | def scrape_description(self, doc): 129 | tag = doc.html.head.findAll('meta', attrs={"name":"description"}) 130 | result = "".join([t['content'] for t in tag]) 131 | return result 132 | -------------------------------------------------------------------------------- /opengraph/test.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | import unittest 4 | import opengraph 5 | 6 | HTML = """ 7 | 8 | 9 | The Rock (1996) 10 | 11 | 12 | 13 | 14 | 15 | 16 | """ 17 | 18 | class test(unittest.TestCase): 19 | 20 | def test_url(self): 21 | data = opengraph.OpenGraph(url='https://vimeo.com/896837') 22 | self.assertEqual(data['url'], 'https://vimeo.com/896837') 23 | 24 | def test_isinstace(self): 25 | data = opengraph.OpenGraph() 26 | self.assertTrue(isinstance(data,dict)) 27 | 28 | def test_to_html(self): 29 | og = opengraph.OpenGraph(html=HTML) 30 | self.assertTrue(og.to_html()) 31 | 32 | def test_to_json(self): 33 | og = opengraph.OpenGraph(url='https://www.youtube.com/watch?v=XAyNT2bTFuI') 34 | self.assertTrue(og.to_json()) 35 | self.assertTrue(isinstance(og.to_json(),str)) 36 | 37 | def test_no_json(self): 38 | if getattr(opengraph, 'import_json', None) is not None: # python2 39 | opengraph.import_json = False 40 | else: # python3 41 | opengraph.opengraph.import_json = False 42 | og = opengraph.OpenGraph(url='http://www.ogp.me/') 43 | self.assertEqual(og.to_json(),"{'error':'there isn't json module'}") 44 | 45 | def test_is_valid(self): 46 | og = opengraph.OpenGraph(url='http://www.ogp.me/') 47 | self.assertTrue(og.is_valid()) 48 | 49 | 50 | 51 | if __name__ == '__main__': 52 | unittest.main() 53 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import 
setup, find_packages 2 | import sys, os 3 | 4 | version = '0.5' 5 | 6 | setup(name='opengraph', 7 | version=version, 8 | description="A module to parse the Open Graph Protocol", 9 | long_description=open("README.rst").read() + "\n", 10 | classifiers=[ 11 | 'Development Status :: 3 - Alpha', 12 | 'Intended Audience :: Developers', 13 | 'Programming Language :: Python', 14 | 'Topic :: Text Processing :: Markup :: HTML', 15 | 'Topic :: Software Development :: Libraries :: Python Modules', 16 | ], # Get strings from http://pypi.python.org/pypi?%3Aaction=list_classifiers 17 | keywords='opengraph protocol facebook', 18 | author='Erik Rivera', 19 | author_email='erik.river@gmail.com', 20 | url='https://github.com/erikriver/opengraph', 21 | license='MIT', 22 | packages=find_packages(exclude=['ez_setup', 'tests']), 23 | include_package_data=True, 24 | zip_safe=False, 25 | install_requires=[ 26 | 'beautifulsoup4' 27 | ], 28 | entry_points=""" 29 | # -*- Entry points: -*- 30 | """, 31 | ) 32 | --------------------------------------------------------------------------------