├── .coveragerc ├── .github └── workflows │ └── ci.yml ├── .gitignore ├── .isort.cfg ├── .vscode └── settings.json ├── AUTHORS.rst ├── HISTORY.rst ├── LICENSE ├── MANIFEST.in ├── README.rst ├── docs ├── Makefile ├── api.rst ├── conf.py ├── index.rst ├── make.bat └── usage │ ├── advanced_usage.rst │ ├── install.rst │ └── starting_out.rst ├── lassie ├── __init__.py ├── api.py ├── compat.py ├── core.py ├── exceptions.py ├── filters │ ├── __init__.py │ ├── apple.py │ ├── generic.py │ ├── oembed │ │ ├── __init__.py │ │ └── providers.py │ └── social.py └── utils.py ├── pyproject.toml ├── requirements.txt ├── setup.py ├── test_requirements.txt └── tests ├── __init__.py ├── base.py ├── json └── youtube │ ├── bad_html.json │ ├── good.json │ ├── no_thumb.json │ └── no_type.json ├── oembed ├── __init__.py └── test_youtube.py ├── templates ├── amp │ ├── all_properties.html │ ├── bad_json.html │ ├── list_image.html │ ├── list_image_empty.html │ ├── list_image_list.html │ ├── list_image_list_str.html │ ├── list_image_str.html │ ├── list_json.html │ ├── list_thumbnail_image.html │ ├── str_image.html │ ├── str_thumbnail_image.html │ ├── thumbnail_image.html │ └── video_objects.html ├── core │ ├── bad_image_dimensions.html │ ├── bad_keywords.html │ ├── class_setting_is_none.html │ ├── class_vs_method_settings.html │ ├── empty.html │ ├── image_dimensions.html │ ├── no_html_tag.html │ └── retrieve_all_images.html ├── generic │ ├── all_properties.html │ ├── bad_locale.html │ ├── canonical.html │ ├── favicon.html │ └── no_title.html ├── handle_file_content │ └── image_file.jpg ├── open_graph │ ├── all_properties.html │ ├── no_og_title_no_og_url.html │ ├── og_image_plus_two_body_images.html │ └── og_image_relative_url.html └── twitter_card │ ├── all_properties.html │ └── no_og_title_use_twitter_title.html ├── test_amp.py ├── test_core.py ├── test_generic.py ├── test_handle_file_content.py ├── test_open_graph.py └── test_twitter_card.py /.coveragerc: 
-------------------------------------------------------------------------------- 1 | [run] 2 | omit = ../lassie/compat.py 3 | 4 | [report] 5 | exclude_lines = 6 | pragma: no cover 7 | 8 | def __repr__ 9 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | pull_request: 5 | release: 6 | types: 7 | - published 8 | 9 | jobs: 10 | test: 11 | runs-on: ubuntu-latest 12 | strategy: 13 | matrix: 14 | python-version: [2.7, 3.8] 15 | 16 | steps: 17 | - uses: actions/checkout@v2 18 | 19 | - name: Set up Python ${{ matrix.python-version }} 20 | uses: actions/setup-python@v2 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | 24 | - name: Install Dependencies 25 | run: | 26 | python -m pip install --upgrade pip 27 | pip install -r test_requirements.txt 28 | 29 | - name: Test 30 | run: nosetests -v -w tests/ --logging-filter="lassie" --with-cov --cov lassie --cov-config .coveragerc --cov-report term-missing 31 | publish: 32 | needs: [test] 33 | if: github.event_name == 'release' && github.event.release.target_commitish == 'main' 34 | runs-on: ubuntu-18.04 35 | steps: 36 | - uses: actions/checkout@v2 37 | 38 | - uses: actions/setup-python@v2 39 | with: 40 | python-version: 3.7 41 | 42 | - name: Build binary wheel and a source tarball 43 | run: python setup.py sdist 44 | 45 | - name: Publish 📦 to PyPI 46 | uses: pypa/gh-action-pypi-publish@master 47 | with: 48 | password: ${{ secrets.PYPI_API_TOKEN }} 49 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | 3 | # C extensions 4 | *.so 5 | 6 | # Packages 7 | *.egg 8 | *.egg-info 9 | dist 10 | build 11 | eggs 12 | parts 13 | bin 14 | var 15 | sdist 16 | develop-eggs 17 | .installed.cfg 18 | lib 19 | lib64 20 | 
__pycache__ 21 | 22 | # Installer logs 23 | pip-log.txt 24 | 25 | # Unit test / coverage reports 26 | .coverage 27 | .tox 28 | nosetests.xml 29 | 30 | # Translations 31 | *.mo 32 | 33 | # Mr Developer 34 | .mr.developer.cfg 35 | .project 36 | .pydevproject 37 | 38 | .DS_STORE 39 | test.py 40 | docs/_build 41 | 42 | .venv 43 | -------------------------------------------------------------------------------- /.isort.cfg: -------------------------------------------------------------------------------- 1 | [settings] 2 | line_length=99 3 | multi_line_output=5 4 | include_trailing_comma=True 5 | known_future_library=future,pies 6 | known_standard_library=std,std2 7 | known_first_party=lassie 8 | default_section=THIRDPARTY 9 | indent=' ' 10 | sections=FUTURE,STDLIB,THIRDPARTY,FIRSTPARTY,LOCALFOLDER 11 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "yaml.customTags": [ 3 | "!And", 4 | "!And sequence", 5 | "!If", 6 | "!If sequence", 7 | "!Not", 8 | "!Not sequence", 9 | "!Equals", 10 | "!Equals sequence", 11 | "!Or", 12 | "!Or sequence", 13 | "!FindInMap", 14 | "!FindInMap sequence", 15 | "!Base64", 16 | "!Join", 17 | "!Join sequence", 18 | "!Cidr", 19 | "!Ref", 20 | "!Sub", 21 | "!Sub sequence", 22 | "!GetAtt", 23 | "!GetAZs", 24 | "!ImportValue", 25 | "!ImportValue sequence", 26 | "!Select", 27 | "!Select sequence", 28 | "!Split", 29 | "!Split sequence" 30 | ] 31 | } 32 | -------------------------------------------------------------------------------- /AUTHORS.rst: -------------------------------------------------------------------------------- 1 | Lassie is written and maintained by Mike Helmick and various contributors: 2 | 3 | 4 | Development Lead 5 | ---------------- 6 | 7 | - Mike Helmick 8 | 9 | 10 | Patches and Suggestions 11 | ----------------------- 12 | 13 | - `Ramiro Gómez `_ - Made image URLs absolute, image 
width/heights are more lenient 14 | - `jay754 `_ - Updated import in setup.py 15 | - `Mark Beacom `_ - Update requirements in setup.py 16 | - `John Hobbs `_ - Support for canonical link tag 17 | - `Benjamin Kampmann `_ - Updating requirements, fixing Python 3.5 support 18 | -------------------------------------------------------------------------------- /HISTORY.rst: -------------------------------------------------------------------------------- 1 | .. :changelog: 2 | 3 | History 4 | ------- 5 | 6 | 0.11.11 (2020-12-16) 7 | ++++++++++++++++++ 8 | - No changes. 9 | 10 | 0.11.10 (2020-12-16) 11 | ++++++++++++++++++ 12 | - Add `html` to response dict when available. 13 | - Upgrade to GitHub Actions 14 | 15 | 0.11.9 (2020-12-16) 16 | ++++++++++++++++++ 17 | - Upgrade beautifulsoup4 dependency 18 | 19 | 0.11.8 (2020-12-16) 20 | ++++++++++++++++++ 21 | - Upgrade requests dependency 22 | 23 | 0.11.7 (2018-08-03) 24 | ++++++++++++++++++ 25 | - Try and return the "best" url. (#75). 26 | - Fix issue where AMP image data was a list of strings. (#75). 27 | 28 | 0.11.6 (2018-05-24) 29 | ++++++++++++++++++ 30 | - Fix issue where AMP images was a list of dictionaries and being identified as an object. 31 | 32 | 0.11.5 (2017-12-27) 33 | ++++++++++++++++++ 34 | - Pin requests==2.18.4 35 | 36 | 0.11.4 (2017-11-01) 37 | ++++++++++++++++++ 38 | - Always get oembed AND html data. 39 | 40 | 0.11.3 (2017-11-01) 41 | ++++++++++++++++++ 42 | - Fix filters.oembed module once lassie is packaged. 43 | 44 | 0.11.0 (2017-11-01) 45 | ++++++++++++++++++ 46 | - Add support for OEmbed providers (YouTube) 47 | 48 | 0.10.1 (2017-06-02) 49 | ++++++++++++++++++ 50 | - Remove owl emoji from README.rst so installs on Windows don't fail. 
51 | 52 | 0.10.0 (2017-02-03) 53 | ++++++++++++++++++ 54 | - Fix issue where a website may have malformed HTML and no `<html>` tag, causing soup.html to be None (#60) 55 | - Updated beautifulsoup4 to 4.5.3 56 | - Update html5lib to 1.0b10 57 | 58 | 0.9.0 (2017-01-29) 59 | ++++++++++++++++++ 60 | - Added a default fake user agent to use instead of using python-requests/version (some websites will mark certain user agents as bot attempts) 61 | - Updated requests to 2.13.0 62 | 63 | 0.8.7 (2016-12-21) 64 | ++++++++++++++++++ 65 | - Fix Python 3 support 66 | - Handle empty AMP image lists 67 | 68 | 0.8.6 (2016-11-17) 69 | ++++++++++++++++++ 70 | - Handle AMP image list of strings vs list of objects 71 | 72 | 0.8.5 (2016-11-03) 73 | ++++++++++++++++++ 74 | - Handle AMP data that is contained in a list 75 | - Retrieve videos and thumbnails (as images) from AMP VideoObjects 76 | 77 | 0.8.4 (2016-11-01) 78 | ++++++++++++++++++ 79 | - Fix issue where AMP images could be lists inside an object 80 | 81 | 0.8.3 (2016-10-21) 82 | ++++++++++++++++++ 83 | - Fix issue where some keys returned (i.e. 
description) would not be retrieved if the key existed with an empty value already 84 | 85 | 0.8.2 (2016-09-26) 86 | ++++++++++++++++++ 87 | - Fix issue where AMP images could be images and not objects 88 | 89 | 0.8.1 (2016-09-26) 90 | ++++++++++++++++++ 91 | - Add support for AMP "description" attribute 92 | - Fix issue where an error would be thrown if width/height of an image weren't strings 93 | - Fix duplicate AMP title request, should have been url 94 | 95 | 0.8.0 (2016-09-26) 96 | ++++++++++++++++++ 97 | - Add support for links that use AMP 98 | 99 | 0.7.2 (2016-08-01) 100 | ++++++++++++++++++ 101 | - Add `status_code` to response dictionary (for "file-like" responses, as well) 102 | 103 | 0.7.1 (2016-07-27) 104 | ++++++++++++++++++ 105 | - Add support for open graph `site_name` 106 | 107 | 108 | 0.7.0 (2016-07-01) 109 | ++++++++++++++++++ 110 | - Add `status_code` to response dictionary 111 | 112 | 113 | 0.6.2 (2015-11-11) 114 | ++++++++++++++++++ 115 | - Pinned `requests` library to version 2.8.1 116 | - Pinned `beautifulsoup4` library to version 4.4.1 117 | - Add Python 3.5 to Travis CI build matrix (officially support 3.5) 118 | 119 | 120 | 0.6.1 (2015-10-30) 121 | ++++++++++++++++++ 122 | - Catch and raise `LassieError` on HEAD requests when `handle_file_content` is passed to the Lassie API 123 | - Pinned `requests` library to version 2.8.0 124 | 125 | 126 | 0.6.0 (2015-08-19) 127 | ++++++++++++++++++ 128 | - Support for secure url image and videos from Open Graph 129 | - Simplified `merge_settings` and data updating internally 130 | 131 | 132 | 0.5.3 (2015-07-02) 133 | ++++++++++++++++++ 134 | - Handle when a website doesn't set a value on the "keywords" meta tag 135 | 136 | 137 | 0.5.2 (2015-04-16) 138 | ++++++++++++++++++ 139 | - Updated `requests` and `beautifulsoup4` library versions 140 | 141 | 142 | 0.5.1 (2014-08-05) 143 | ++++++++++++++++++ 144 | - Fix issue where headers didn't always have 'Content-Type' key 145 | 146 | 147 | 0.5.0 
(2014-06-23) 148 | ++++++++++++++++++ 149 | - Added ability to `fetch` links that are image files (jpg, gif, png, bmp) 150 | - Renamed `_retreive_content` to `_retrieve_content` because I evidently don't know how to spell correctly 151 | 152 | 153 | 0.4.0 (2013-09-30) 154 | ++++++++++++++++++ 155 | - Updated `requests` and `beautifulsoup4` library versions 156 | - Added support for manipulating the request, see Advanced Usage docs 157 | - Fixed issue where `lassie.fetch` would break if the page had no title 158 | - Lassie is now more lenient when it comes to width and height values of images (now accepts integers (100) or integers with px (100px)) 159 | - Image URLs for all images are now absolute 160 | 161 | 0.3.0 (2013-08-15) 162 | ++++++++++++++++++ 163 | 164 | - Added support for `locale` to be returned. If `lang` is specified in the `html` tag and it normalizes to an actual locale, it will be added to the returned data. 165 | - Fixed bug where height was not being returned for body images 166 | - Added test coverage, we're 100% covered! 
:D 167 | 168 | 169 | 0.2.1 (2013-08-13) 170 | ++++++++++++++++++ 171 | 172 | - Remove spaces from the returned keywords list 173 | - Fixed issue where favicon was not being retrieved 174 | - Fixed priority for class level vs method level params 175 | 176 | 177 | 0.2.0 (2013-08-06) 178 | ++++++++++++++++++ 179 | 180 | - Fix package error when importing 181 | 182 | 183 | 0.1.0 (2013-08-05) 184 | ++++++++++++++++++ 185 | 186 | - Initial Release 187 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) 2017 Mike Helmick 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 
22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.rst HISTORY.rst LICENSE requirements.txt 2 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Lassie 2 | ====== 3 | 4 | .. image:: https://img.shields.io/pypi/v/lassie.svg?style=flat-square 5 | :target: https://pypi.python.org/pypi/lassie 6 | 7 | .. image:: https://img.shields.io/travis/michaelhelmick/lassie.svg?style=flat-square 8 | :target: https://travis-ci.org/michaelhelmick/lassie 9 | 10 | .. image:: https://img.shields.io/coveralls/michaelhelmick/lassie/master.svg?style=flat-square 11 | :target: https://coveralls.io/r/michaelhelmick/lassie?branch=master 12 | 13 | .. image:: https://img.shields.io/badge/Say%20Thanks!-:)-1EAEDB.svg?style=flat-square 14 | :target: https://saythanks.io/to/michaelhelmick 15 | 16 | Lassie is a Python library for retrieving basic content from websites. 17 | 18 | .. image:: https://i.imgur.com/QrvNfAX.gif 19 | 20 | Usage 21 | ----- 22 | 23 | .. code-block:: python 24 | 25 | >>> import lassie 26 | >>> lassie.fetch('http://www.youtube.com/watch?v=dQw4w9WgXcQ') 27 | { 28 | 'description': u'Music video by Rick Astley performing Never Gonna Give You Up. 
YouTube view counts pre-VEVO: 2,573,462 (C) 1987 PWL', 29 | 'videos': [{ 30 | 'src': u'http://www.youtube.com/v/dQw4w9WgXcQ?autohide=1&version=3', 31 | 'height': 480, 32 | 'type': u'application/x-shockwave-flash', 33 | 'width': 640 34 | }, { 35 | 'src': u'https://www.youtube.com/embed/dQw4w9WgXcQ', 36 | 'height': 480, 37 | 'width': 640 38 | }], 39 | 'title': u'Rick Astley - Never Gonna Give You Up', 40 | 'url': u'http://www.youtube.com/watch?v=dQw4w9WgXcQ', 41 | 'keywords': [u'Rick', u'Astley', u'Sony', u'BMG', u'Music', u'UK', u'Pop'], 42 | 'images': [{ 43 | 'src': u'http://i1.ytimg.com/vi/dQw4w9WgXcQ/hqdefault.jpg?feature=og', 44 | 'type': u'og:image' 45 | }, { 46 | 'src': u'http://i1.ytimg.com/vi/dQw4w9WgXcQ/hqdefault.jpg', 47 | 'type': u'twitter:image' 48 | }, { 49 | 'src': u'http://s.ytimg.com/yts/img/favicon-vfldLzJxy.ico', 50 | 'type': u'favicon' 51 | }, { 52 | 'src': u'http://s.ytimg.com/yts/img/favicon_32-vflWoMFGx.png', 53 | 'type': u'favicon' 54 | }], 55 | 'locale': u'en_US' 56 | } 57 | 58 | Install 59 | ------- 60 | 61 | Install Lassie via `pip `_ 62 | 63 | .. code-block:: bash 64 | 65 | $ pip install lassie 66 | 67 | or, with `easy_install `_ 68 | 69 | .. code-block:: bash 70 | 71 | $ easy_install lassie 72 | 73 | But, hey... `that's up to you `_. 74 | 75 | Documentation 76 | ------------- 77 | 78 | Documentation can be found here: https://lassie.readthedocs.org/ 79 | 80 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. 
Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 23 | 24 | help: 25 | @echo "Please use \`make ' where is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " devhelp to make HTML files and a Devhelp project" 34 | @echo " epub to make an epub" 35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 36 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 37 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 38 | @echo " text to make text files" 39 | @echo " man to make manual pages" 40 | @echo " texinfo to make Texinfo files" 41 | @echo " info to make Texinfo files and run them through makeinfo" 42 | @echo " gettext to make PO message catalogs" 43 | @echo " changes to make an overview of all changed/added/deprecated items" 44 | @echo " xml to make Docutils-native XML files" 45 | @echo " pseudoxml to make 
pseudoxml-XML files for display purposes" 46 | @echo " linkcheck to check all external links for integrity" 47 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 48 | 49 | clean: 50 | rm -rf $(BUILDDIR)/* 51 | 52 | html: 53 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 54 | @echo 55 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 56 | 57 | dirhtml: 58 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 59 | @echo 60 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 61 | 62 | singlehtml: 63 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 64 | @echo 65 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 66 | 67 | pickle: 68 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 69 | @echo 70 | @echo "Build finished; now you can process the pickle files." 71 | 72 | json: 73 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 74 | @echo 75 | @echo "Build finished; now you can process the JSON files." 76 | 77 | htmlhelp: 78 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 79 | @echo 80 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 81 | ".hhp project file in $(BUILDDIR)/htmlhelp." 82 | 83 | qthelp: 84 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 85 | @echo 86 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 87 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 88 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/Lassie.qhcp" 89 | @echo "To view the help file:" 90 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/Lassie.qhc" 91 | 92 | devhelp: 93 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 94 | @echo 95 | @echo "Build finished." 
96 | @echo "To view the help file:" 97 | @echo "# mkdir -p $$HOME/.local/share/devhelp/Lassie" 98 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/Lassie" 99 | @echo "# devhelp" 100 | 101 | epub: 102 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 103 | @echo 104 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 105 | 106 | latex: 107 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 108 | @echo 109 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 110 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 111 | "(use \`make latexpdf' here to do that automatically)." 112 | 113 | latexpdf: 114 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 115 | @echo "Running LaTeX files through pdflatex..." 116 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 117 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 118 | 119 | latexpdfja: 120 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 121 | @echo "Running LaTeX files through platex and dvipdfmx..." 122 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 123 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 124 | 125 | text: 126 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 127 | @echo 128 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 129 | 130 | man: 131 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 132 | @echo 133 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 134 | 135 | texinfo: 136 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 137 | @echo 138 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 139 | @echo "Run \`make' in that directory to run these through makeinfo" \ 140 | "(use \`make info' here to do that automatically)." 141 | 142 | info: 143 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 144 | @echo "Running Texinfo files through makeinfo..." 
145 | make -C $(BUILDDIR)/texinfo info 146 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 147 | 148 | gettext: 149 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 150 | @echo 151 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 152 | 153 | changes: 154 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 155 | @echo 156 | @echo "The overview file is in $(BUILDDIR)/changes." 157 | 158 | linkcheck: 159 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 160 | @echo 161 | @echo "Link check complete; look for any errors in the above output " \ 162 | "or in $(BUILDDIR)/linkcheck/output.txt." 163 | 164 | doctest: 165 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 166 | @echo "Testing of doctests in the sources finished, look at the " \ 167 | "results in $(BUILDDIR)/doctest/output.txt." 168 | 169 | xml: 170 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 171 | @echo 172 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 173 | 174 | pseudoxml: 175 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 176 | @echo 177 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 178 | -------------------------------------------------------------------------------- /docs/api.rst: -------------------------------------------------------------------------------- 1 | .. _api: 2 | 3 | Developer Interface 4 | =================== 5 | 6 | .. module:: lassie 7 | 8 | This page of the documentation will cover all methods and classes available to the developer. 9 | 10 | Core Interface 11 | -------------- 12 | 13 | .. autoclass:: Lassie 14 | :special-members: __init__ 15 | :inherited-members: 16 | 17 | Exceptions 18 | ---------- 19 | 20 | .. 
autoexception:: lassie.LassieError -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Lassie documentation build configuration file, created by 4 | # sphinx-quickstart on Fri Aug 2 00:23:04 2013. 5 | # 6 | # This file is execfile()d with the current directory set to its containing dir. 7 | # 8 | # Note that not all possible configuration values are present in this 9 | # autogenerated file. 10 | # 11 | # All configuration values have a default; values that are commented out 12 | # serve to show the default. 13 | 14 | import os 15 | import sys 16 | 17 | import lassie 18 | 19 | # If extensions (or modules to document with autodoc) are in another directory, 20 | # add these directories to sys.path here. If the directory is relative to the 21 | # documentation root, use os.path.abspath to make it absolute, like shown here. 22 | #sys.path.insert(0, os.path.abspath('.')) 23 | sys.path.insert(0, os.path.abspath('..')) 24 | 25 | 26 | # -- General configuration ----------------------------------------------------- 27 | 28 | # If your documentation needs a minimal Sphinx version, state it here. 29 | #needs_sphinx = '1.0' 30 | 31 | # Add any Sphinx extension module names here, as strings. They can be extensions 32 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 33 | extensions = ['sphinx.ext.autodoc'] 34 | 35 | # Add any paths that contain templates here, relative to this directory. 36 | templates_path = ['_templates'] 37 | 38 | # The suffix of source filenames. 39 | source_suffix = '.rst' 40 | 41 | # The encoding of source files. 42 | #source_encoding = 'utf-8-sig' 43 | 44 | # The master toctree document. 45 | master_doc = 'index' 46 | 47 | # General information about the project. 
48 | project = u'Lassie' 49 | copyright = u'2014, Mike Helmick' 50 | 51 | # The version info for the project you're documenting, acts as replacement for 52 | # |version| and |release|, also used in various other places throughout the 53 | # built documents. 54 | # 55 | # The short X.Y version. 56 | version = '0.11.11' 57 | # The full version, including alpha/beta/rc tags. 58 | release = '0.11.11' 59 | 60 | # The language for content autogenerated by Sphinx. Refer to documentation 61 | # for a list of supported languages. 62 | #language = None 63 | 64 | # There are two options for replacing |today|: either, you set today to some 65 | # non-false value, then it is used: 66 | #today = '' 67 | # Else, today_fmt is used as the format for a strftime call. 68 | #today_fmt = '%B %d, %Y' 69 | 70 | # List of patterns, relative to source directory, that match files and 71 | # directories to ignore when looking for source files. 72 | exclude_patterns = ['_build'] 73 | 74 | # The reST default role (used for this markup: `text`) to use for all documents. 75 | #default_role = None 76 | 77 | # If true, '()' will be appended to :func: etc. cross-reference text. 78 | #add_function_parentheses = True 79 | 80 | # If true, the current module name will be prepended to all description 81 | # unit titles (such as .. function::). 82 | #add_module_names = True 83 | 84 | # If true, sectionauthor and moduleauthor directives will be shown in the 85 | # output. They are ignored by default. 86 | #show_authors = False 87 | 88 | # The name of the Pygments (syntax highlighting) style to use. 89 | pygments_style = 'sphinx' 90 | 91 | # A list of ignored prefixes for module index sorting. 92 | #modindex_common_prefix = [] 93 | 94 | # If true, keep warnings as "system message" paragraphs in the built documents. 95 | #keep_warnings = False 96 | 97 | 98 | # -- Options for HTML output --------------------------------------------------- 99 | 100 | # The theme to use for HTML and HTML Help pages. 
See the documentation for 101 | # a list of builtin themes. 102 | html_theme = 'default' 103 | 104 | # Theme options are theme-specific and customize the look and feel of a theme 105 | # further. For a list of options available for each theme, see the 106 | # documentation. 107 | #html_theme_options = {} 108 | 109 | # Add any paths that contain custom themes here, relative to this directory. 110 | #html_theme_path = [] 111 | 112 | # The name for this set of Sphinx documents. If None, it defaults to 113 | # "<project> v<release> documentation". 114 | #html_title = None 115 | 116 | # A shorter title for the navigation bar. Default is the same as html_title. 117 | #html_short_title = None 118 | 119 | # The name of an image file (relative to this directory) to place at the top 120 | # of the sidebar. 121 | #html_logo = None 122 | 123 | # The name of an image file (within the static path) to use as favicon of the 124 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 125 | # pixels large. 126 | #html_favicon = None 127 | 128 | # Add any paths that contain custom static files (such as style sheets) here, 129 | # relative to this directory. They are copied after the builtin static files, 130 | # so a file named "default.css" will overwrite the builtin "default.css". 131 | html_static_path = ['_static'] 132 | 133 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 134 | # using the given strftime format. 135 | #html_last_updated_fmt = '%b %d, %Y' 136 | 137 | # If true, SmartyPants will be used to convert quotes and dashes to 138 | # typographically correct entities. 139 | #html_use_smartypants = True 140 | 141 | # Custom sidebar templates, maps document names to template names. 142 | #html_sidebars = {} 143 | 144 | # Additional templates that should be rendered to pages, maps page names to 145 | # template names. 146 | #html_additional_pages = {} 147 | 148 | # If false, no module index is generated. 
149 | #html_domain_indices = True 150 | 151 | # If false, no index is generated. 152 | #html_use_index = True 153 | 154 | # If true, the index is split into individual pages for each letter. 155 | #html_split_index = False 156 | 157 | # If true, links to the reST sources are added to the pages. 158 | #html_show_sourcelink = True 159 | 160 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 161 | #html_show_sphinx = True 162 | 163 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 164 | #html_show_copyright = True 165 | 166 | # If true, an OpenSearch description file will be output, and all pages will 167 | # contain a <link> tag referring to it. The value of this option must be the 168 | # base URL from which the finished HTML is served. 169 | #html_use_opensearch = '' 170 | 171 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 172 | #html_file_suffix = None 173 | 174 | # Output file base name for HTML help builder. 175 | htmlhelp_basename = 'Lassiedoc' 176 | 177 | 178 | # -- Options for LaTeX output -------------------------------------------------- 179 | 180 | latex_elements = { 181 | # The paper size ('letterpaper' or 'a4paper'). 182 | #'papersize': 'letterpaper', 183 | 184 | # The font size ('10pt', '11pt' or '12pt'). 185 | #'pointsize': '10pt', 186 | 187 | # Additional stuff for the LaTeX preamble. 188 | #'preamble': '', 189 | } 190 | 191 | # Grouping the document tree into LaTeX files. List of tuples 192 | # (source start file, target name, title, author, documentclass [howto/manual]). 193 | latex_documents = [ 194 | ('index', 'Lassie.tex', u'Lassie Documentation', 195 | u'Mike Helmick', 'manual'), 196 | ] 197 | 198 | # The name of an image file (relative to this directory) to place at the top of 199 | # the title page. 200 | #latex_logo = None 201 | 202 | # For "manual" documents, if this is true, then toplevel headings are parts, 203 | # not chapters. 
204 | #latex_use_parts = False 205 | 206 | # If true, show page references after internal links. 207 | #latex_show_pagerefs = False 208 | 209 | # If true, show URL addresses after external links. 210 | #latex_show_urls = False 211 | 212 | # Documents to append as an appendix to all manuals. 213 | #latex_appendices = [] 214 | 215 | # If false, no module index is generated. 216 | #latex_domain_indices = True 217 | 218 | 219 | # -- Options for manual page output -------------------------------------------- 220 | 221 | # One entry per manual page. List of tuples 222 | # (source start file, name, description, authors, manual section). 223 | man_pages = [ 224 | ('index', 'lassie', u'Lassie Documentation', 225 | [u'Mike Helmick'], 1) 226 | ] 227 | 228 | # If true, show URL addresses after external links. 229 | #man_show_urls = False 230 | 231 | 232 | # -- Options for Texinfo output ------------------------------------------------ 233 | 234 | # Grouping the document tree into Texinfo files. List of tuples 235 | # (source start file, target name, title, author, 236 | # dir menu entry, description, category) 237 | texinfo_documents = [ 238 | ('index', 'Lassie', u'Lassie Documentation', 239 | u'Mike Helmick', 'Lassie', 'One line description of project.', 240 | 'Miscellaneous'), 241 | ] 242 | 243 | # Documents to append as an appendix to all manuals. 244 | #texinfo_appendices = [] 245 | 246 | # If false, no module index is generated. 247 | #texinfo_domain_indices = True 248 | 249 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 250 | #texinfo_show_urls = 'footnote' 251 | 252 | # If true, do not generate a @detailmenu in the "Top" node's menu. 253 | #texinfo_no_detailmenu = False 254 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. Lassie documentation master file, created by 2 | sphinx-quickstart on Fri Aug 2 00:23:04 2013. 
3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Lassie 7 | ====== 8 | 9 | | Lassie is a Python library for retrieving basic content from websites. 10 | 11 | Usage 12 | ----- 13 | 14 | .. code-block:: python 15 | 16 | >>> import lassie 17 | >>> lassie.fetch('http://www.youtube.com/watch?v=dQw4w9WgXcQ') 18 | { 19 | 'description': u'Music video by Rick Astley performing Never Gonna Give You Up. YouTube view counts pre-VEVO: 2,573,462 (C) 1987 PWL', 20 | 'videos': [{ 21 | 'src': u'http://www.youtube.com/v/dQw4w9WgXcQ?autohide=1&version=3', 22 | 'height': 480, 23 | 'type': u'application/x-shockwave-flash', 24 | 'width': 640 25 | }, { 26 | 'src': u'https://www.youtube.com/embed/dQw4w9WgXcQ', 27 | 'height': 480, 28 | 'width': 640 29 | }], 30 | 'title': u'Rick Astley - Never Gonna Give You Up', 31 | 'url': u'http://www.youtube.com/watch?v=dQw4w9WgXcQ', 32 | 'keywords': [u'Rick', u'Astley', u'Sony', u'BMG', u'Music', u'UK', u'Pop'], 33 | 'images': [{ 34 | 'src': u'http://i1.ytimg.com/vi/dQw4w9WgXcQ/hqdefault.jpg?feature=og', 35 | 'type': u'og:image' 36 | }, { 37 | 'src': u'http://i1.ytimg.com/vi/dQw4w9WgXcQ/hqdefault.jpg', 38 | 'type': u'twitter:image' 39 | }, { 40 | 'src': u'http://s.ytimg.com/yts/img/favicon-vfldLzJxy.ico', 41 | 'type': u'favicon' 42 | }, { 43 | 'src': u'http://s.ytimg.com/yts/img/favicon_32-vflWoMFGx.png', 44 | 'type': u'favicon' 45 | }], 46 | 'locale': u'en_US' 47 | } 48 | 49 | 50 | User Guide 51 | ---------- 52 | 53 | .. toctree:: 54 | :maxdepth: 2 55 | 56 | usage/install 57 | 58 | .. toctree:: 59 | :maxdepth: 2 60 | 61 | usage/starting_out 62 | 63 | .. toctree:: 64 | :maxdepth: 2 65 | 66 | usage/advanced_usage 67 | 68 | 69 | Lassie API Documentation 70 | ------------------------ 71 | 72 | .. 
toctree:: 73 | :maxdepth: 2 74 | 75 | api 76 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% . 11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. xml to make Docutils-native XML files 37 | echo. pseudoxml to make pseudoxml-XML files for display purposes 38 | echo. linkcheck to check all external links for integrity 39 | echo. 
doctest to run all doctests embedded in the documentation if enabled 40 | goto end 41 | ) 42 | 43 | if "%1" == "clean" ( 44 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 45 | del /q /s %BUILDDIR%\* 46 | goto end 47 | ) 48 | 49 | 50 | %SPHINXBUILD% 2> nul 51 | if errorlevel 9009 ( 52 | echo. 53 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 54 | echo.installed, then set the SPHINXBUILD environment variable to point 55 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 56 | echo.may add the Sphinx directory to PATH. 57 | echo. 58 | echo.If you don't have Sphinx installed, grab it from 59 | echo.http://sphinx-doc.org/ 60 | exit /b 1 61 | ) 62 | 63 | if "%1" == "html" ( 64 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 65 | if errorlevel 1 exit /b 1 66 | echo. 67 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 68 | goto end 69 | ) 70 | 71 | if "%1" == "dirhtml" ( 72 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 73 | if errorlevel 1 exit /b 1 74 | echo. 75 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 76 | goto end 77 | ) 78 | 79 | if "%1" == "singlehtml" ( 80 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 81 | if errorlevel 1 exit /b 1 82 | echo. 83 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 84 | goto end 85 | ) 86 | 87 | if "%1" == "pickle" ( 88 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 89 | if errorlevel 1 exit /b 1 90 | echo. 91 | echo.Build finished; now you can process the pickle files. 92 | goto end 93 | ) 94 | 95 | if "%1" == "json" ( 96 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 97 | if errorlevel 1 exit /b 1 98 | echo. 99 | echo.Build finished; now you can process the JSON files. 100 | goto end 101 | ) 102 | 103 | if "%1" == "htmlhelp" ( 104 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 105 | if errorlevel 1 exit /b 1 106 | echo. 
107 | echo.Build finished; now you can run HTML Help Workshop with the ^ 108 | .hhp project file in %BUILDDIR%/htmlhelp. 109 | goto end 110 | ) 111 | 112 | if "%1" == "qthelp" ( 113 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 114 | if errorlevel 1 exit /b 1 115 | echo. 116 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 117 | .qhcp project file in %BUILDDIR%/qthelp, like this: 118 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\Lassie.qhcp 119 | echo.To view the help file: 120 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\Lassie.ghc 121 | goto end 122 | ) 123 | 124 | if "%1" == "devhelp" ( 125 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished. 129 | goto end 130 | ) 131 | 132 | if "%1" == "epub" ( 133 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 134 | if errorlevel 1 exit /b 1 135 | echo. 136 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 137 | goto end 138 | ) 139 | 140 | if "%1" == "latex" ( 141 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 142 | if errorlevel 1 exit /b 1 143 | echo. 144 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 145 | goto end 146 | ) 147 | 148 | if "%1" == "latexpdf" ( 149 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 150 | cd %BUILDDIR%/latex 151 | make all-pdf 152 | cd %BUILDDIR%/.. 153 | echo. 154 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 155 | goto end 156 | ) 157 | 158 | if "%1" == "latexpdfja" ( 159 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 160 | cd %BUILDDIR%/latex 161 | make all-pdf-ja 162 | cd %BUILDDIR%/.. 163 | echo. 164 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 165 | goto end 166 | ) 167 | 168 | if "%1" == "text" ( 169 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 170 | if errorlevel 1 exit /b 1 171 | echo. 172 | echo.Build finished. 
The text files are in %BUILDDIR%/text. 173 | goto end 174 | ) 175 | 176 | if "%1" == "man" ( 177 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 178 | if errorlevel 1 exit /b 1 179 | echo. 180 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 181 | goto end 182 | ) 183 | 184 | if "%1" == "texinfo" ( 185 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 186 | if errorlevel 1 exit /b 1 187 | echo. 188 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 189 | goto end 190 | ) 191 | 192 | if "%1" == "gettext" ( 193 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 194 | if errorlevel 1 exit /b 1 195 | echo. 196 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 197 | goto end 198 | ) 199 | 200 | if "%1" == "changes" ( 201 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 202 | if errorlevel 1 exit /b 1 203 | echo. 204 | echo.The overview file is in %BUILDDIR%/changes. 205 | goto end 206 | ) 207 | 208 | if "%1" == "linkcheck" ( 209 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 210 | if errorlevel 1 exit /b 1 211 | echo. 212 | echo.Link check complete; look for any errors in the above output ^ 213 | or in %BUILDDIR%/linkcheck/output.txt. 214 | goto end 215 | ) 216 | 217 | if "%1" == "doctest" ( 218 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 219 | if errorlevel 1 exit /b 1 220 | echo. 221 | echo.Testing of doctests in the sources finished, look at the ^ 222 | results in %BUILDDIR%/doctest/output.txt. 223 | goto end 224 | ) 225 | 226 | if "%1" == "xml" ( 227 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml 228 | if errorlevel 1 exit /b 1 229 | echo. 230 | echo.Build finished. The XML files are in %BUILDDIR%/xml. 231 | goto end 232 | ) 233 | 234 | if "%1" == "pseudoxml" ( 235 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml 236 | if errorlevel 1 exit /b 1 237 | echo. 238 | echo.Build finished. 
The pseudo-XML files are in %BUILDDIR%/pseudoxml. 239 | goto end 240 | ) 241 | 242 | :end 243 | -------------------------------------------------------------------------------- /docs/usage/advanced_usage.rst: -------------------------------------------------------------------------------- 1 | .. _advanced-usage: 2 | 3 | Advanced Usage 4 | ============== 5 | 6 | This section will cover how to use the ``Lassie`` class to maintain settings across all ``fetch`` calls. 7 | 8 | 9 | Class Level Attributes 10 | ---------------------- 11 | 12 | Constructing a ``Lassie`` class and calling ``fetch`` will use all the default params that are available to ``fetch``. 13 | 14 | .. code-block:: python 15 | 16 | >>> from lassie import Lassie 17 | >>> l = Lassie() 18 | 19 | >>> l.fetch('https://github.com/michaelhelmick') 20 | { 21 | 'images': [{ 22 | 'src': u'https://github.global.ssl.fastly.net/images/modules/logos_page/Octocat.png', 23 | 'type': u'og:image' 24 | }, { 25 | 'src': u'https://github.com/favicon.ico', 26 | 'type': u'favicon' 27 | }], 28 | 'url': 'https://github.com/michaelhelmick', 29 | 'description': u'michaelhelmick has 22 repositories written in Python, Shell, and JavaScript. Follow their code on GitHub.', 30 | 'videos': [], 31 | 'title': u'michaelhelmick (Mike Helmick) \xb7 GitHub' 32 | } 33 | >>> l.fetch('https://github.com/ashibble') 34 | { 35 | 'images': [{ 36 | 'src': u'https://github.global.ssl.fastly.net/images/modules/logos_page/Octocat.png', 37 | 'type': u'og:image' 38 | }, { 39 | 'src': u'https://github.com/favicon.ico', 40 | 'type': u'favicon' 41 | }], 42 | 'url': 'https://github.com/ashibble', 43 | 'description': u'Follow ashibble on GitHub and watch them build beautiful projects.', 44 | 'videos': [], 45 | 'title': u'ashibble (Alexander Shibble) \xb7 GitHub' 46 | } 47 | 48 | If you decide that you don't want to filter for Open Graph data, instead of declaring ``open_graph=False`` in every ``fetch`` call: 49 | 50 | .. 
code-block:: python 51 | 52 | >>> from lassie import Lassie 53 | >>> l = Lassie() 54 | >>> l.fetch('https://github.com/michaelhelmick', open_graph=False) 55 | >>> l.fetch('https://github.com/ashibble', open_graph=False) 56 | 57 | You can use the ``Lassie`` class and set attributes on the class. 58 | 59 | .. code-block:: python 60 | 61 | >>> from lassie import Lassie 62 | >>> l = Lassie() 63 | >>> l.open_graph = False 64 | 65 | >>> l.fetch('https://github.com/michaelhelmick') 66 | { 67 | 'images': [{ 68 | 'src': u'https://github.com/favicon.ico', 69 | 'type': u'favicon' 70 | }], 71 | 'url': 'https://github.com/michaelhelmick', 72 | 'description': u'michaelhelmick has 22 repositories written in Python, Shell, and JavaScript. Follow their code on GitHub.', 73 | 'videos': [], 74 | 'title': u'michaelhelmick (Mike Helmick) \xb7 GitHub' 75 | } 76 | >>> l.fetch('https://github.com/ashibble') 77 | { 78 | 'images': [{ 79 | 'src': u'https://github.com/favicon.ico', 80 | 'type': u'favicon' 81 | }], 82 | 'url': 'https://github.com/ashibble', 83 | 'description': u'Follow ashibble on GitHub and watch them build beautiful projects.', 84 | 'videos': [], 85 | 'title': u'ashibble (Alexander Shibble) \xb7 GitHub' 86 | } 87 | 88 | You'll notice the data for the Open Graph properties wasn't returned in the last responses. That's because passing ``open_graph=False`` tells Lassie to not filter for those properties. 89 | 90 | In the edge case that there is a time or two you want to override the class attribute, just pass the parameter to ``fetch`` and Lassie will use that parameter. 91 | 92 | ..
code-block:: python 93 | 94 | >>> from lassie import Lassie 95 | >>> l = Lassie() 96 | >>> l.open_graph = False 97 | 98 | >>> l.fetch('https://github.com/michaelhelmick') 99 | { 100 | 'images': [{ 101 | 'src': u'https://github.com/favicon.ico', 102 | 'type': u'favicon' 103 | }], 104 | 'url': 'https://github.com/michaelhelmick', 105 | 'description': u'michaelhelmick has 22 repositories written in Python, Shell, and JavaScript. Follow their code on GitHub.', 106 | 'videos': [], 107 | 'title': u'michaelhelmick (Mike Helmick) \xb7 GitHub' 108 | } 109 | >>> l.fetch('https://github.com/ashibble', open_graph=True) 110 | { 111 | 'images': [{ 112 | 'src': u'https://github.global.ssl.fastly.net/images/modules/logos_page/Octocat.png', 113 | 'type': u'og:image' 114 | }, { 115 | 'src': u'https://github.com/favicon.ico', 116 | 'type': u'favicon' 117 | }], 118 | 'url': 'https://github.com/ashibble', 119 | 'description': u'Follow ashibble on GitHub and watch them build beautiful projects.', 120 | 'videos': [], 121 | 'title': u'ashibble (Alexander Shibble) \xb7 GitHub' 122 | } 123 | 124 | 125 | Manipulate the Request (headers, proxies, etc.) 126 | ----------------------------------------------- 127 | 128 | There are times when you may want to turn SSL verification off, send custom headers, or add proxies for the request to go through. 129 | 130 | Lassie uses the `requests `_ library to make web requests. ``requests`` accepts a few parameters to allow developers to manipulate the actual HTTP request. 131 | 132 | Here is an example of sending custom headers to a lassie request: 133 | 134 | .. code-block:: python 135 | 136 | from lassie import Lassie 137 | 138 | l = Lassie() 139 | l.request_opts = { 140 | 'headers': { 141 | 'User-Agent': 'python lassie' 142 | } 143 | } 144 | l.fetch('http://google.com') 145 | 146 | Maybe you want to set a request timeout, here's another example: 147 | 148 | ..
code-block:: python 149 | 150 | from lassie import Lassie 151 | 152 | l = Lassie() 153 | l.request_opts = { 154 | 'timeout': 10 # 10 seconds 155 | } 156 | 157 | # If the response takes longer than 10 seconds this request will fail 158 | l.fetch('http://google.com') 159 | 160 | 161 | Playing Nice with non-HTML Files 162 | -------------------------------- 163 | 164 | Sometimes, you may want to grab information about an image or other type of file. Although only images are supported, you can retrieve a nicely structured ``dict``. 165 | 166 | Pass ``handle_file_content=True`` to ``lassie.fetch`` or set it on a ``Lassie`` instance. 167 | 168 | .. code-block:: python 169 | 170 | >>> import lassie 171 | 172 | >>> lassie.fetch('https://camo.githubusercontent.com/d19b279de191489445d8cfd39faf93e19ca2df14/68747470733a2f2f692e696d6775722e636f6d2f5172764e6641582e676966', handle_file_content=True) 173 | { 174 | 'title': '68747470733a2f2f692e696d6775722e636f6d2f5172764e6641582e676966', 175 | 'videos': [], 176 | 'url': 'https://camo.githubusercontent.com/d19b279de191489445d8cfd39faf93e19ca2df14/68747470733a2f2f692e696d6775722e636f6d2f5172764e6641582e676966', 177 | 'images': [{ 178 | 'type': 'body_image', 179 | 'src': 'https://camo.githubusercontent.com/d19b279de191489445d8cfd39faf93e19ca2df14/68747470733a2f2f692e696d6775722e636f6d2f5172764e6641582e676966' 180 | }] 181 | } 182 | 183 | >>> lassie.fetch('http://2.bp.blogspot.com/-vzGgFFtW-VY/Tz-eozaHw3I/AAAAAAAAM3k/OMvxpFYr23s/s1600/The-best-top-desktop-cat-wallpapers-10.jpg', handle_file_content=True) 184 | { 185 | 'title': 'The-best-top-desktop-cat-wallpapers-10.jpg', 186 | 'images': [{ 187 | 'type': 'body_image', 188 | 'src': 'http://2.bp.blogspot.com/-vzGgFFtW-VY/Tz-eozaHw3I/AAAAAAAAM3k/OMvxpFYr23s/s1600/The-best-top-desktop-cat-wallpapers-10.jpg' 189 | }], 190 | 'videos': [], 191 | 'url': 'http://2.bp.blogspot.com/-vzGgFFtW-VY/Tz-eozaHw3I/AAAAAAAAM3k/OMvxpFYr23s/s1600/The-best-top-desktop-cat-wallpapers-10.jpg' 192 | }
193 | -------------------------------------------------------------------------------- /docs/usage/install.rst: -------------------------------------------------------------------------------- 1 | .. _install: 2 | 3 | Installation 4 | ============ 5 | 6 | Information on how to properly install Lassie 7 | 8 | ******************************************************************************* 9 | 10 | Pip or Easy Install 11 | ------------------- 12 | 13 | Install Lassie via `pip `_ 14 | 15 | .. code-block:: bash 16 | 17 | $ pip install lassie 18 | 19 | or, with `easy_install `_ 20 | 21 | .. code-block:: bash 22 | 23 | $ easy_install lassie 24 | 25 | But, hey... `that's up to you `_. 26 | 27 | 28 | Source Code 29 | ----------- 30 | 31 | Lassie is actively maintained on GitHub 32 | 33 | Feel free to clone the repository 34 | 35 | .. code-block:: bash 36 | 37 | git clone git://github.com/michaelhelmick/lassie.git 38 | 39 | `tarball `_ 40 | 41 | .. code-block:: bash 42 | 43 | $ curl -OL https://github.com/michaelhelmick/lassie/tarball/master 44 | 45 | `zipball `_ 46 | 47 | .. code-block:: bash 48 | 49 | $ curl -OL https://github.com/michaelhelmick/lassie/zipball/master 50 | 51 | Now that you have the source code, install it into your site-packages directory 52 | 53 | .. code-block:: bash 54 | 55 | $ python setup.py install 56 | 57 | ******************************************************************************* 58 | 59 | So Lassie is installed! Now, head over to the :ref:`starting out ` section. 60 | -------------------------------------------------------------------------------- /docs/usage/starting_out.rst: -------------------------------------------------------------------------------- 1 | .. 
_starting-out: 2 | 3 | Starting Out 4 | ============ 5 | 6 | This section outlines the most basic uses of Lassie. 7 | 8 | ******************************************************************************* 9 | 10 | What Lassie Returns 11 | ------------------- 12 | 13 | Lassie aims to return the most beautifully crafted dictionary of important information about the web page. 14 | 15 | Beginning 16 | --------- 17 | 18 | So, let's say you want to retrieve details about a YouTube video. 19 | 20 | Specifically: http://www.youtube.com/watch?v=dQw4w9WgXcQ 21 | 22 | .. code-block:: python 23 | 24 | >>> import lassie 25 | >>> lassie.fetch('http://www.youtube.com/watch?v=dQw4w9WgXcQ') 26 | { 27 | 'description': u'Music video by Rick Astley performing Never Gonna Give You Up. YouTube view counts pre-VEVO: 2,573,462 (C) 1987 PWL', 28 | 'videos': [{ 29 | 'src': u'http://www.youtube.com/v/dQw4w9WgXcQ?version=3&autohide=1', 30 | 'height': 480, 31 | 'type': u'application/x-shockwave-flash', 32 | 'width': 640 33 | }, { 34 | 'src': u'https://www.youtube.com/embed/dQw4w9WgXcQ', 35 | 'height': 480, 36 | 'width': 640 37 | }], 38 | 'title': u'Rick Astley - Never Gonna Give You Up', 39 | 'url': u'http://www.youtube.com/watch?v=dQw4w9WgXcQ', 40 | 'keywords': [u'Rick', u' Astley', u' Sony', u' BMG', u' Music', u' UK', u' Pop'], 41 | 'images': [{ 42 | 'src': u'http://i1.ytimg.com/vi/dQw4w9WgXcQ/hqdefault.jpg?feature=og', 43 | 'type': u'og:image' 44 | }, { 45 | 'src': u'http://i1.ytimg.com/vi/dQw4w9WgXcQ/hqdefault.jpg', 46 | 'type': u'twitter:image' 47 | }, { 48 | 'src': u'http://s.ytimg.com/yts/img/favicon-vfldLzJxy.ico', 49 | 'type': u'favicon' 50 | }, { 51 | 'src': u'http://s.ytimg.com/yts/img/favicon_32-vflWoMFGx.png', 52 | 'type': u'favicon' 53 | }], 54 | 'locale': u'en_US' 55 | } 56 | 57 | Or what if you wanted to get information about an article? 58 | 59 | Specifically: http://techcrunch.com/2013/01/16/github-passes-the-3-million-developer-mark/ 60 | 61 | ..
code-block:: python 62 | 63 | >>> import lassie 64 | >>> lassie.fetch('http://techcrunch.com/2013/01/16/github-passes-the-3-million-developer-mark/') 65 | { 66 | 'description': u"GitHub has surpassed the 3 million-developer mark, a milestone for the collaborative platform for application development.\xa0GitHub said it happened Monday night on the first day of the company's\xa0all-hands winter summit. Launched\xa0in April 2008, GitHub\xa0celebrated\xa0its first million users in..", 67 | 'videos': [], 68 | 'title': u'GitHub Passes The 3 Million Developer Mark | TechCrunch', 69 | 'url': u'http://techcrunch.com/2013/01/16/github-passes-the-3-million-developer-mark/', 70 | 'locale': u'en_US', 71 | 'images': [{ 72 | 'src': u'http://tctechcrunch2011.files.wordpress.com/2013/01/github-logo.png?w=150', 73 | 'type': u'og:image' 74 | }, { 75 | 'src': u'http://tctechcrunch2011.files.wordpress.com/2013/01/github-logo.png', 76 | 'type': u'twitter:image' 77 | }, { 78 | 'src': u'http://s2.wp.com/wp-content/themes/vip/tctechcrunch2/images/favicon.ico?m=1357660109g', 79 | 'type': u'favicon' 80 | }, { 81 | 'src': u'http://s2.wp.com/wp-content/themes/vip/tctechcrunch2/images/favicon.ico?m=1357660109g', 82 | 'type': u'favicon' 83 | }] 84 | } 85 | 86 | Lassie, by default, also filters for content from Twitter Cards, grab favicons and touch icons. 87 | 88 | Priorities 89 | ---------- 90 | 91 | Open Graph values takes priority over other values (Twitter Card data, generic data, etc.) 92 | 93 | In other words, if a website has the title of their page as ``YouTube`` and they have their Open Graph title set ```` 94 | 95 | The value of ``title`` when you ``fetch`` the web page will return as "YouTube | A Video Sharing Site" instead of just "YouTube". 96 | 97 | But what if I don't want open graph data? 98 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 99 | 100 | Then pass ``open_graph=False`` to the ``fetch`` method. 101 | 102 | .. 
code-block:: python 103 | 104 | >>> lassie.fetch('http://techcrunch.com/2013/01/16/github-passes-the-3-million-developer-mark/', open_graph=False) 105 | { 106 | 'description': u"GitHub has surpassed the 3 million-developer mark, a milestone for the collaborative platform for application development.\xa0GitHub said it happened Monday night on the first day of the company's\xa0all-hands winter summit. Launched\xa0in April 2008, GitHub\xa0celebrated\xa0its first million users in..", 107 | 'videos': [], 108 | 'title': u'GitHub Passes The 3 Million Developer Mark | TechCrunch', 109 | 'url': u'http://techcrunch.com/2013/01/16/github-passes-the-3-million-developer-mark/', 110 | 'locale': u'en_US', 111 | 'images': [{ 112 | 'src': u'http://tctechcrunch2011.files.wordpress.com/2013/01/github-logo.png?w=150', 113 | 'type': u'og:image' 114 | }, { 115 | 'src': u'http://tctechcrunch2011.files.wordpress.com/2013/01/github-logo.png', 116 | 'type': u'twitter:image' 117 | }, { 118 | 'src': u'http://s2.wp.com/wp-content/themes/vip/tctechcrunch2/images/favicon.ico?m=1357660109g', 119 | 'type': u'favicon' 120 | }, { 121 | 'src': u'http://s2.wp.com/wp-content/themes/vip/tctechcrunch2/images/favicon.ico?m=1357660109g', 122 | 'type': u'favicon' 123 | }] 124 | } 125 | 126 | If you **don't** want Twitter cards, favicons or touch icons, use any combination of the following parameters and pass them to ``fetch``: 127 | 128 | - Pass ``twitter_card=False`` to exclude Twitter Card data from being filtered 129 | - Pass ``touch_icon=False`` to exclude the Apple touch icons from being added to the images array 130 | - Pass ``favicon=False`` to exclude the favicon from being added to the images array 131 | 132 | Obtaining All Images 133 | -------------------- 134 | 135 | Sometimes you might want to obtain a list of all the images on a web page... simple, just pass ``all_images=True`` to ``fetch``. 136 | 137 | .. 
code-block:: python 138 | 139 | >>> lassie.fetch('http://techcrunch.com/2013/01/16/github-passes-the-3-million-developer-mark/', all_images=True) 140 | { 141 | 'description': u"GitHub has surpassed the 3 million-developer mark, a milestone for the collaborative platform for application development.\xa0GitHub said it happened Monday night on the first day of the company's\xa0all-hands winter summit. Launched\xa0in April 2008, GitHub\xa0celebrated\xa0its first million users in..", 142 | 'videos': [], 143 | 'title': u'GitHub Passes The 3 Million Developer Mark | TechCrunch', 144 | 'url': u'http://techcrunch.com/2013/01/16/github-passes-the-3-million-developer-mark/', 145 | 'locale': u'en_US', 146 | 'images': [{ 147 | 'src': u'http://tctechcrunch2011.files.wordpress.com/2013/01/github-logo.png?w=150', 148 | 'type': u'og:image' 149 | }, { 150 | 'src': u'http://tctechcrunch2011.files.wordpress.com/2013/01/github-logo.png', 151 | 'type': u'twitter:image' 152 | }, { 153 | 'src': u'http://s2.wp.com/wp-content/themes/vip/tctechcrunch2/images/favicon.ico?m=1357660109g', 154 | 'type': u'favicon' 155 | }, { 156 | 'src': u'http://s2.wp.com/wp-content/themes/vip/tctechcrunch2/images/favicon.ico?m=1357660109g', 157 | 'type': u'favicon' 158 | }, { 159 | 'src': u'http://s2.wp.com/wp-content/themes/vip/tctechcrunch2/images/site-logo-cutout.png?m=1342508617g', 160 | 'alt': u'', 161 | 'type': u'body_image' 162 | }, { 163 | 'src': u'http://tctechcrunch2011.files.wordpress.com/2013/08/countdown4.jpg?w=640', 164 | 'alt': u'Main Event Page', 165 | 'type': u'body_image' 166 | }, { 167 | 'src': u'http://2.gravatar.com/avatar/b4e205744ae2f9b44921d103b4d80e54?s=60&d=identicon&r=G', 168 | 'alt': u'', 169 | 'height': 60, 170 | 'type': u'body_image', 171 | 'width': 60 172 | }, { 173 | 'src': u'http://tctechcrunch2011.files.wordpress.com/2013/01/github-logo.png?w=300', 174 | 'alt': u'github-logo', 175 | 'height': 300, 176 | 'type': u'body_image', 177 | 'width': 300 178 | }, { 179 | 'src': 
u'http://crunchbase.com/assets/images/resized/0001/7208/17208v9-max-150x150.png', 180 | 'alt': u'', 181 | 'type': u'body_image' 182 | }, { 183 | 'src': u'http://tctechcrunch2011.files.wordpress.com/2013/08/tardis-egg.jpg?w=89&h=64&crop=1', 184 | 'alt': '', 185 | 'type': u'body_image' 186 | }, { 187 | 'src': u'http://tctechcrunch2011.files.wordpress.com/2013/08/made-in-space-zero-gravity.jpg?w=89&h=64&crop=1', 188 | 'alt': '', 189 | 'type': u'body_image' 190 | }, { 191 | 'src': u'http://tctechcrunch2011.files.wordpress.com/2013/04/apple1.jpg?w=89&h=64&crop=1', 192 | 'alt': '', 193 | 'type': u'body_image' 194 | }, { 195 | 'src': u'http://tctechcrunch2011.files.wordpress.com/2013/08/p9130014.jpg?w=89&h=64&crop=1', 196 | 'alt': '', 197 | 'type': u'body_image' 198 | }, { 199 | 'src': u'http://tctechcrunch2011.files.wordpress.com/2013/08/htc.png?w=89&h=64&crop=1', 200 | 'alt': '', 201 | 'type': u'body_image' 202 | }, { 203 | 'src': u'http://tctechcrunch2011.files.wordpress.com/2013/08/screen-shot-2013-08-13-at-8-18-25-pm.png?w=89&h=64&crop=1', 204 | 'alt': '', 205 | 'type': u'body_image' 206 | }, { 207 | 'src': u'http://tctechcrunch2011.files.wordpress.com/2013/08/24112v5-max-250x250.jpg?w=89&h=63&crop=1', 208 | 'alt': '', 209 | 'type': u'body_image' 210 | }, { 211 | 'src': u'http://tctechcrunch2011.files.wordpress.com/2013/08/surface-14.jpg?w=89&h=64&crop=1', 212 | 'alt': '', 213 | 'type': u'body_image' 214 | }, { 215 | 'src': u'http://tctechcrunch2011.files.wordpress.com/2013/08/sprawl_tuned_robot.jpg?w=89&h=64&crop=1', 216 | 'alt': '', 217 | 'type': u'body_image' 218 | }, { 219 | 'src': u'http://tctechcrunch2011.files.wordpress.com/2013/08/ashton-kutcher-jobs.jpg?w=89&h=64&crop=1', 220 | 'alt': '', 221 | 'type': u'body_image' 222 | }, { 223 | 'src': u'http://tctechcrunch2011.files.wordpress.com/2013/08/facebook-commerce.png?w=89&h=64&crop=1', 224 | 'alt': '', 225 | 'type': u'body_image' 226 | }, { 227 | 'src': 
u'http://tctechcrunch2011.files.wordpress.com/2013/08/screen-shot-2013-08-14-at-10-23-20-am.png?w=89&h=64&crop=1', 228 | 'alt': '', 229 | 'type': u'body_image' 230 | }, { 231 | 'src': u'http://tctechcrunch2011.files.wordpress.com/2012/10/ibm_logo.jpg?w=89&h=64&crop=1', 232 | 'alt': '', 233 | 'type': u'body_image' 234 | }, { 235 | 'src': u'http://tctechcrunch2011.files.wordpress.com/2013/08/screen-shot-2013-08-15-at-12-09-16.png?w=89&h=64&crop=1', 236 | 'alt': '', 237 | 'type': u'body_image' 238 | }, { 239 | 'src': u'http://tctechcrunch2011.files.wordpress.com/2013/08/inklogo.jpg?w=89&h=64&crop=1', 240 | 'alt': '', 241 | 'type': u'body_image' 242 | }, { 243 | 'src': u'http://tctechcrunch2011.files.wordpress.com/2013/08/screen-shot-2013-08-15-at-9-31-21-am.png?w=89&h=64&crop=1', 244 | 'alt': '', 245 | 'type': u'body_image' 246 | }] 247 | } 248 | 249 | ******************************************************************************* 250 | 251 | So, now you know the basics. What if you don't want to declare params *every* time to the ``fetch`` method? Head over to the :ref:`advanced usage ` section to learn about the ``Lassie`` class. 252 | -------------------------------------------------------------------------------- /lassie/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # __ ______ ______ ______ __ ______ 4 | # /\ \ /\ __ \ /\ ___\ /\ ___\ /\ \ /\ ___\ 5 | # \ \ \____ \ \ __ \ \ \___ \ \ \___ \ \ \ \ \ \ __\ 6 | # \ \_____\ \ \_\ \_\ \/\_____\ \/\_____\ \ \_\ \ \_____\ 7 | # \/_____/ \/_/\/_/ \/_____/ \/_____/ \/_/ \/_____/ 8 | 9 | """ 10 | Lassie 11 | ------ 12 | 13 | Lassie is a Python library for retrieving basic content from websites. 
def fetch(url, **kwargs):
    """Fetch ``url`` through a freshly constructed :class:`Lassie` object.

    Convenience wrapper: a new :class:`Lassie` instance is created for the
    call, and ``url`` plus every keyword argument is forwarded untouched to
    its ``fetch`` method. The return value of that method is handed back
    as-is.

    Priority tree is as follows:
        1. OEmbed
        2. Open Graph
        3. Twitter Card
        4. Other meta content (i.e. description, keywords)

    :param url: URL to send a GET request to
    :param open_graph: (optional) If ``True``, filters web page content for Open Graph meta tags. The content of these properties have top priority on return values.
    :type open_graph: bool
    :param twitter_card: (optional) If ``True``, filters web page content for Twitter Card meta tags
    :type twitter_card: bool
    :param touch_icon: (optional) If ``True``, retrieves Apple touch icons and includes them in the response ``images`` array
    :type touch_icon: bool
    :param favicon: (optional) If ``True``, retrieves any favicon images and includes them in the response ``images`` array
    :type favicon: bool
    :param canonical: (optional) If ``True``, retrieves canonical url from meta tags. Default: False
    :type canonical: bool
    :param all_images: (optional) If ``True``, retrieves images inside web pages body and includes them in the response ``images`` array. Default: False
    :type all_images: bool
    :param parser: (optional) String reference for the parser that BeautifulSoup will use
    :type parser: string
    :param handle_file_content: (optional) If ``True``, lassie will return a generic response when a file is fetched. Default: False
    :type handle_file_content: bool

    """
    return Lassie().fetch(url, **kwargs)
def merge_settings(fetch_setting, class_setting):
    """Pick the effective value for a single ``fetch`` option.

    :param fetch_setting: Value handed to ``fetch``; ``None`` means the
                          caller did not supply one
    :param class_setting: Value stored on the ``Lassie`` instance
    :returns: ``fetch_setting`` unless it is ``None``, in which case
              ``class_setting`` is returned

    """
    # Only ``None`` counts as "not given": falsy values such as ``False``
    # are legitimate per-call overrides and must win over the class value.
    return class_setting if fetch_setting is None else fetch_setting
setattr(self.client, k, v) 80 | 81 | if not self.client.headers or not isinstance(self.client.headers, (dict, requests.structures.CaseInsensitiveDict)): 82 | self.client.headers = {} 83 | 84 | self.client.headers = requests.structures.CaseInsensitiveDict(self.client.headers) 85 | 86 | user_agent = self.client.headers.get('User-Agent') 87 | self.client.headers['User-Agent'] = determine_user_agent(user_agent) 88 | 89 | if user_agent != requests.utils.default_user_agent() and user_agent != FAKE_USER_AGENT: 90 | self.user_agent_set_manually = True 91 | else: 92 | self.user_agent_set_manually = False 93 | 94 | def __repr__(self): 95 | return '' % (self.parser) 96 | 97 | def fetch(self, url, open_graph=None, twitter_card=None, touch_icon=None, 98 | favicon=None, all_images=None, parser=None, handle_file_content=None, 99 | canonical=None): 100 | """Retrieves content from the specified url, parses it, and returns 101 | a beautifully crafted dictionary of important information about that 102 | web page. 103 | 104 | Priority tree is as follows: 105 | 1. OEmbed 106 | 2. Open Graph 107 | 3. Twitter Card 108 | 4. Other meta content (i.e. description, keywords) 109 | 110 | :param url: URL to send a GET request to 111 | :param open_graph: (optional) If ``True``, filters web page content for Open Graph meta tags. The content of these properties have top priority on return values. 112 | :type open_graph: bool 113 | :param twitter_card: (optional) If ``True``, filters web page content for Twitter Card meta tags 114 | :type twitter_card: bool 115 | :param touch_icon: (optional) If ``True``, retrieves Apple touch icons and includes them in the response ``images`` array 116 | :type touch_icon: bool 117 | :param favicon: (optional) If ``True``, retrieves any favicon images and includes them in the response ``images`` array 118 | :type favicon: bool 119 | :param canonical: (optional) If ``True``, retrieves canonical url from meta tags. 
Default: False 120 | :type canonical: bool 121 | :param all_images: (optional) If ``True``, retrieves images inside web pages body and includes them in the response ``images`` array. Default: False 122 | :type all_images: bool 123 | :param parser: (optional) String reference for the parser that BeautifulSoup will use 124 | :type parser: string 125 | :param handle_file_content: (optional) If ``True``, lassie will return a generic response when a file is fetched. Default: False 126 | :type handle_file_content: bool 127 | 128 | """ 129 | 130 | # Set params, method params have priority over class params 131 | open_graph = merge_settings(open_graph, self.open_graph) 132 | twitter_card = merge_settings(twitter_card, self.twitter_card) 133 | touch_icon = merge_settings(touch_icon, self.touch_icon) 134 | favicon = merge_settings(favicon, self.favicon) 135 | canonical = merge_settings(canonical, self.canonical) 136 | all_images = merge_settings(all_images, self.all_images) 137 | parser = merge_settings(parser, self.parser) 138 | handle_file_content = merge_settings(handle_file_content, self.handle_file_content) 139 | 140 | data = { 141 | 'images': [], 142 | 'videos': [], 143 | } 144 | 145 | has_file_content = False 146 | content_type = None 147 | if handle_file_content: 148 | headers, status_code = self._retrieve_headers(url) 149 | content_type = headers.get('Content-Type') 150 | has_file_content = content_type and not 'text/html' in content_type 151 | 152 | if has_file_content and content_type: 153 | has_image_content = content_type in IMAGE_MIMETYPES 154 | if has_image_content: 155 | parsed_url = urlparse(url) 156 | data['title'] = basename(parsed_url.path.lstrip('/')) # TODO: if the url doesn't have an extension, maybe we should match it up to the mimetype and append an ext? 
157 | data['url'] = url 158 | data['images'].append({ 159 | 'type': 'body_image', 160 | 'src': url, 161 | }) 162 | else: 163 | try: 164 | oembed_data, status_code = self._retrieve_oembed_data(url) 165 | parse_oembed_data(oembed_data, data) 166 | except LassieError: 167 | oembed_data = None 168 | 169 | html, status_code = self._retrieve_content(url) 170 | 171 | if not html and not oembed_data: 172 | raise LassieError('There was no content to parse.') 173 | 174 | if '(?:)?)', '', html) 176 | 177 | soup = BeautifulSoup(clean_text(html), parser) 178 | 179 | self._filter_amp_data(soup, data, url, all_images) 180 | 181 | if open_graph: 182 | self._filter_meta_data('open_graph', soup, data, url) 183 | 184 | if twitter_card: 185 | self._filter_meta_data('twitter_card', soup, data) 186 | 187 | self._filter_meta_data('generic', soup, data) 188 | 189 | if touch_icon: 190 | self._filter_link_tag_data('touch_icon', soup, data, url) 191 | 192 | if favicon: 193 | self._filter_link_tag_data('favicon', soup, data, url) 194 | 195 | if canonical: 196 | self._filter_link_tag_data('canonical', soup, data, url) 197 | 198 | if all_images: 199 | # Maybe filter out 1x1, no "good" way to do this if image doesn't supply 200 | # width/height. 
201 | self._find_all_images(soup, data, url) 202 | 203 | # TODO: Find a good place for setting url, title and locale 204 | if soup.html.get('lang'): 205 | lang = soup.html.get('lang') 206 | else: 207 | lang = soup.html.get('xml:lang') 208 | 209 | if lang and ('locale' not in data): 210 | locale = normalize_locale(lang) 211 | if locale: 212 | data['locale'] = locale 213 | 214 | data_url = data.get('url') 215 | if not data_url or (data_url in url and len(data_url) < len(url)): 216 | data['url'] = url 217 | 218 | if ('title' not in data or not data.get('title')) and hasattr(soup.title, 'string'): 219 | data['title'] = soup.title.string 220 | 221 | data['html'] = html 222 | 223 | data['status_code'] = status_code 224 | 225 | return data 226 | 227 | def _prepare_request(self, method, url, headers, **request_kwargs): 228 | request = Request(method, url, headers=headers) 229 | prepped = request.prepare() 230 | 231 | if not self.user_agent_set_manually: 232 | prepped.headers['User-Agent'] = determine_user_agent(prepped.headers.get('User-Agent')) 233 | 234 | return prepped 235 | 236 | def _retrieve_oembed_data(self, url): # pragma: no cover 237 | try: 238 | response = consumer.embed(url) 239 | oembed_data = response.getData() 240 | status_code = 200 241 | except Exception as e: 242 | raise LassieError(e) 243 | 244 | return oembed_data, status_code 245 | 246 | def _retrieve_headers(self, url): # pragma: no cover 247 | request_kwargs = self.merge_request_kwargs() 248 | 249 | try: 250 | request = self._prepare_request( 251 | 'HEAD', url, headers=self.client.headers, **request_kwargs 252 | ) 253 | response = self.client.send(request, **request_kwargs) 254 | except requests.exceptions.RequestException as e: 255 | raise LassieError(e) 256 | 257 | return response.headers, response.status_code 258 | 259 | def _retrieve_content(self, url): # pragma: no cover 260 | request_kwargs = self.merge_request_kwargs() 261 | 262 | try: 263 | request = self._prepare_request( 264 | 'GET', url, 
headers=self.client.headers, **request_kwargs 265 | ) 266 | response = self.client.send(request, **request_kwargs) 267 | except requests.exceptions.RequestException as e: 268 | raise LassieError(e) 269 | 270 | return response.text, response.status_code 271 | 272 | def merge_request_kwargs(self): 273 | request_kwargs = {} 274 | 275 | for k, v in self._request_opts.items(): 276 | if k in REQUEST_OPTS['request']: 277 | # Set request specific kwarg 278 | request_kwargs[k] = v 279 | 280 | return request_kwargs 281 | 282 | def _filter_meta_data(self, source, soup, data, url=None): 283 | """This method filters the web page content for meta tags that match patterns given in the ``FILTER_MAPS`` 284 | 285 | :param source: The key of the meta dictionary in ``FILTER_MAPS['meta']`` 286 | :type source: string 287 | :param soup: BeautifulSoup instance to find meta tags 288 | :type soup: instance 289 | :param data: The response dictionary to manipulate 290 | :type data: (dict) 291 | 292 | """ 293 | meta = FILTER_MAPS['meta'][source] 294 | meta_map = meta['map'] 295 | 296 | html = soup.find_all('meta', {meta['key']: meta['pattern']}) 297 | 298 | image = {} 299 | video = {} 300 | 301 | for line in html: 302 | prop = line.get(meta['key']) 303 | value = line.get('content') 304 | _prop = meta_map.get(prop) 305 | 306 | if prop in meta_map and _prop and not data.get(_prop): 307 | # this could be bad in cases where any values that the property 308 | # is mapped up to (i.e. 
"src", "type", etc) are found in ``data`` 309 | # TODO: Figure out a smoother way to prevent conflicts ^^^^^^^^ 310 | image_prop = meta['image_key'] 311 | video_prop = meta['video_key'] 312 | 313 | if prop.startswith((image_prop, video_prop)) and \ 314 | prop.endswith(('width', 'height')): 315 | if prop.endswith(('width', 'height')): 316 | value = convert_to_int(value) 317 | 318 | if meta_map[prop] == 'locale': 319 | locale = normalize_locale(value) 320 | if locale: 321 | data['locale'] = locale 322 | 323 | if prop == 'keywords': 324 | if isinstance(value, str): 325 | value = [v.strip() for v in value.split(',')] 326 | else: 327 | value = [] 328 | 329 | if image_prop and prop.startswith(image_prop) and value: 330 | # og:image URLs can be relative 331 | if prop == 'og:image' and url: 332 | value = urljoin(url, value) 333 | image[meta_map[prop]] = value 334 | elif video_prop and prop.startswith(video_prop) and value: 335 | video[meta_map[prop]] = value 336 | else: 337 | data[meta_map[prop]] = value 338 | 339 | if image: 340 | image['type'] = image_prop 341 | data['images'].append(image) 342 | if video: 343 | data['videos'].append(video) 344 | 345 | def _filter_link_tag_data(self, source, soup, data, url): 346 | """This method filters the web page content for link tags that match patterns given in the ``FILTER_MAPS`` 347 | 348 | :param source: The key of the meta dictionary in ``FILTER_MAPS['link']`` 349 | :type source: string 350 | :param soup: BeautifulSoup instance to find meta tags 351 | :type soup: instance 352 | :param data: The response dictionary to manipulate 353 | :type data: (dict) 354 | :param url: URL used for making an absolute url 355 | :type url: string 356 | 357 | """ 358 | link = FILTER_MAPS['link'][source] 359 | 360 | html = soup.find_all('link', {link['key']: link['pattern']}) 361 | 362 | if link['type'] == 'url': 363 | for line in html: 364 | data['url'] = line.get('href') 365 | else: 366 | for line in html: 367 | data['images'].append({ 368 | 
'src': urljoin(url, line.get('href')), 369 | 'type': link['type'], 370 | }) 371 | 372 | def _filter_amp_data(self, soup, data, url, all_images): 373 | amp_scripts = soup.find_all('script', {'type': 'application/ld+json'}) 374 | for script in amp_scripts: 375 | content = script.contents 376 | _json = None 377 | try: 378 | _json = json.loads(content[0]) 379 | except (IndexError, ValueError): 380 | continue 381 | 382 | if _json: 383 | if isinstance(_json, list): 384 | try: 385 | # if the json is a list (see #46), 386 | # set _json to the first item which _should_ be an object 387 | _json = _json[0] 388 | except IndexError: # pragma: no cover 389 | pass 390 | 391 | if isinstance(_json, object): 392 | image = _json.get('image') 393 | if image: 394 | if isinstance(image, str): 395 | data['images'].append({ 396 | 'src': urljoin(url, image), 397 | }) 398 | elif isinstance(image, list) or isinstance(image, object): 399 | if isinstance(image, list): 400 | image = image[0] 401 | 402 | try: 403 | image_list = image.get('@list') 404 | except AttributeError: 405 | image_list = [image] 406 | 407 | if image_list: 408 | for _image in image_list: 409 | if isinstance(_image, str): 410 | data['images'].append({ 411 | 'src': urljoin(url, _image), 412 | }) 413 | elif isinstance(_image, object): 414 | data['images'].append({ 415 | 'src': urljoin(url, _image.get('url')), 416 | 'width': convert_to_int(_image.get('width')), 417 | 'height': convert_to_int(_image.get('height')), 418 | }) 419 | elif not image_list and image.get('url') and url != image.get('url'): 420 | data['images'].append({ 421 | 'src': urljoin(url, image.get('url')), 422 | 'width': convert_to_int(image.get('width')), 423 | 'height': convert_to_int(image.get('height')), 424 | }) 425 | 426 | thumbnail_url = _json.get('thumbnailUrl') 427 | if thumbnail_url: 428 | data['images'].append({ 429 | 'src': urljoin(url, thumbnail_url), 430 | }) 431 | 432 | _type = _json.get('@type') 433 | if _type and _type == 'VideoObject': 434 | 
video_src = _json.get('embedUrl') 435 | 436 | if video_src: 437 | data['videos'].append({ 438 | 'src': video_src, 439 | 'width': convert_to_int(_json.get('width')), 440 | 'height': convert_to_int(_json.get('height')), 441 | }) 442 | 443 | thumbnail = _json.get('thumbnail') 444 | if thumbnail: 445 | if isinstance(thumbnail, str): 446 | data['images'].append({ 447 | 'src': urljoin(url, thumbnail), 448 | }) 449 | elif isinstance(thumbnail, object): 450 | if thumbnail.get('@list'): 451 | for _thumbnail in thumbnail.get('@list'): 452 | data['images'].append({ 453 | 'src': urljoin(url, _thumbnail.get('url')), 454 | 'width': convert_to_int(_thumbnail.get('width')), 455 | 'height': convert_to_int(_thumbnail.get('height')), 456 | }) 457 | else: 458 | data['images'].append({ 459 | 'src': urljoin(url, thumbnail.get('url')), 460 | 'width': convert_to_int(thumbnail.get('width')), 461 | 'height': convert_to_int(thumbnail.get('height')), 462 | }) 463 | 464 | data['title'] = _json.get('headline', '') 465 | data['url'] = _json.get('url', '') 466 | data['description'] = _json.get('description', '') 467 | 468 | if all_images: 469 | amp_imgs = soup.find_all('amp-img') 470 | for image in amp_imgs: 471 | item = normalize_image_data(image, url) 472 | 473 | data['images'].append(item) 474 | 475 | def _find_all_images(self, soup, data, url): 476 | """This method finds all images in the web page content 477 | 478 | :param soup: BeautifulSoup instance to find meta tags 479 | :type soup: instance 480 | :param data: The response dictionary to manipulate 481 | :type data: (dict) 482 | 483 | """ 484 | all_images = soup.find_all('img') 485 | for image in all_images: 486 | item = normalize_image_data(image, url) 487 | 488 | data['images'].append(item) 489 | -------------------------------------------------------------------------------- /lassie/exceptions.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | 
class LassieError(Exception):
    """Catch-all exception type for errors raised by Lassie."""
# Filter definition for Apple touch icons, consumed through
# ``FILTER_MAPS['link']['touch_icon']``.
APPLE_MAPS = {  # http://i.imgur.com/cla85xT.jpg
    'link': {
        'touch_icon': {
            # Touch icons are declared as
            # <link rel="apple-touch-icon[-precomposed]" href="...">.
            'pattern': re.compile(r"^(apple-touch-icon|apple-touch-icon-precomposed)", re.I),
            # Name of the <link> attribute the pattern is matched against.
            # This has to be ``rel``: <link> tags carry no ``icon``
            # attribute, so filtering on it can never match a touch icon.
            'key': 'rel',
            # Value stored under ``type`` for every image found by this filter.
            'type': str('touch_icon'),
        },
    }
}
def parse_oembed_data(oembed_data, data):
    """Parse OEmbed response data to inject into lassie's response dict.

    The raw response is always stored under ``data['oembed']``. Nothing else
    is extracted when the response carries no ``type``. Otherwise the title,
    a video entry (for ``type == 'video'``) and a thumbnail image are copied
    over when present.

    :param oembed_data: OEmbed response data.
    :type oembed_data: dict
    :param data: Reference to data variable being updated.
    :type data: dict
    :returns: the same ``data`` dict that was passed in

    """
    data['oembed'] = oembed_data

    _type = oembed_data.get('type')
    if not _type:
        return data

    title = oembed_data.get('title')
    if title:
        data['title'] = title

    if _type == 'video':
        item = {
            'width': convert_to_int(oembed_data.get('width')),
            'height': convert_to_int(oembed_data.get('height')),
        }
        keep_video = True

        if oembed_data.get('provider_name') in ('YouTube',):
            # The player URL is only available inside the embed markup.
            try:
                match = HYPERLINK_PATTERN.search(oembed_data.get('html'))
            except TypeError:
                # 'html' is missing or is not text
                match = None

            if match:
                item['src'] = match.group(0)
            else:
                # A YouTube entry without a player URL is useless; drop it
                # instead of reporting a video that has no ``src``.
                keep_video = False

        if keep_video:
            data.setdefault('videos', []).append(item)

    thumbnail_url = oembed_data.get('thumbnail_url')
    if thumbnail_url:
        data.setdefault('images', []).append({
            'width': convert_to_int(oembed_data.get('thumbnail_width')),
            'height': convert_to_int(oembed_data.get('thumbnail_height')),
            'src': thumbnail_url,
        })

    return data
8 | 9 | """ 10 | 11 | import re 12 | 13 | from ..compat import str 14 | 15 | SOCIAL_MAPS = { 16 | 'meta': { 17 | 'open_graph': { # http://ogp.me/ 18 | 'pattern': re.compile(r"^og:", re.I), 19 | 'map': { 20 | 'og:url': 'url', 21 | 'og:title': 'title', 22 | 'og:description': 'description', 23 | 'og:locale': 'locale', 24 | 'og:site_name': 'site_name', 25 | 26 | 'og:image': 'src', 27 | 'og:image:url': 'src', 28 | 'og:image:secure_url': 'secure_src', 29 | 'og:image:width': 'width', 30 | 'og:image:height': 'height', 31 | 'og:image:type': 'type', 32 | 33 | 'og:video': 'src', 34 | 'og:video:url': 'src', 35 | 'og:video:secure_url': 'secure_src', 36 | 'og:video:width': 'width', 37 | 'og:video:height': 'height', 38 | 'og:video:type': 'type', 39 | }, 40 | 'image_key': str('og:image'), 41 | 'video_key': str('og:video'), 42 | 'key': 'property', 43 | }, 44 | 'twitter_card': { # https://dev.twitter.com/docs/cards 45 | 'pattern': re.compile(r"^twitter:", re.I), 46 | 'map': { 47 | 'twitter:url': 'url', 48 | 'twitter:title': 'title', 49 | 'twitter:description': 'description', 50 | 51 | 'twitter:image': 'src', 52 | 'twitter:image:width': 'width', 53 | 'twitter:image:height': 'height', 54 | 55 | 'twitter:player': 'src', 56 | 'twitter:player:width': 'width', 57 | 'twitter:player:height': 'height', 58 | 'twitter:player:content_type': 'type', 59 | }, 60 | 'image_key': str('twitter:image'), 61 | 'video_key': str('twitter:player'), 62 | 'key': 'name', 63 | }, 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /lassie/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | lassie.helpers 5 | ~~~~~~~~~~~~~~ 6 | 7 | This module contains the set of helper functions executed by Lassie methods. 
def normalize_locale(value):
    """Attempts to turn a language tag into a ``language_TERRITORY`` locale

    Dashes are replaced with underscores and the result is passed through
    ``locale.normalize``. If that call changes the value, everything
    before the first ``.`` of the result is returned.

    :param value: Language or locale string, e.g. ``en-US``
    :type value: string
    :returns: The normalized locale, or ``None`` when ``value`` is empty or
              ``locale.normalize`` returned it unchanged

    """
    if not value:
        # Missing or empty attribute: nothing to normalize
        return None

    value = value.replace('-', '_')
    the_locale = locale.normalize(value)

    if the_locale == value:
        return None

    # Should we return the actual locale, returned from the locale lib instead of splitting?
    return str(the_locale.split('.')[0])
def determine_user_agent(user_agent):
    """Choose the User-Agent header value to send

    :param user_agent: Currently configured User-Agent, may be empty or ``None``
    :type user_agent: string
    :returns: ``user_agent`` when it is set and differs from the ``requests``
              default, otherwise ``FAKE_USER_AGENT``

    """
    if user_agent and user_agent != default_user_agent():
        return user_agent

    return FAKE_USER_AGENT
from websites and being returned in a pretty format.', 28 | include_package_data=True, 29 | packages=packages, 30 | classifiers=[ 31 | 'Development Status :: 4 - Beta', 32 | 'Intended Audience :: Developers', 33 | 'License :: OSI Approved :: MIT License', 34 | 'Topic :: Software Development :: Libraries :: Python Modules', 35 | 'Topic :: Internet', 36 | 'Natural Language :: English', 37 | 'Programming Language :: Python', 38 | 'Programming Language :: Python :: 2.7', 39 | 'Programming Language :: Python :: 3', 40 | 'Programming Language :: Python :: 3.8', 41 | ] 42 | ) 43 | -------------------------------------------------------------------------------- /test_requirements.txt: -------------------------------------------------------------------------------- 1 | -r requirements.txt 2 | python-coveralls==2.1.0 3 | nose-cov==1.6 4 | mock==1.0.1 5 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelhelmick/lassie/1122c719a68c20b847c1963719070e10a3d253dd/tests/__init__.py -------------------------------------------------------------------------------- /tests/base.py: -------------------------------------------------------------------------------- 1 | import json 2 | import mimetypes 3 | import unittest 4 | 5 | from mock import patch 6 | 7 | from lassie.compat import urlparse 8 | from lassie.core import Lassie 9 | from lassie.exceptions import LassieError 10 | 11 | 12 | def _mock_retrieve_oembed_data(mock, url): 13 | if '.json' not in url: 14 | return {}, 404 15 | 16 | try: 17 | filename = urlparse(url).path 18 | _file = open('./json%s' % filename, 'r') 19 | content = _file.read() 20 | _file.close() 21 | 22 | status_code = 200 23 | except Exception as e: 24 | raise LassieError(e) 25 | 26 | return json.loads(content), status_code 27 | 28 | 29 | def _mock_retrieve_content(mock, url): 30 | if '.html' not 
in url: 31 | filename = '/generic/all_properties.html' 32 | else: 33 | filename = urlparse(url).path 34 | 35 | _file = open('./templates%s' % filename, 'r') 36 | content = _file.read() 37 | _file.close() 38 | 39 | status_code = 200 40 | 41 | return content, status_code 42 | 43 | 44 | def _mock_retrieve_headers(mock, url): 45 | filename = urlparse(url).path 46 | 47 | headers = { 48 | 'Content-Type': mimetypes.guess_type(filename)[0] or 'application/octet-stream' 49 | } 50 | 51 | status_code = 200 52 | 53 | return headers, status_code 54 | 55 | 56 | class LassieBaseTestCase(unittest.TestCase): 57 | def setUp(self): 58 | self.patch = patch.object(Lassie, '_retrieve_content', _mock_retrieve_content) 59 | self.patch2 = patch.object(Lassie, '_retrieve_headers', _mock_retrieve_headers) 60 | self.patch3 = patch.object(Lassie, '_retrieve_oembed_data', _mock_retrieve_oembed_data) 61 | 62 | self.patch.start() 63 | self.patch2.start() 64 | self.patch3.start() 65 | 66 | def tearDown(self): 67 | self.patch.stop() 68 | self.patch2.stop() 69 | self.patch3.stop() 70 | -------------------------------------------------------------------------------- /tests/json/youtube/bad_html.json: -------------------------------------------------------------------------------- 1 | { 2 | "author_name": "BadAuthor", 3 | "author_url": "https://www.youtube.com/user/BadAuthor", 4 | "height": 270, 5 | "html": "", 6 | "provider_name": "YouTube", 7 | "provider_url": "https://www.youtube.com/", 8 | "thumbnail_height": 360, 9 | "thumbnail_url": "https://i.ytimg.com/vi/nothumb/hqdefault.jpg", 10 | "thumbnail_width": 480, 11 | "title": "Bad Author", 12 | "type": "video", 13 | "version": "1.0", 14 | "width": 480 15 | } 16 | -------------------------------------------------------------------------------- /tests/json/youtube/good.json: -------------------------------------------------------------------------------- 1 | { 2 | "author_name": "Himself0890", 3 | "author_url": 
"https://www.youtube.com/user/Himself0890", 4 | "height": 270, 5 | "html": "", 6 | "provider_name": "YouTube", 7 | "provider_url": "https://www.youtube.com/", 8 | "thumbnail_height": 360, 9 | "thumbnail_url": "https://i.ytimg.com/vi/lVhOLT2xQAc/hqdefault.jpg", 10 | "thumbnail_width": 480, 11 | "title": "Man vs Thunderjaw", 12 | "type": "video", 13 | "version": "1.0", 14 | "width": 480 15 | } 16 | -------------------------------------------------------------------------------- /tests/json/youtube/no_thumb.json: -------------------------------------------------------------------------------- 1 | { 2 | "author_name": "NoThumb", 3 | "author_url": "https://www.youtube.com/user/NoThumb", 4 | "height": 270, 5 | "html": "", 6 | "provider_name": "YouTube", 7 | "provider_url": "https://www.youtube.com/", 8 | "title": "Bad Type", 9 | "type": "video", 10 | "version": "1.0", 11 | "width": 480 12 | } 13 | -------------------------------------------------------------------------------- /tests/json/youtube/no_type.json: -------------------------------------------------------------------------------- 1 | { 2 | "author_name": "BadType", 3 | "author_url": "https://www.youtube.com/user/BadType", 4 | "height": 270, 5 | "html": "", 6 | "provider_name": "YouTube", 7 | "provider_url": "https://www.youtube.com/", 8 | "thumbnail_height": 360, 9 | "thumbnail_url": "https://i.ytimg.com/vi/nothumb/hqdefault.jpg", 10 | "thumbnail_width": 480, 11 | "title": "Bad Type", 12 | "type": "", 13 | "version": "1.0", 14 | "width": 480 15 | } 16 | -------------------------------------------------------------------------------- /tests/oembed/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelhelmick/lassie/1122c719a68c20b847c1963719070e10a3d253dd/tests/oembed/__init__.py -------------------------------------------------------------------------------- /tests/oembed/test_youtube.py: 
-------------------------------------------------------------------------------- 1 | from lassie import Lassie 2 | 3 | from ..base import LassieBaseTestCase 4 | 5 | 6 | class LassieOEmbedYouTubeTestCase(LassieBaseTestCase): 7 | def test_youtube_good(self): 8 | url = 'http://lassie.it/youtube/good.json' 9 | 10 | l = Lassie() 11 | data = l.fetch(url) 12 | 13 | self.assertEqual(len(data['videos']), 1) 14 | self.assertEqual(len(data['images']), 1) 15 | 16 | def test_bad_url(self): 17 | url = 'http://lassie.it/youtube/bad_url_123456.json' 18 | 19 | l = Lassie() 20 | data = l.fetch(url) 21 | 22 | self.assertIsNone(data.get('oembed')) 23 | 24 | def test_youtube_bad_html(self): 25 | url = 'http://lassie.it/youtube/bad_html.json' 26 | 27 | l = Lassie() 28 | data = l.fetch(url) 29 | 30 | def test_youtube_no_type(self): 31 | url = 'http://lassie.it/youtube/no_type.json' 32 | 33 | l = Lassie() 34 | data = l.fetch(url) 35 | 36 | def test_youtube_no_thumb(self): 37 | url = 'http://lassie.it/youtube/no_thumb.json' 38 | 39 | l = Lassie() 40 | data = l.fetch(url) 41 | 42 | self.assertEqual(len(data['videos']), 1) 43 | self.assertEqual(len(data['images']), 0) 44 | -------------------------------------------------------------------------------- /tests/templates/amp/bad_json.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Google Glass Is Dead, Long Live Snapchat Spectacles 7 | 8 | 9 | 12 | 41 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | -------------------------------------------------------------------------------- /tests/templates/amp/list_image_list_str.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Google Glass Is Dead, Long Live Snapchat Spectacles 7 | 8 | 9 | 12 | 36 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 |
103 | 104 | 105 |
106 | 117 |
118 |
119 |
120 |

Google Glass Is Dead, Long Live Snapchat Spectacles

121 |
122 |
123 | 124 | 125 | 126 |
127 |
128 | 129 |
Filed to:Spectacle
130 |
131 |
132 |
133 |
134 |
135 | 137 |
138 | Photo: Snapchat 139 |
140 |
141 |

It seems like it was ages ago that Google Glass was the future that nobody wanted. The wearable tech had at least one bad design flaw—it 142 | seemed to get its early adopters punched in the face because people didn’t like the camera being pointed at them. Now, Snapchat thinks people are finally ready for 143 | glasses-mounted personal recording devices.

144 |

Snapchat is betting that it wasn’t so much the fear of being assaulted that killed Google Glass; it’s just that people didn’t want to pay $1500 for the privilege. The millennial-approved social network is jumping into the hardware game with its $130 "Spectacles." Rather than trying to do everything a smartphone can, the frames will simply focus on looking "stylish" and recording 10-second bursts of circular video.

145 |
146 |
147 |

Advertisement

148 |
149 | 150 | 151 |
152 |

153 |
154 |
155 |

The glasses feature a fish-eye lens that captures videos at a 115-degree angle, which is closer to the eyes’ natural field of view. The user taps a button on the hinge, a ring of lights indicates to strangers that they are being filmed 156 | and a short clip is recorded. (Good luck with that.) The footage is then automatically pushed to Snapchat memories.

157 |

CEO Evan Spiegel recounted to WSJ Magazine the story 158 | of his eureka moment with the Spectacles:

159 |
160 |

"It was our first vacation, and we went to Big Sur for a day or two. We were walking through the woods, stepping over logs, looking up at the beautiful trees. And when I got the footage back and watched it, I could see my own memory, 161 | through my own eyes—it was unbelievable."

162 |
163 |

Ok, I’ll admit that a demonstration of the circular video is kind of cool:

164 |
165 |
166 |

Sponsored

167 | 168 | 169 |

170 |
171 |
172 |

173 | 174 |

175 |

For now, it seems that the company is taking the step into hardware cautiously and it plans to roll out the glasses to the public slowly and get a feel for how much demand is out there. Spiegel also refers to the product as a "toy" to 176 | downplay any perception that the company considers this a groundbreaking innovation.

177 |

Spiegel is also using the launch of his new toy to announce the corporate renaming of Snapchat to just Snap Inc., to help with product searches: "You can search Snapchat or Spectacles for the fun stuff and leave Snap Inc. for the Wall 178 | Street crowd." 179 |

180 |

Just in time for October, here’s the summery promotion video for Snapchat Spectacles.

181 |

182 | 183 | 184 | 185 |

186 |

[WSJ Magazine]

187 | 188 |
189 |
190 |
191 | 192 |
Read more!
193 |
194 | 195 |
196 | 216 |
217 | 218 | 219 | 220 | -------------------------------------------------------------------------------- /tests/templates/amp/list_json.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Google Glass Is Dead, Long Live Snapchat Spectacles 7 | 8 | 9 | 12 | 64 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | -------------------------------------------------------------------------------- /tests/templates/amp/list_thumbnail_image.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Google Glass Is Dead, Long Live Snapchat Spectacles 7 | 8 | 9 | 12 | 66 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | -------------------------------------------------------------------------------- /tests/templates/amp/str_image.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Google Glass Is Dead, Long Live Snapchat Spectacles 7 | 8 | 9 | 12 | 36 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 |
103 | 104 | 105 |
106 | 117 |
118 |
119 |
120 |

Google Glass Is Dead, Long Live Snapchat Spectacles

121 |
122 |
123 | 124 | 125 | 126 |
127 |
128 | 129 |
Filed to:Spectacle
130 |
131 |
132 |
133 |
134 |
135 | 137 |
138 | Photo: Snapchat 139 |
140 |
141 |

It seems like it was ages ago that Google Glass was the future that nobody wanted. The wearable tech had at least one bad design flaw—it 142 | seemed to get its early adopters punched in the face because people didn’t like the camera being pointed at them. Now, Snapchat thinks people are finally ready for 143 | glasses-mounted personal recording devices.

144 |

Snapchat is betting that it wasn’t so much the fear of being assaulted that killed Google Glass; it’s just that people didn’t want to pay $1500 for the privilege. The millennial-approved social network is jumping into the hardware game with its $130 "Spectacles." Rather than trying to do everything a smartphone can, the frames will simply focus on looking "stylish" and recording 10-second bursts of circular video.

145 |
146 |
147 |

Advertisement

148 |
149 | 150 | 151 |
152 |

153 |
154 |
155 |

The glasses feature a fish-eye lens that captures videos at a 115-degree angle, which is closer to the eyes’ natural field of view. The user taps a button on the hinge, a ring of lights indicates to strangers that they are being filmed 156 | and a short clip is recorded. (Good luck with that.) The footage is then automatically pushed to Snapchat memories.

157 |

CEO Evan Spiegel recounted to WSJ Magazine the story 158 | of his eureka moment with the Spectacles:

159 |
160 |

"It was our first vacation, and we went to Big Sur for a day or two. We were walking through the woods, stepping over logs, looking up at the beautiful trees. And when I got the footage back and watched it, I could see my own memory, 161 | through my own eyes—it was unbelievable."

162 |
163 |

Ok, I’ll admit that a demonstration of the circular video is kind of cool:

164 |
165 |
166 |

Sponsored

167 | 168 | 169 |

170 |
171 |
172 |

173 | 174 |

175 |

For now, it seems that the company is taking the step into hardware cautiously and it plans to roll out the glasses to the public slowly and get a feel for how much demand is out there. Spiegel also refers to the product as a "toy" to 176 | downplay any perception that the company considers this a groundbreaking innovation.

177 |

Spiegel is also using the launch of his new toy to announce the corporate renaming of Snapchat to just Snap Inc., to help with product searches: "You can search Snapchat or Spectacles for the fun stuff and leave Snap Inc. for the Wall 178 | Street crowd." 179 |

180 |

Just in time for October, here’s the summery promotion video for Snapchat Spectacles.

181 |

182 | 183 | 184 | 185 |

186 |

[WSJ Magazine]

187 | 188 |
189 |
190 |
191 | 192 |
Read more!
193 |
194 | 195 |
196 | 216 |
217 | 218 | 219 | 220 | -------------------------------------------------------------------------------- /tests/templates/amp/str_thumbnail_image.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Google Glass Is Dead, Long Live Snapchat Spectacles 7 | 8 | 9 | 12 | 59 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | -------------------------------------------------------------------------------- /tests/templates/amp/thumbnail_image.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Google Glass Is Dead, Long Live Snapchat Spectacles 7 | 8 | 9 | 12 | 64 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | -------------------------------------------------------------------------------- /tests/templates/amp/video_objects.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Google Glass Is Dead, Long Live Snapchat Spectacles 7 | 8 | 9 | 12 | 64 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | -------------------------------------------------------------------------------- /tests/templates/core/bad_image_dimensions.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Lassie Generic Test | All Properties 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /tests/templates/core/bad_keywords.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Lassie Generic Test | All Properties 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- 
/tests/templates/core/class_setting_is_none.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Lassie Core Test | Class setting is None 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /tests/templates/core/class_vs_method_settings.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Lassie Core Test | Class vs Method settings 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /tests/templates/core/empty.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelhelmick/lassie/1122c719a68c20b847c1963719070e10a3d253dd/tests/templates/core/empty.html -------------------------------------------------------------------------------- /tests/templates/core/image_dimensions.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Lassie Generic Test | All Properties 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /tests/templates/core/no_html_tag.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Lassie Generic Test | No HTML Tag 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /tests/templates/core/retrieve_all_images.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Lassie Core Test | Retrieve All Images 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 |

16 | 17 |

18 | 19 |

20 | 21 |

22 | 23 | 24 | -------------------------------------------------------------------------------- /tests/templates/generic/all_properties.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Lassie Generic Test | All Properties 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /tests/templates/generic/bad_locale.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Lassie Generic Test | Bad Locale 6 | 7 | 8 | 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /tests/templates/generic/canonical.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Lassie Generic Test | Canonical 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /tests/templates/generic/favicon.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Lassie Generic Test | Favicon 6 | 7 | 8 | 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /tests/templates/generic/no_title.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /tests/templates/handle_file_content/image_file.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelhelmick/lassie/1122c719a68c20b847c1963719070e10a3d253dd/tests/templates/handle_file_content/image_file.jpg -------------------------------------------------------------------------------- /tests/templates/open_graph/all_properties.html: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Lassie Open Graph Test | All Properties 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /tests/templates/open_graph/no_og_title_no_og_url.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Lassie Open Graph Test | No og:title, No og:url 6 | 7 | 8 | 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /tests/templates/open_graph/og_image_plus_two_body_images.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Lassie Open Graph Test | og:image Plus Two Body Images 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /tests/templates/open_graph/og_image_relative_url.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Lassie Open Graph Test | og:image with relative URL 6 | 7 | 8 | -------------------------------------------------------------------------------- /tests/templates/twitter_card/all_properties.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Lassie Twitter Card Test | All Properties 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /tests/templates/twitter_card/no_og_title_use_twitter_title.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Lassie Twitter Test | No og:title Use twitter:title 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 
-------------------------------------------------------------------------------- /tests/test_amp.py: -------------------------------------------------------------------------------- 1 | from lassie import Lassie 2 | 3 | from .base import LassieBaseTestCase 4 | 5 | 6 | class LassieAMPTestCase(LassieBaseTestCase): 7 | def test_all_properites(self): 8 | url = 'http://lassie.it/amp/all_properties.html' 9 | 10 | l = Lassie() 11 | data = l.fetch(url, all_images=True) 12 | 13 | self.assertEqual(len(data['images']), 3) 14 | 15 | title = 'Google Glass Is Dead, Long Live Snapchat Spectacles' 16 | self.assertEqual(data['title'], title) 17 | 18 | def test_bad_json(self): 19 | url = 'http://lassie.it/amp/bad_json.html' 20 | 21 | l = Lassie() 22 | data = l.fetch(url) 23 | 24 | self.assertTrue('amp' in data['url']) 25 | 26 | def test_str_image(self): 27 | url = 'http://lassie.it/amp/str_image.html' 28 | 29 | l = Lassie() 30 | data = l.fetch(url) 31 | 32 | self.assertEqual(1, len(data['images'])) 33 | 34 | def test_list_image(self): 35 | url = 'http://lassie.it/amp/list_image.html' 36 | 37 | l = Lassie() 38 | data = l.fetch(url) 39 | 40 | self.assertEqual(2, len(data['images'])) 41 | 42 | def test_list_image_list(self): 43 | url = 'http://lassie.it/amp/list_image_list.html' 44 | 45 | l = Lassie() 46 | data = l.fetch(url) 47 | 48 | self.assertEqual(2, len(data['images'])) 49 | 50 | def test_list_image_list_str(self): 51 | url = 'http://lassie.it/amp/list_image_list_str.html' 52 | 53 | l = Lassie() 54 | data = l.fetch(url) 55 | 56 | self.assertEqual(1, len(data['images'])) 57 | 58 | def test_list_image_str(self): 59 | url = 'http://lassie.it/amp/list_image_str.html' 60 | 61 | l = Lassie() 62 | data = l.fetch(url) 63 | 64 | self.assertEqual(1, len(data['images'])) 65 | 66 | def test_list_image_empty(self): 67 | url = 'http://lassie.it/amp/list_image_empty.html' 68 | 69 | l = Lassie() 70 | data = l.fetch(url) 71 | 72 | self.assertEqual(1, len(data['images'])) 73 | 74 | def 
test_list_json(self): 75 | url = 'http://lassie.it/amp/list_json.html' 76 | 77 | l = Lassie() 78 | data = l.fetch(url) 79 | 80 | self.assertTrue('Pixar' in data['description']) 81 | 82 | def test_video_objects(self): 83 | url = 'http://lassie.it/amp/video_objects.html' 84 | 85 | l = Lassie() 86 | data = l.fetch(url) 87 | 88 | self.assertEqual(1, len(data['videos'])) 89 | 90 | def test_thumbnail_image(self): 91 | url = 'http://lassie.it/amp/thumbnail_image.html' 92 | 93 | l = Lassie() 94 | data = l.fetch(url) 95 | 96 | self.assertEqual(2, len(data['images'])) 97 | 98 | def test_list_thumbnail_image(self): 99 | url = 'http://lassie.it/amp/list_thumbnail_image.html' 100 | 101 | l = Lassie() 102 | data = l.fetch(url) 103 | 104 | self.assertEqual(2, len(data['images'])) 105 | 106 | def test_str_thumbnail_image(self): 107 | url = 'http://lassie.it/amp/str_thumbnail_image.html' 108 | 109 | l = Lassie() 110 | data = l.fetch(url) 111 | 112 | self.assertEqual(2, len(data['images'])) 113 | -------------------------------------------------------------------------------- /tests/test_core.py: -------------------------------------------------------------------------------- 1 | from lassie import Lassie, LassieError 2 | from lassie.utils import FAKE_USER_AGENT 3 | 4 | from .base import LassieBaseTestCase 5 | 6 | 7 | class LassieCoreTestCase(LassieBaseTestCase): 8 | def test_core_class_vs_method_settings(self): 9 | url = 'http://lassie.it/core/class_vs_method_settings.html' 10 | 11 | l = Lassie() 12 | data = l.fetch(url) 13 | 14 | self.assertEqual(len(data['images']), 1) 15 | 16 | l.open_graph = False 17 | data = l.fetch(url) 18 | 19 | # open_graph is set to False so there shouldn't be any images in the list this time around 20 | self.assertEqual(len(data['images']), 0) 21 | 22 | def test_core_class_setting_is_none(self): 23 | url = 'http://lassie.it/core/class_setting_is_none.html' 24 | 25 | # This is a really odd use-case where they'd set the class attr to None, but it might 
happen so oh wellz. 26 | l = Lassie() 27 | l.open_graph = None 28 | data = l.fetch(url, open_graph=False) 29 | 30 | self.assertEqual(len(data['images']), 0) 31 | 32 | def test_core_no_content_raises_error(self): 33 | url = 'http://lassie.it/core/empty.html' 34 | 35 | l = Lassie() 36 | self.assertRaises(LassieError, l.fetch, url) 37 | 38 | def test_core_retrieve_all_images(self): 39 | url = 'http://lassie.it/core/retrieve_all_images.html' 40 | 41 | l = Lassie() 42 | l.all_images = True 43 | 44 | data = l.fetch(url) 45 | self.assertEqual(len(data['images']), 3) 46 | 47 | last_image = data['images'][2] 48 | self.assertEqual(last_image['width'], 550) 49 | self.assertEqual(last_image['height'], 365) 50 | 51 | def test_image_dimensions(self): 52 | url = 'http://lassie.it/core/image_dimensions.html' 53 | 54 | l = Lassie() 55 | data = l.fetch(url, all_images=True) 56 | 57 | self.assertEqual(len(data['images']), 4) 58 | 59 | image = data['images'][0] 60 | self.assertEqual(image['width'], 100) 61 | self.assertEqual(image['height'], 100) 62 | 63 | image = data['images'][1] 64 | self.assertEqual(image['width'], 100) 65 | self.assertEqual(image['height'], 100) 66 | 67 | image = data['images'][2] 68 | self.assertEqual(image['width'], 100) 69 | self.assertEqual(image['height'], 100) 70 | 71 | image = data['images'][3] 72 | self.assertEqual(image['width'], 100) 73 | self.assertEqual(image['height'], 100) 74 | 75 | def test_bad_image_dimensions(self): 76 | url = 'http://lassie.it/core/bad_image_dimensions.html' 77 | 78 | l = Lassie() 79 | data = l.fetch(url, all_images=True) 80 | 81 | # lassie.utils.convert_to_int will except a TypeError or ValueError and pass (not setting a width/height on the image) 82 | image = data['images'][0] 83 | self.assertTrue(not 'width' in image) 84 | self.assertTrue(not 'height' in image) 85 | 86 | def test_request_opts(self): 87 | l = Lassie() 88 | l.request_opts = { 89 | 'headers': { 90 | 'User-Agent': 'lassie python', 91 | }, 92 | 'timeout': 3 93 | } 
94 | 95 | self.assertTrue(set(('headers', 'timeout')).issubset(l.request_opts)) 96 | 97 | # If they modify one of the keys value, make sure it actually happened 98 | l.request_opts['headers'].update({'Content-Type': 'application/json'}) 99 | self.assertEqual(len(l.request_opts['headers']), 2) 100 | self.assertTrue(set(('User-Agent', 'Content-Type')).issubset(l.request_opts['headers'])) 101 | 102 | def test_request_opts_no_headers(self): 103 | l = Lassie() 104 | l.request_opts = { 105 | 'headers': {}, 106 | 'timeout': 3 107 | } 108 | 109 | # headers should be set to {} then User-Agent should be added 110 | self.assertTrue(l.client.headers != {}) 111 | 112 | def test_request_opts_default_user_agent(self): 113 | l = Lassie() 114 | l.request_opts = { 115 | 'timeout': 3 116 | } 117 | 118 | # headers should be set to {} then User-Agent should be added 119 | self.assertTrue(l.client.headers['User-Agent'] == FAKE_USER_AGENT) 120 | 121 | def test_bad_request_opts(self): 122 | l = Lassie() 123 | l.request_opts = { 124 | 'bad_key': True, 125 | 'headers': { 126 | 'User-Agent': 'lassie python' 127 | } 128 | } 129 | 130 | self.assertTrue('bad_key' not in l.request_opts) 131 | self.assertTrue('headers' in l.request_opts) 132 | 133 | def test_core_bad_keywords(self): 134 | url = 'http://lassie.it/core/bad_keywords.html' 135 | 136 | l = Lassie() 137 | data = l.fetch(url) 138 | self.assertEqual(data.get('keywords'), []) 139 | 140 | def test_merge_request_kwargs(self): 141 | l = Lassie() 142 | l.request_opts = { 143 | 'timeout': 3, 144 | } 145 | 146 | request_kwargs = l.merge_request_kwargs() 147 | self.assertTrue('timeout' in request_kwargs) 148 | 149 | def test_prepare_request(self): 150 | url = 'http://lassie.it/core/bad_keywords.html' 151 | 152 | l = Lassie() 153 | l._prepare_request('HEAD', url=url, headers=l.client.headers) 154 | 155 | def test_no_html_tag(self): 156 | url = 'http://lassie.it/core/no_html_tag.html' 157 | 158 | l = Lassie() 159 | data = l.fetch(url) 160 | 161 | 
self.assertTrue('no_html_tag' in data['title'])
# ^ Tail of a test method from tests/test_core.py. The method starts before
#   this chunk, so the statement is kept verbatim and left unedited.

# NOTE(review): every URL below lives under http://lassie.it/ and mirrors a
# path under tests/templates/. Presumably LassieBaseTestCase intercepts the
# HTTP request and serves the matching fixture -- confirm in tests/base.py.


# --------------------------------------------------------------------------------
# /tests/test_generic.py:
# --------------------------------------------------------------------------------
import lassie

from .base import LassieBaseTestCase


class LassieGenericTestCase(LassieBaseTestCase):
    """Tests for metadata read from generic (non Open Graph/Twitter) markup.

    This class was previously named ``LassieTwitterCardTestCase``, a
    copy-paste of the class in tests/test_twitter_card.py, which made
    failures here show up under the wrong name in test reports.
    """

    def test_generic_all_properties(self):
        """Locale, title, description, canonical url and keywords are all returned."""
        url = 'http://lassie.it/generic/all_properties.html'
        data = lassie.fetch(url, canonical=True)

        self.assertEqual(data['locale'], 'en_US')
        self.assertEqual(data['title'], 'Lassie Generic Test | all_properties')
        self.assertEqual(data['description'], 'Just a random description of a web page.')
        self.assertEqual(data['url'], 'http://example.com/canonical/path')
        self.assertEqual(len(data['keywords']), 5)

    def test_generic_bad_locale(self):
        """The bad_locale fixture must not produce a ``locale`` key."""
        url = 'http://lassie.it/generic/bad_locale.html'
        data = lassie.fetch(url)

        # assertNotIn reports the offending key and container on failure,
        # unlike assertTrue(not ... in ...) which only says "False is not true".
        self.assertNotIn('locale', data)

    def test_generic_favicon(self):
        """The favicon fixture yields exactly one image, typed ``favicon``."""
        url = 'http://lassie.it/generic/favicon.html'
        data = lassie.fetch(url)

        self.assertEqual(len(data['images']), 1)
        image = data['images'][0]

        self.assertEqual(image['type'], 'favicon')

    def test_no_title(self):
        """The no_title fixture must not produce a ``title`` key."""
        url = 'http://lassie.it/generic/no_title.html'
        data = lassie.fetch(url)

        self.assertNotIn('title', data)

    def test_canonical(self):
        """With ``canonical=True`` the reported url is the canonical one."""
        url = 'http://lassie.it/generic/canonical.html'
        data = lassie.fetch(url, canonical=True)

        self.assertEqual(data['url'], 'http://example.com/canonical/path')


# --------------------------------------------------------------------------------
# /tests/test_handle_file_content.py:
# --------------------------------------------------------------------------------
import lassie

from .base import LassieBaseTestCase


class LassieFileContentTestCase(LassieBaseTestCase):
    """Tests for ``handle_file_content=True`` when the URL is a file, not HTML."""

    def test_image_file(self):
        """An image URL is returned as its own url, title and single body image."""
        url = 'http://lassie.it/handle_file_content/image_file.jpg'
        data = lassie.fetch(url, handle_file_content=True)

        self.assertEqual(data['url'], url)
        self.assertEqual(data['title'], 'image_file.jpg')

        self.assertEqual(len(data['images']), 1)
        image = data['images'][0]
        # The image source is the fetched URL itself.
        self.assertEqual(image['src'], url)
        self.assertEqual(image['type'], 'body_image')


# --------------------------------------------------------------------------------
# /tests/test_open_graph.py:
# --------------------------------------------------------------------------------
import lassie

from .base import LassieBaseTestCase


class LassieOpenGraphTestCase(LassieBaseTestCase):
    """Tests for metadata read from Open Graph markup."""

    def test_open_graph_all_properties(self):
        """Page-level fields plus one image and one video are returned."""
        url = 'http://lassie.it/open_graph/all_properties.html'
        data = lassie.fetch(url)

        self.assertEqual(data['url'], url)
        self.assertEqual(data['title'], 'Lassie Open Graph All Properies Test')
        self.assertEqual(data['description'], 'Just a test template with OG data!')
        self.assertEqual(data['locale'], 'en_US')
        self.assertEqual(data['site_name'], 'Lassie')

        self.assertEqual(len(data['images']), 1)
        image = data['images'][0]
        self.assertEqual(image['src'], 'http://i.imgur.com/cvoR7zv.jpg')
        self.assertEqual(image['width'], 550)
        self.assertEqual(image['height'], 365)
        self.assertEqual(image['type'], 'og:image')

        self.assertEqual(len(data['videos']), 1)
        video = data['videos'][0]
        self.assertEqual(video['src'], 'http://www.youtube.com/v/dQw4w9WgXcQ?version=3&autohide=1')
        self.assertEqual(video['width'], 640)
        self.assertEqual(video['height'], 480)
        self.assertEqual(video['type'], 'application/x-shockwave-flash')

    def test_open_graph_no_og_title_no_og_url(self):
        """Without og:title / og:url the fetched url and another title are used."""
        url = 'http://lassie.it/open_graph/no_og_title_no_og_url.html'
        data = lassie.fetch(url)

        self.assertEqual(data['url'], url)
        self.assertEqual(data['title'], 'Lassie Open Graph Test | No og:title, No og:url')

    def test_open_graph_og_image_plus_two_body_images(self):
        """``all_images=True`` adds the two body images after the og:image."""
        url = 'http://lassie.it/open_graph/og_image_plus_two_body_images.html'

        # Default fetch: only one image is reported.
        data = lassie.fetch(url)
        self.assertEqual(len(data['images']), 1)

        # With all_images=True: three images, og:image first.
        data = lassie.fetch(url, all_images=True)
        self.assertEqual(len(data['images']), 3)

        image_0, image_1, image_2 = data['images']
        self.assertEqual(image_0['type'], 'og:image')
        self.assertEqual(image_1['type'], 'body_image')
        self.assertEqual(image_2['type'], 'body_image')

    def test_open_graph_og_image_relative_url(self):
        """The image src is reported as an absolute URL under the page's path.

        NOTE(review): presumably the fixture declares a relative og:image
        (``name.jpg``) -- confirm against the template.
        """
        url = 'http://lassie.it/open_graph/og_image_relative_url.html'
        data = lassie.fetch(url)

        self.assertEqual(
            data['images'][0]['src'], 'http://lassie.it/open_graph/name.jpg')


# --------------------------------------------------------------------------------
# /tests/test_twitter_card.py:
# --------------------------------------------------------------------------------
import lassie

from .base import LassieBaseTestCase


class LassieTwitterCardTestCase(LassieBaseTestCase):
    """Tests for metadata read from Twitter Card markup."""

    def test_twitter_all_properties(self):
        """Url, title, description, one image and one sized video are returned."""
        url = 'http://lassie.it/twitter_card/all_properties.html'
        data = lassie.fetch(url)
        self.assertEqual(data['url'], 'http://www.youtube.com/watch?v=fWNaR-rxAic')
        self.assertEqual(data['title'], 'Carly Rae Jepsen - Call Me Maybe')
        self.assertEqual(data['description'], 'Buy Now! http://smarturl.it/CallMeMaybe Music video by Carly Rae Jepsen performing Call Me Maybe. (C) 2011 604 Records Inc. #VEVOCertified on June 8, 2012. h...')

        self.assertEqual(len(data['images']), 1)
        image = data['images'][0]
        self.assertEqual(image['src'], 'http://i1.ytimg.com/vi/fWNaR-rxAic/maxresdefault.jpg')

        self.assertEqual(len(data['videos']), 1)
        video = data['videos'][0]
        self.assertEqual(video['src'], 'https://www.youtube.com/embed/fWNaR-rxAic')
        self.assertEqual(video['width'], 1920)
        self.assertEqual(video['height'], 1080)

    def test_twitter_no_og_title_use_twitter_title(self):
        """Description and title are still returned when og:title is absent.

        NOTE(review): the test name implies the title comes from the
        twitter:title tag -- confirm against the fixture.
        """
        url = 'http://lassie.it/twitter_card/no_og_title_use_twitter_title.html'
        data = lassie.fetch(url)

        self.assertEqual(data['description'], 'A test case for Lassie!')
        self.assertEqual(data['title'], 'Lassie Twitter Test | no_og_title_use_twitter_title')
# --------------------------------------------------------------------------------