├── .github └── workflows │ └── tests.yaml ├── .gitignore ├── .readthedocs.yaml ├── LICENSE ├── MANIFEST.in ├── README.rst ├── docs ├── Makefile ├── api.rst ├── conf.py ├── django.rst ├── examples.rst ├── flask.rst ├── getting_started.rst ├── index.rst ├── installation.rst └── make.bat ├── examples ├── __init__.py ├── django_ex │ ├── __init__.py │ ├── manage.py │ ├── settings.py │ ├── static │ │ └── style.css │ ├── templates │ │ └── example.html │ ├── urls.py │ └── views.py ├── flask_ex │ ├── app.py │ ├── static │ │ └── style.css │ └── templates │ │ └── example.html └── python_ex │ └── example.py ├── micawber ├── __init__.py ├── cache.py ├── compat.py ├── contrib │ ├── __init__.py │ ├── mcdjango │ │ ├── __init__.py │ │ ├── mcdjango_tests │ │ │ ├── __init__.py │ │ │ ├── models.py │ │ │ └── tests.py │ │ ├── models.py │ │ ├── providers.py │ │ ├── templates │ │ │ └── micawber │ │ │ │ ├── link.html │ │ │ │ ├── photo.html │ │ │ │ ├── rich.html │ │ │ │ └── video.html │ │ └── templatetags │ │ │ ├── __init__.py │ │ │ └── micawber_tags.py │ ├── mcflask.py │ └── providers.py ├── exceptions.py ├── parsers.py ├── providers.py ├── test_utils.py └── tests.py ├── runtests.py └── setup.py /.github/workflows/tests.yaml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | on: [push] 3 | jobs: 4 | tests: 5 | name: ${{ matrix.python-version }} 6 | runs-on: ubuntu-latest 7 | strategy: 8 | fail-fast: false 9 | matrix: 10 | python-version: [3.8, "3.10", "3.12", "3.13"] 11 | steps: 12 | - uses: actions/checkout@v2 13 | - uses: actions/setup-python@v2 14 | with: 15 | python-version: ${{ matrix.python-version }} 16 | - name: pip deps 17 | run: pip install django bs4 18 | - name: runtests 19 | run: python runtests.py 20 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[co] 
-------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | build: 3 | os: ubuntu-22.04 4 | tools: 5 | python: "3.11" 6 | sphinx: 7 | configuration: docs/conf.py 8 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2010 Charles Leifer 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 
20 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include MANIFEST.in 2 | include LICENSE 3 | include README.rst 4 | include runtests.py 5 | recursive-include micawber/contrib/mcdjango/templates * 6 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | .. image:: http://media.charlesleifer.com/blog/photos/micawber-logo-0.png 2 | 3 | A small library for extracting rich content from urls. 4 | 5 | 6 | what does it do? 7 | ---------------- 8 | 9 | micawber supplies a few methods for retrieving rich metadata about a variety of 10 | links, such as links to youtube videos. micawber also provides functions for 11 | parsing blocks of text and html and replacing links to videos with rich embedded 12 | content. 13 | 14 | examples 15 | -------- 16 | 17 | here is a quick example: 18 | 19 | .. 
code-block:: python 20 | 21 | import micawber 22 | 23 | # load up rules for some default providers, such as youtube and flickr 24 | providers = micawber.bootstrap_basic() 25 | 26 | providers.request('http://www.youtube.com/watch?v=54XHDUOHuzU') 27 | 28 | # returns the following dictionary: 29 | { 30 | 'author_name': 'pascalbrax', 31 | 'author_url': u'http://www.youtube.com/user/pascalbrax', 32 | 'height': 344, 33 | 'html': u'', 34 | 'provider_name': 'YouTube', 35 | 'provider_url': 'http://www.youtube.com/', 36 | 'title': 'Future Crew - Second Reality demo - HD', 37 | 'type': u'video', 38 | 'thumbnail_height': 360, 39 | 'thumbnail_url': u'http://i2.ytimg.com/vi/54XHDUOHuzU/hqdefault.jpg', 40 | 'thumbnail_width': 480, 41 | 'url': 'http://www.youtube.com/watch?v=54XHDUOHuzU', 42 | 'width': 459, 43 | 'version': '1.0', 44 | } 45 | 46 | providers.parse_text('this is a test:\nhttp://www.youtube.com/watch?v=54XHDUOHuzU') 47 | 48 | # returns the following string: 49 | this is a test: 50 | 51 | 52 | providers.parse_html('
http://www.youtube.com/watch?v=54XHDUOHuzU
') 53 | 54 | # returns the following html: 55 | 56 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # Internal variables. 11 | PAPEROPT_a4 = -D latex_paper_size=a4 12 | PAPEROPT_letter = -D latex_paper_size=letter 13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 14 | # the i18n builder cannot share the environment and doctrees with the others 15 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 16 | 17 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 18 | 19 | help: 20 | @echo "Please use \`make{{ object.body|oembed:"600x600" }}
34 | 35 | Each filter accepts one argument and one optional argument, due to django's template 36 | filters being wack. 37 | 38 | Piping a string through the ``oembed`` filter (or ``oembed_html``) will convert 39 | URLs to things like youtube videos into video players. A couple things to 40 | understand about the parsers: 41 | 42 | * the plaintext parser (``oembed``) will convert URLs *on their own line* into 43 | full images/video-players/etc. URLs that are interspersed within text will 44 | simply be converted into clickable links so as not to disrupt the flow of text. 45 | * the HTML parser (``oembed_html``) will convert URLs that *are not already links* 46 | into full images/video-players/etc. URLs within block elements along with other 47 | text will be converted into clickable links as this would likely disrupt the flow 48 | of text or produce invalid HTML. 49 | 50 | .. note:: 51 | You can control how things are rendered -- check out `the default templates{{ object.body|oembed(html=False, maxwidth=600, maxheight=600) }}
30 | {% endblock %} 31 | 32 | Flask filter API 33 | ---------------- 34 | 35 | .. py:module:: micawber.contrib.mcflask 36 | 37 | The following filters are exposed via the :py:mod:`micawber.contrib.mcflask` module: 38 | 39 | .. py:function:: oembed(text, urlize_all=True, html=False, **params) 40 | 41 | Parse the given text, rendering URLs as rich media 42 | 43 | Usage within a Jinja2 template: 44 | 45 | .. code-block:: python 46 | 47 | {{ blog_entry.body|oembed(urlize_all=False, maxwidth=600) }} 48 | 49 | :param text: the text to be parsed, can be HTML 50 | :param urlize_all: boolean indicating whether to convert bare links to clickable ones 51 | :param html: boolean indicating whether text is plaintext or markup 52 | :param params: any additional keyword arguments, e.g. maxwidth or an api key 53 | :rtype: parsed text with rich content embedded 54 | 55 | .. py:function:: extract_oembed(text, html=False, **params) 56 | 57 | Returns a 2-tuple containing 58 | 59 | * a list of all URLs found within the text (if HTML, all URLs that aren't already links) 60 | * a dictionary of URL to metadata provided by the API endpoint 61 | 62 | .. note:: 63 | Not all URLs listed will have matching entries in the dictionary, since there 64 | may not be a provider for them. 65 | 66 | :param text: the text to be parsed, can be HTML 67 | :param html: boolean indicating whether text is plaintext or markup 68 | :param params: any additional keyword arguments, e.g. maxwidth or an api key 69 | :rtype: 2-tuple containing a list of *all* urls and a dictionary of url -> metadata 70 | 71 | Adding filters to the Jinja Environment 72 | --------------------------------------- 73 | 74 | To actually use these filters they must be made available to the application. Use the 75 | following function to do this sometime after initializing your ``Flask`` app: 76 | 77 | .. 
py:function:: add_oembed_filters(app, providers) 78 | 79 | Add the ``oembed`` and ``extract_oembed`` filters to the jinja environment 80 | 81 | :param app: a flask application 82 | :param providers: a :py:class:`micawber.providers.ProviderRegistry` instance 83 | :rtype: (no return value) 84 | -------------------------------------------------------------------------------- /docs/getting_started.rst: -------------------------------------------------------------------------------- 1 | .. _getting_started: 2 | 3 | Getting Started 4 | =============== 5 | 6 | If you want the dead simple get-me-up-and-running, try the following: 7 | 8 | .. code-block:: python 9 | 10 | >>> import micawber 11 | >>> providers = micawber.bootstrap_basic() # may take a second 12 | >>> print(providers.parse_text('this is a test:\nhttp://www.youtube.com/watch?v=54XHDUOHuzU')) 13 | this is a test: 14 | 15 | 16 | Using django? Add ``micawber.contrib.mcdjango`` to your ``INSTALLED_APPS``, then 17 | in your templates: 18 | 19 | .. code-block:: html 20 | 21 | {% load micawber_tags %} 22 | {# show a video player for the youtube video #} 23 | {{ "http://www.youtube.com/watch?v=mQEWI1cn7HY"|oembed }} 24 | 25 | Using flask? Use the ``add_oembed_filters`` function to register two jinja 26 | template filters, ``oembed`` and ``extract_oembed``: 27 | 28 | .. code-block:: python 29 | 30 | from flask import Flask 31 | from micawber.providers import bootstrap_basic 32 | from micawber.contrib.mcflask import add_oembed_filters 33 | 34 | app = Flask(__name__) 35 | 36 | oembed_providers = bootstrap_basic() 37 | add_oembed_filters(app, oembed_providers) 38 | 39 | .. code-block:: html 40 | 41 | {# show a video player for the youtube video #} 42 | {{ "http://www.youtube.com/watch?v=mQEWI1cn7HY"|oembed() }} 43 | 44 | Overview 45 | -------- 46 | 47 | micawber is rather simple. It is built to use the `oembedhttp://www.youtube.com/watch?v=54XHDUOHuzU
') 171 | 172 | # yields the following output: 173 | 174 | 175 | If you would rather extract metadata, there are two functions: 176 | 177 | * :py:meth:`~micawber.providers.ProviderRegistry.extract`, which finds all URLs 178 | within a block of text and returns a dictionary of metadata for each. 179 | * :py:meth:`~micawber.providers.ProviderRegistry.extract_html`, which finds 180 | URLs within HTML and returns a dictionary of metadata for each. 181 | 182 | The :ref:`API docshttp://www.youtube.com/watch?v=54XHDUOHuzU
') 61 | 62 | # returns the following html: 63 | 64 | 65 | check out the :ref:`getting startedThis is a test
7 |http://www.youtube.com/watch?v=nda_OSWeyn8
8 |This will get rendered as a link: http://www.youtube.com/watch?v=nda_OSWeyn8
9 |This will not be modified: http://www.youtube.com/watch?v=nda_OSWeyn8
10 | """) 11 | return render_to_response('example.html', dict( 12 | text=text, 13 | html=html, 14 | )) 15 | -------------------------------------------------------------------------------- /examples/flask_ex/app.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, render_template, request 2 | from micawber.providers import bootstrap_basic 3 | from micawber.contrib.mcflask import add_oembed_filters 4 | 5 | app = Flask(__name__) 6 | app.config['DEBUG'] = True 7 | 8 | oembed_providers = bootstrap_basic() 9 | add_oembed_filters(app, oembed_providers) 10 | 11 | @app.route('/') 12 | def example_view(): 13 | text = request.args.get('text', 'http://www.youtube.com/watch?v=nda_OSWeyn8') 14 | html = request.args.get('html', """ 15 |This is a test
16 |http://www.youtube.com/watch?v=nda_OSWeyn8
17 |This will get rendered as a link: http://www.youtube.com/watch?v=nda_OSWeyn8
18 |This will not be modified: http://www.youtube.com/watch?v=nda_OSWeyn8
19 | """) 20 | return render_template('example.html', text=text, html=html) 21 | 22 | if __name__ == '__main__': 23 | app.run() 24 | -------------------------------------------------------------------------------- /examples/flask_ex/static/style.css: -------------------------------------------------------------------------------- 1 | body { font-family: sans-serif; background: #eee; } 2 | a, h1, h2 { color: #377BA8; } 3 | h1, h2 { font-family: 'Georgia', serif; margin: 0; } 4 | h1 { border-bottom: 2px solid #eee; } 5 | h2 { font-size: 1.2em; } 6 | 7 | .page { margin: 2em auto; width: 35em; border: 5px solid #ccc; 8 | padding: 0.8em; background: white; } 9 | .entries { list-style: none; margin: 0; padding: 0; } 10 | .entries li { margin: 0.8em 1.2em; } 11 | .entries li h2 { margin-left: -1em; } 12 | .add-entry { font-size: 0.9em; border-bottom: 1px solid #ccc; } 13 | .add-entry dl { font-weight: bold; } 14 | .metanav { text-align: right; font-size: 0.8em; padding: 0.3em; 15 | margin-bottom: 1em; background: #fafafa; } 16 | .flash { background: #CEE5F5; padding: 0.5em; 17 | border: 1px solid #AACBE2; } 18 | .error { background: #F0D6D6; padding: 0.5em; } 19 | -------------------------------------------------------------------------------- /examples/flask_ex/templates/example.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |%s
\nthis is inline: %s
\n\n%s\n
last test\n%s\n
' 65 | 66 | test_str = frame % (url, url, url, url) 67 | 68 | parsed = self.render('{{ test_str|oembed_html }}', test_str=test_str) 69 | self.assertHTMLEqual(parsed, frame % (expected, expected_inline, expected, expected_inline)) 70 | 71 | for url, expected in self.full_pairs.items(): 72 | expected_inline = self.inline_pairs[url] 73 | frame = '\nthis is inline: %s
\nlast test\n%s\n
' 74 | 75 | test_str = frame % (url, url, url) 76 | 77 | parsed = self.render('{{ test_str|oembed_html }}', test_str=test_str) 78 | self.assertHTMLEqual(parsed, frame % (url, expected_inline, expected_inline)) 79 | 80 | def test_urlize(self): 81 | u1 = 'http://fappio.com/' 82 | u2 = 'http://google.com/fap/' 83 | u1h = '%s' % (u1, u1) 84 | u2h = '%s' % (u2, u2) 85 | for url, expected in self.full_pairs.items(): 86 | expected_inline = self.inline_pairs[url] 87 | frame = 'test %s\n%s\n%s\nand another %s' 88 | 89 | test_str = frame % (u1, u2, url, url) 90 | 91 | parsed = self.render('{{ test_str|oembed }}', test_str=test_str) 92 | self.assertEqual(parsed, frame % (u1h, u2h, expected, expected_inline)) 93 | 94 | def test_oembed_filter_extension(self): 95 | for url, expected in self.full_pairs.items(): 96 | expected_inline = self.inline_pairs[url] 97 | frame = 'test http://fappio.com\nhttp://google.com\n%s\nand another %s' 98 | 99 | test_str = frame % (url, url) 100 | 101 | parsed = self.render('{{ test_str|oembed_no_urlize }}', test_str=test_str) 102 | self.assertEqual(parsed, frame % (expected, expected_inline)) 103 | 104 | def test_extract_filter(self): 105 | blank = 'http://fapp.io/foo/' 106 | frame = 'test %s\n%s\n%s\n%s at last' 107 | frame_html = 'test %s
%s %s
%s
' 108 | 109 | t = """{% for url, data in test_str|extract_oembed %}{{ url }}\n{% endfor %}""" 110 | t2 = """{% for url, data in test_str|extract_oembed_html %}{{ url }}\n{% endfor %}""" 111 | 112 | for url, expected in self.data_pairs.items(): 113 | test_str = frame % (url, blank, url, blank) 114 | rendered = self.render(t, test_str=test_str) 115 | self.assertEqual(rendered, url) 116 | 117 | test_str = frame_html % (url, blank, url, blank) 118 | rendered = self.render(t, test_str=test_str) 119 | self.assertEqual(rendered, url) 120 | -------------------------------------------------------------------------------- /micawber/contrib/mcdjango/models.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coleifer/micawber/1e8b8af3cc3c3f2a7d597a100b4de71533d24ceb/micawber/contrib/mcdjango/models.py -------------------------------------------------------------------------------- /micawber/contrib/mcdjango/providers.py: -------------------------------------------------------------------------------- 1 | from django.conf import settings 2 | from django.core.cache import cache 3 | 4 | from micawber.providers import bootstrap_basic as _bootstrap_basic, bootstrap_embedly as _bootstrap_embedly 5 | 6 | 7 | def bootstrap_basic(): 8 | return _bootstrap_basic(cache) 9 | 10 | def bootstrap_embedly(): 11 | key = getattr(settings, 'MICAWBER_EMBEDLY_KEY', None) 12 | params = {} 13 | if key: 14 | params['key'] = key 15 | return _bootstrap_embedly(cache, **params) 16 | -------------------------------------------------------------------------------- /micawber/contrib/mcdjango/templates/micawber/link.html: -------------------------------------------------------------------------------- 1 | {{ response.title }} 2 | -------------------------------------------------------------------------------- /micawber/contrib/mcdjango/templates/micawber/photo.html: 
-------------------------------------------------------------------------------- 1 |%s
' % url) 125 | self.assertHTMLEqual(parsed, '%s
' % expected) 126 | 127 | def test_parse_text(self): 128 | for url, expected in self.inline_pairs.items(): 129 | parsed = test_pr.parse_text('this is inline: %s' % url) 130 | self.assertHTMLEqual(parsed, 'this is inline: %s' % expected) 131 | 132 | # We can disable parsing inline links by specifying block_handler=None. 133 | for url, expected in self.inline_pairs.items(): 134 | parsed = test_pr.parse_text('this is inline: %s' % url, block_handler=None) 135 | self.assertEqual(parsed, 'this is inline: %s' % url) 136 | 137 | # if the link comes on its own line it gets included in full 138 | for url, expected in self.full_pairs.items(): 139 | parsed = test_pr.parse_text(url) 140 | self.assertHTMLEqual(parsed, expected) 141 | 142 | # Specifying block_handler=None only applies to inline links, so 143 | # the behavior is the same for standalone links. 144 | parsed = test_pr.parse_text(url, block_handler=None) 145 | self.assertHTMLEqual(parsed, expected) 146 | 147 | # links inside block tags will render as inline 148 | frame = 'Testing %s
' 149 | for url, expected in self.inline_pairs.items(): 150 | parsed = test_pr.parse_html(frame % (url)) 151 | self.assertHTMLEqual(parsed, frame % (expected)) 152 | 153 | # links inside tags won't change at all 154 | frame = '' 155 | for url, expected in self.inline_pairs.items(): 156 | parsed = test_pr.parse_html(frame % (url, url)) 157 | self.assertHTMLEqual(parsed, frame % (url, url)) 158 | 159 | # links within tags within a tags are fine too 160 | frame = '' 161 | for url, expected in self.inline_pairs.items(): 162 | parsed = test_pr.parse_html(frame % (url, url)) 163 | self.assertHTMLEqual(parsed, frame % (url, url)) 164 | 165 | def test_multiline(self): 166 | for url, expected in self.full_pairs.items(): 167 | expected_inline = self.inline_pairs[url] 168 | frame = 'this is inline: %s\n%s\nand yet another %s' 169 | 170 | test_str = frame % (url, url, url) 171 | 172 | parsed = test_pr.parse_text(test_str) 173 | self.assertHTMLEqual(parsed, frame % (expected_inline, expected, expected_inline)) 174 | 175 | # On multi-line text, if we specify block_handler=None, only standalone 176 | # links will be handled. 177 | for url, expected in self.full_pairs.items(): 178 | frame = 'this is inline: %s\n%s\nand yet another %s' 179 | test_str = frame % (url, url, url) 180 | 181 | parsed = test_pr.parse_text(test_str, block_handler=None) 182 | self.assertHTMLEqual(parsed, frame % (url, expected, url)) 183 | 184 | for url, expected in self.full_pairs.items(): 185 | expected_inline = self.inline_pairs[url] 186 | frame = '%s\nthis is inline: %s\n%s' 187 | 188 | test_str = frame % (url, url, url) 189 | 190 | parsed = test_pr.parse_text(test_str) 191 | self.assertHTMLEqual(parsed, frame % (expected, expected_inline, expected)) 192 | 193 | # test mixing multiline with p tags 194 | for url, expected in self.full_pairs.items(): 195 | expected_inline = self.inline_pairs[url] 196 | frame = '%s
\nthis is inline: %s
\n\n%s\n
last test\n%s\n
' 197 | 198 | test_str = frame % (url, url, url, url) 199 | 200 | parsed = test_pr.parse_html(test_str) 201 | self.assertHTMLEqual(parsed, frame % (expected, expected_inline, expected, expected_inline)) 202 | 203 | for url, expected in self.full_pairs.items(): 204 | expected_inline = self.inline_pairs[url] 205 | frame = '\nthis is inline: %s
\nlast test\n%s\n
' 206 | 207 | test_str = frame % (url, url, url) 208 | 209 | parsed = test_pr.parse_html(test_str) 210 | self.assertHTMLEqual(parsed, frame % (url, expected_inline, expected_inline)) 211 | 212 | def test_multiline_full(self): 213 | for url, expected in self.full_pairs.items(): 214 | frame = 'this is inline: %s\n%s\nand yet another %s' 215 | 216 | test_str = frame % (url, url, url) 217 | 218 | parsed = test_pr.parse_text_full(test_str) 219 | self.assertHTMLEqual(parsed, frame % (expected, expected, expected)) 220 | 221 | def test_urlize(self): 222 | blank = 'http://fapp.io/foo/' 223 | blank_e = 'http://fapp.io/foo/' 224 | for url, expected in self.full_pairs.items(): 225 | expected_inline = self.inline_pairs[url] 226 | frame = 'test %s\n%s\n%s\nand finally %s' 227 | 228 | test_str = frame % (url, blank, url, blank) 229 | 230 | parsed = test_pr.parse_text(test_str) 231 | self.assertHTMLEqual(parsed, frame % (expected_inline, blank_e, expected, blank_e)) 232 | 233 | parsed = test_pr.parse_text(test_str, urlize_all=False) 234 | self.assertHTMLEqual(parsed, frame % (expected_inline, blank, expected, blank)) 235 | 236 | parsed = test_pr.parse_text_full(test_str) 237 | self.assertHTMLEqual(parsed, frame % (expected, blank_e, expected, blank_e)) 238 | 239 | parsed = test_pr.parse_text_full(test_str, urlize_all=False) 240 | self.assertHTMLEqual(parsed, frame % (expected, blank, expected, blank)) 241 | 242 | parsed = test_pr.parse_html(test_str) 243 | self.assertHTMLEqual(parsed, frame % (expected_inline, blank_e, expected_inline, blank_e)) 244 | 245 | parsed = test_pr.parse_html(test_str, urlize_all=False) 246 | self.assertHTMLEqual(parsed, frame % (expected_inline, blank, expected_inline, blank)) 247 | 248 | frame = 'test %s
\n%s\n%s\nand finally %s
' 249 | 250 | test_str = frame % (url, blank, url, blank) 251 | 252 | parsed = test_pr.parse_html(test_str) 253 | self.assertHTMLEqual(parsed, frame % (expected_inline, blank, url, blank_e)) 254 | 255 | parsed = test_pr.parse_html(test_str, urlize_all=False) 256 | self.assertHTMLEqual(parsed, frame % (expected_inline, blank, url, blank)) 257 | 258 | def test_urlize_params(self): 259 | text = 'test http://foo.com/' 260 | urlize_params = {'target': '_blank', 'rel': 'nofollow'} 261 | exp = ('test ' 262 | 'http://foo.com/') 263 | 264 | result = test_pr.parse_text(text, urlize_params=urlize_params) 265 | self.assertEqual(result, exp) 266 | 267 | result = test_pr.parse_text_full(text, urlize_params=urlize_params) 268 | self.assertEqual(result, exp) 269 | 270 | result = test_pr.parse_html(text, urlize_params=urlize_params) 271 | self.assertEqual(result, exp) 272 | 273 | def test_extract(self): 274 | blank = 'http://fapp.io/foo/' 275 | frame = 'test %s\n%s\n%s\n%s at last' 276 | frame_html = 'test %s
%s %s
%s
' 277 | 278 | for url, expected in self.data_pairs.items(): 279 | text = frame % (url, blank, url, blank) 280 | all_urls, extracted = test_pr.extract(text) 281 | self.assertEqual(all_urls, [url, blank]) 282 | 283 | if 'url' not in expected: 284 | expected['url'] = url 285 | if 'title' not in expected: 286 | expected['title'] = expected['url'] 287 | self.assertEqual(extracted, {url: expected}) 288 | 289 | html = frame_html % (url, url, blank, blank) 290 | all_urls, extracted = test_pr.extract_html(html) 291 | self.assertEqual(all_urls, [url, blank]) 292 | 293 | if 'url' not in expected: 294 | expected['url'] = url 295 | self.assertEqual(extracted, {url: expected}) 296 | 297 | def test_outside_of_markup(self): 298 | frame = '%stesting
' 299 | for url, expected in self.full_pairs.items(): 300 | parsed = test_pr.parse_html(frame % (url)) 301 | self.assertHTMLEqual(parsed, frame % (expected)) 302 | 303 | def test_html_entities(self): 304 | frame_html = 'test %s
' 305 | 306 | for url, expected in self.data_pairs.items(): 307 | esc_url = url.replace('&', '&') 308 | html = frame_html % (esc_url, esc_url) 309 | all_urls, extracted = test_pr.extract_html(html) 310 | self.assertEqual(all_urls, [url]) 311 | 312 | if 'url' not in expected: 313 | expected['url'] = url 314 | if 'title' not in expected: 315 | expected['title'] = expected['url'] 316 | self.assertEqual(extracted, {url: expected}) 317 | 318 | rendered = test_pr.parse_html('%s
' % esc_url) 319 | self.assertHTMLEqual(rendered, '%s
' % self.full_pairs[url]) 320 | 321 | 322 | class TestHTMLEntities(BaseTestCase): 323 | def test_parse_html_entities(self): 324 | e = '<script></script>' 325 | p = 'Test %s
' % e 326 | self.assertEqual(test_pr.parse_html(p), p) 327 | 328 | a = 'http://google.com %s
' % e 329 | self.assertEqual(test_pr.parse_html(a), 330 | 'http://google.com' 331 | ' %s
' % e) 332 | 333 | h = ('http://foo.com http://bar.com ' 334 | 'http://baz.com <script> ' 335 | 'http://nug.com X <foo>
') 336 | self.assertEqual(test_pr.parse_html(h), ( 337 | 'http://foo.com ' 338 | 'http://bar.com ' 339 | 'http://baz.com <script> ' 340 | 'http://nug.com ' 341 | 'X <foo>
')) 342 | 343 | h = ('http://foo.com http://bar.com ' 344 | '<script> http://baz.com </script>\n' 345 | 'http://baze.com\n<foo>
') 346 | self.assertEqual(test_pr.parse_html(h), ( 347 | 'http://foo.com ' 348 | 'http://bar.com <script> ' 349 | 'http://baz.com </script>\n' 350 | 'http://baze.com\n' 351 | '<foo>
')) 352 | 353 | 354 | if __name__ == '__main__': 355 | unittest.main(argv=sys.argv) 356 | -------------------------------------------------------------------------------- /runtests.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import sys 4 | import unittest 5 | 6 | from micawber import tests 7 | 8 | 9 | def run_django_tests(): 10 | try: 11 | import django 12 | except ImportError: 13 | print('Skipping django tests') 14 | return 15 | else: 16 | print('Running django integration tests') 17 | 18 | providers = 'micawber.contrib.mcdjango.mcdjango_tests.tests.test_pr' 19 | extensions = ( 20 | ('oembed_no_urlize', {'urlize_all': False}), 21 | ) 22 | 23 | from django.conf import settings 24 | if not settings.configured: 25 | settings.configure( 26 | DATABASES={ 27 | 'default': { 28 | 'ENGINE': 'django.db.backends.sqlite3', 29 | }, 30 | }, 31 | SITE_ID=1, 32 | INSTALLED_APPS=[ 33 | 'django.contrib.auth', 34 | 'django.contrib.contenttypes', 35 | 'django.contrib.sessions', 36 | 'django.contrib.sites', 37 | 'micawber.contrib.mcdjango', 38 | 'micawber.contrib.mcdjango.mcdjango_tests', 39 | ], 40 | TEMPLATES=[ 41 | { 42 | 'BACKEND': 'django.template.backends.django.DjangoTemplates', 43 | 'DIRS': [], 44 | 'APP_DIRS': True, 45 | 'OPTIONS': {} 46 | }, 47 | ], 48 | MICAWBER_PROVIDERS=providers, 49 | MICAWBER_TEMPLATE_EXTENSIONS=extensions, 50 | ) 51 | else: 52 | settings.MICAWBER_PROVIDERS = providers 53 | settings.MICAWBER_TEMPLATE_EXTENSIONS = extensions 54 | 55 | try: 56 | from django import setup 57 | except ImportError: 58 | pass 59 | else: 60 | setup() 61 | 62 | from django.test.runner import DiscoverRunner 63 | parent = os.path.dirname(os.path.abspath(__file__)) 64 | sys.path.insert(0, parent) 65 | return DiscoverRunner().run_tests(['micawber/contrib/mcdjango']) 66 | 67 | 68 | def runtests(*test_args): 69 | print("Running micawber tests") 70 | errors = failures = False 71 | suite = 
unittest.TestLoader().loadTestsFromModule(tests) 72 | result = unittest.TextTestRunner(verbosity=2).run(suite) 73 | if result.failures: 74 | failures = True 75 | if result.errors: 76 | errors = True 77 | if not (errors or failures): 78 | print("All micawber tests passed") 79 | 80 | dj_failures = run_django_tests() 81 | 82 | if failures or errors or dj_failures: 83 | sys.exit(1) 84 | 85 | sys.exit(0) 86 | 87 | if __name__ == '__main__': 88 | runtests(*sys.argv[1:]) 89 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | from setuptools import setup, find_packages 3 | 4 | f = open(os.path.join(os.path.dirname(__file__), 'README.rst')) 5 | readme = f.read() 6 | f.close() 7 | 8 | setup( 9 | name='micawber', 10 | version='0.5.6', 11 | description='a small library for extracting rich content from urls', 12 | long_description=readme, 13 | author='Charles Leifer', 14 | author_email='coleifer@gmail.com', 15 | url='http://github.com/coleifer/micawber/', 16 | packages=[p for p in find_packages() if not p.startswith('examples')], 17 | package_data = { 18 | 'micawber': [ 19 | 'contrib/mcdjango/templates/micawber/*.html', 20 | ], 21 | }, 22 | classifiers=[ 23 | 'Development Status :: 4 - Beta', 24 | 'Environment :: Web Environment', 25 | 'Intended Audience :: Developers', 26 | 'License :: OSI Approved :: MIT License', 27 | 'Operating System :: OS Independent', 28 | 'Programming Language :: Python', 29 | 'Programming Language :: Python :: 2.6', 30 | 'Programming Language :: Python :: 2.7', 31 | 'Programming Language :: Python :: 3.2', 32 | 'Programming Language :: Python :: 3.3', 33 | 'Programming Language :: Python :: 3.4', 34 | 'Programming Language :: Python :: 3.5', 35 | 'Programming Language :: Python :: 3.6', 36 | 'Programming Language :: Python :: 3.7', 37 | 'Programming Language :: Python :: 3.8', 38 | 'Programming Language :: Python 
:: 3.9', 39 | 'Programming Language :: Python :: 3.10', 40 | 'Programming Language :: Python :: 3.11', 41 | 'Framework :: Django', 42 | ], 43 | test_suite='runtests.runtests', 44 | ) 45 | --------------------------------------------------------------------------------