├── .github
    └── workflows
    │   └── tests.yml
├── .gitignore
├── CHANGELOG.md
├── CONTRIBUTING.md
├── CONTRIBUTORS.md
├── LICENSE
├── Makefile
├── README.md
├── docs
    ├── banner.png
    ├── changelog.md
    ├── index.md
    ├── parser.md
    └── stylesheets
    │   └── styles.css
├── mf2py
    ├── __init__.py
    ├── backcompat-rules
    │   ├── adr.json
    │   ├── geo.json
    │   ├── hentry.json
    │   ├── hfeed.json
    │   ├── hproduct.json
    │   ├── hrecipe.json
    │   ├── hresume.json
    │   ├── hreview-aggregate.json
    │   ├── hreview.json
    │   ├── vcard.json
    │   └── vevent.json
    ├── backcompat.py
    ├── datetime_helpers.py
    ├── dom_helpers.py
    ├── implied_properties.py
    ├── metaformats.py
    ├── mf2_classes.py
    ├── mf_helpers.py
    ├── parse_property.py
    ├── parser.py
    ├── temp_fixes.py
    ├── value_class_pattern.py
    └── version.py
├── mkdocs.yml
├── pyproject.toml
└── test
    ├── examples
        ├── area.html
        ├── backcompat
        │   ├── feed_with_rel_bookmark.html
        │   ├── hentry.html
        │   ├── hentry_content_html.html
        │   ├── hentry_with_rel_bookmark.html
        │   ├── hentry_with_rel_tag.html
        │   ├── hentry_with_rel_tag_entry_title.html
        │   ├── hfeed_with_rel_tag.html
        │   ├── hproduct.html
        │   ├── hproduct_hreview_nested.html
        │   ├── hrecipe_with_rel_tag.html
        │   ├── hreview_hentry_with_rel_tag_bookmark.html
        │   ├── hreview_nested_card_event_product.html
        │   ├── hreview_with_rel_tag_bookmark.html
        │   ├── ignore_mf1_properties_in_mf2_root.html
        │   ├── ignore_mf1_root_if_mf2_present.html
        │   ├── ignore_mf2_properties_in_mf1_root.html
        │   ├── nested_mf1_in_mf2.html
        │   ├── nested_mf1_in_mf2_e_content.html
        │   ├── nested_mf2_in_mf1.html
        │   └── no_implied_properties_mf1_root.html
        ├── base.html
        ├── broken_url.html
        ├── class_names_format.html
        ├── complex_e_content.html
        ├── datetimes.html
        ├── embedded.html
        ├── empty.html
        ├── eras.html
        ├── festivus.html
        ├── filter_roots.html
        ├── filter_roots_custom.html
        ├── hcard_with_empty_url.html
        ├── hfeed_on_html_tag.html
        ├── img_with_alt.html
        ├── img_with_srcset.html
        ├── img_with_srcset_with_base.html
        ├── implied_properties
        │   ├── implied_name_alt.html
        │   ├── implied_name_empty_alt.html
        │   ├── implied_photo.html
        │   ├── implied_photo_relative_url.html
        │   ├── implied_properties.html
        │   ├── implied_properties_silo_pub.html
        │   ├── implied_relative_datetimes.html
        │   ├── implied_url.html
        │   ├── simple_person_reference_implied.html
        │   ├── stop_implied_name_e_content.html
        │   ├── stop_implied_name_nested_h.html
        │   ├── stop_implied_name_p_content.html
        │   └── stop_implied_url.html
        ├── language.html
        ├── link-rel-minimal.html
        ├── link_with_u-url.html
        ├── metaformats_html_meta.html
        ├── metaformats_ogp.html
        ├── metaformats_twitter.html
        ├── nested_complex_values.html
        ├── nested_hcards.html
        ├── nested_multiple_classnames.html
        ├── nested_values.html
        ├── ordering_dedup.html
        ├── parse_id.html
        ├── person_with_url.html
        ├── plaintext_img_whitespace.html
        ├── plaintext_p_whitespace.html
        ├── rel.html
        ├── rel_enclosure.html
        ├── relative_url_in_e.html
        ├── rsvp.html
        ├── simple_person_reference.html
        ├── simple_person_reference_same_element.html
        ├── string_stripping.html
        ├── tag_whitespace_inside_p_value.html
        ├── template_tag.html
        ├── template_tag_inside_e_value.html
        ├── test_src_equiv.html
        ├── u-test.html
        ├── u_all_cases.html
        ├── value_class_person.html
        └── value_name_whitespace.html
    ├── test_dom_addins.py
    ├── test_parser.py
    └── test_suite.py


/.github/workflows/tests.yml:
--------------------------------------------------------------------------------
  1 | name: Run Python Tests
  2 | on: push
  3 | 
  4 | jobs:
  5 |   build-macos:
  6 |     strategy:
  7 |       matrix:
  8 |         python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
  9 |     runs-on: "macos-latest"
 10 |     steps:
 11 |       - name: Install md5sha1sum
 12 |         run: brew install md5sha1sum
 13 |       - uses: actions/checkout@v3
 14 |         with:
 15 |           python-version: ${{ matrix.python-version }}
 16 |       - name: Install Python
 17 |         uses: actions/setup-python@v4
 18 |         with:
 19 |           python-version: ${{ matrix.python-version }}
 20 |       - name: Install Poetry
 21 |         uses: snok/install-poetry@v1
 22 |         with:
 23 |           version: 1.5.1
 24 |           virtualenvs-in-project: true
 25 |       - name: Install dependencies
 26 |         run: poetry install --no-interaction --no-root
 27 |       - name: Install library
 28 |         run: poetry install --no-interaction
 29 |       - name: Run tests
 30 |         run: poetry run make tests
 31 |       - uses: psf/black@stable
 32 |         with:
 33 |           options: "--check --verbose"
 34 |           src: "./"
 35 |           version: "23.3"
 36 |       - uses: isort/isort-action@v1
 37 |   build-linux:
 38 |     strategy:
 39 |       matrix:
 40 |         python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
 41 |     runs-on: "ubuntu-latest"
 42 |     steps:
 43 |       - name: Install libxml2
 44 |         run: |
 45 |           sudo apt-get update
 46 |           sudo apt-get install libxml2 libxml2-dev libxslt1-dev
 47 |       - uses: actions/checkout@v3
 48 |         with:
 49 |           python-version: ${{ matrix.python-version }}
 50 |       - name: Install Python
 51 |         uses: actions/setup-python@v4
 52 |         with:
 53 |           python-version: ${{ matrix.python-version }}
 54 |       - name: Install Poetry
 55 |         uses: snok/install-poetry@v1
 56 |         with:
 57 |           version: 1.5.1
 58 |           virtualenvs-in-project: true
 59 |       - name: Install dependencies
 60 |         run: poetry install --no-interaction --no-root
 61 |       - name: Install library
 62 |         run: poetry install --no-interaction
 63 |       - name: Run tests
 64 |         run: poetry run make tests
 65 |       - uses: psf/black@stable
 66 |         with:
 67 |           options: "--check --verbose"
 68 |           src: "./"
 69 |           version: "23.3"
 70 |       - uses: isort/isort-action@v1
 71 |   build-windows:
 72 |     strategy:
 73 |       matrix:
 74 |         python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
 75 |     runs-on: "windows-latest"
 76 |     defaults:
 77 |       run:
 78 |         shell: bash
 79 |     steps:
 80 |       - uses: actions/checkout@v3
 81 |         with:
 82 |           python-version: ${{ matrix.python-version }}
 83 |       - name: Install Python
 84 |         uses: actions/setup-python@v4
 85 |         with:
 86 |           python-version: ${{ matrix.python-version }}
 87 |       - name: Install Poetry
 88 |         uses: snok/install-poetry@v1
 89 |         with:
 90 |           version: 1.5.1
 91 |           virtualenvs-in-project: true
 92 |       - name: Install dependencies
 93 |         run: poetry install --no-interaction --no-root
 94 |       - name: Install library
 95 |         run: poetry install --no-interaction
 96 |       - name: Run tests
 97 |         run: poetry run make tests
 98 |       - uses: psf/black@stable
 99 |         with:
100 |           options: "--check --verbose"
101 |           src: "./"
102 |           version: "23.3"
103 |       # - uses: isort/isort-action@v1
104 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | .DS_Store
 2 | *.swp
 3 | *.pyc
 4 | .idea/
 5 | .eggs/
 6 | build/
 7 | dist/
 8 | local/
 9 | mf2py.egg-info/
10 | nbproject/
11 | venv/
12 | *~
13 | poetry.lock
14 | site/
15 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
  1 | # Change Log
  2 | All notable changes to this project will be documented in this file.
  3 | 
  4 | ## 2.0.1 - 2023-12-07
  5 | The mf2py library is excited to transition into 2.0. This version increase incorporates months of work from contributors, informed by active discussions among implementers and users.
  6 | 
  7 | This release officially deprecates support for versions of Python lower than 3.8.
  8 | 
  9 | Below are the changes we have made in this release.
 10 | 
 11 | ### New Features
 12 | - Enable `img_with_alt` by default (#184)
 13 | - Add timezone offset normalisation (#206)
 14 | - Add option for exposing DOM for embedded properties (#208)
 15 | - Add srcset support (#209)
 16 | - Add language support (#210)
 17 | - Add option for filtering root class names (#211)
 18 | - Add option for metaformats support (#213)
 19 | 
 20 | ### Changes
 21 | - Remove `img_with_alt` option entirely (#200)
 22 | - Resolve implied photo relative paths (#205)
 23 | - Make relative URLs in embedded properties absolute (#201)
 24 | - Fix whitespace in plaintext conversion (#207)
 25 | - Replace `dict_class` with standard `dict` (#196)
 26 | 
 27 | ### Tests, Library and Documentation Maintenance
 28 | - Update tests to include alt texts by default (#190)
 29 | - Add Windows and macOS tests (#198)
 30 | - Use poetry for dependency management (#189)
 31 | - Deprecate Python 2 support (#179)
 32 | - Lint code with `black` and `isort`
 33 | - Add linting CI actions (#193)
 34 | - Move from `nosetests` to `pytest` (#186)
 35 | - Add 3.11, 3.12 and drop pypy from test matrix; upgrade poetry action (#204)
 36 | - Prepare tests to test options (#214)
 37 | - Bring README doctests up-to-date (#215)
 38 | 
 39 | ## 1.1.3 - 2022-06-28
 40 | - reduce instances where photo is implied (#135)
 41 | - always do relative URL resolution (#138)
 42 | - VCP now handles tz offsets without leading zeros (#142)
 43 | - implement id parsing (#143)
 44 | - fix outdated syntax causing SyntaxWarning (#157)
 45 | 
 46 | ## 1.1.2 - 2018-08-08
 47 | - add parsing for iframe.u-*[src] (#116)
 48 | - bug fix: reduced implied urls (#117)
 49 | - bug fix: don't collapse whitespace between tags
 50 | - specify explicit versions for dependencies
 51 | - revert BeautifulSoup copying added in 1.1.1 due to bugs (eg #108)
 52 | - misc performance improvements
 53 | 
 54 | ## 1.1.1 - 2018-06-15
 55 | - streamline backcompat to use JSON only.
 56 | - fix multiple mf1 root rel-tag parsing
 57 | - correct url and photo for hreview.
 58 | - add rules for nested hreview. update backcompat to use multiple matches in old properties.
 59 | - fix `rel-tag` to `p-category` conversion so that other classes are not lost.
 60 | - use original authored html for `e-*` parsing in backcompat
 61 | - make classes and rels into unordered (alphabetically ordered) deduped arrays.
 62 | - only use class names for mf2 which follow the naming rules
 63 | - fix `parse` method to use default html parser.
 64 | - always use the first value for attributes for rels.
 65 | - correct AM/PM conversion in datetime value class pattern.
 66 | - add ordinal date parsing to datetimes value class pattern. ordinal date is normalised to YYYY-MM-DD
 67 | - remove hack for html tag classes since that is fixed in new BS
 68 | - better whitespace algorithm for `name` and `html.value` parsing
 69 | - experimental flag for including `alt` in `u-photo` parsing
 70 | - make a copy of the BeautifulSoup given by user to work on for parsing to prevent changes to original doc
 71 | - bump version to 1.1.1
 72 | 
 73 | ## 1.1.0 - 2018-03-16
 74 | - bump version to 1.1.0 since it is a "major" change
 75 | - added tests for new implied name rules
 76 | - modified earlier tests to accommodate new rules
 77 | - use space separator instead of "T"
 78 | - Don't add "00" seconds unless authored
 79 | - use TZ authored in separate `value` element
 80 | - only use first found `value` of a particular type `date`, `time`, or `timezone`.
 81 | - move backcompat rules into JSON files
 82 | - reorganise value class pattern parsing into new files
 83 | - add datetime_helpers to organise datetime parsing rules
 84 | - reorganise tests
 85 | - remove Heroku frontend, point to mf2py-web and python.microformats.io instead in README.
 86 | - remove Flask and gunicorn requirements
 87 | - add debug info with description, version, url and the html parser used
 88 | 
 89 | ## 1.0.6 - 2018-03-04
 90 | - strip leading/trailing white space for `e-*[html]`. update the corresponding tests
 91 | - blank values explicitly authored are allowed as property values
 92 | - include `alt` or `src` from `<img>` in parsing for `p-*` and `e-*[value]`
 93 | - parse `title` from `<link>` for `p-*` resolves #84
 94 | - and `poster` from `<video>` for `u-*` resolves #76
 95 | - use `html5lib` as default parser
 96 | - use the final redirect URL resolves #62
 97 | - update requirements to use BS4 v4.6.0 and html5lib v1.0.1
 98 | - drop support for Python 2.6 as html5lib dropped support
 99 | 
100 | ## 1.0.5 - 2016-05-09
101 | - Implied property checks now ignore alt="", treating it the same as
102 |   if no alt value is defined.
103 | - Support for using a custom dict implementation by setting
104 |   mf2py.Parser.dict_class. collections.OrderedDict yields much nicer
105 |   output for hosted parsers.
106 | 
107 | ## 1.0.4 - 2016-03-21
108 | - Performance improvement changing simple calls to soup.find_all to
109 |   a manual iteration over .contents.
110 | 
111 | ## 1.0.3 - 2016-02-05
112 | - Performance improvement by limiting number of calls to soup.find_all
113 |   in backcompat module. Should not be any functional changes.
114 | 
115 | ## 1.0.2 - 2016-01-26
116 | - Backward compatibility parsing for rel=tag properties. These are now converted
117 |   to p-category based on the last path segment of the tag URI as spec'd in
118 |   http://microformats.org/wiki/h-entry#Parser_Compatibility
119 | - Optional property html_parser to specify the html parser that BeautifulSoup
120 |   should use (e.g., "lxml" or "html5lib")
121 | 
122 | ## 1.0.1 - 2015-12-11
123 | - `u-*` properties are now parsed from `<link>` elements per the updated spec
124 |   http://microformats.org/wiki/microformats2-parsing-issues#link_elements_and_u-_parsing
125 | 
126 | ## 1.0.0 - 2015-10-05
127 | - Version number bumped to 1.0.0 following community discussion.
128 | 
129 | ## 0.2.8 - 2015-09-21
130 | - Stricter checks that Parser.__init__ params are actually None before
131 |   ignoring them.
132 | 
133 | ## 0.2.7 - 2015-08-03
134 | - Now produces unicode strings for every key and value, no more byte
135 |   strings anywhere.
136 | - Do not add 'T' between date and time when normalizing dates
137 | - Unit tests for running the microformats test suite
138 | 
139 | ## 0.2.6 - 2015-05-06
140 | - New top-level "rel-urls" entry, contains rich data parsed from rel
141 |   links, organized by URL.
142 | 
143 | ## 0.2.5 - 2015-03-01
144 | - convenience method `mf2py.parse` that takes the same arguments as Parser
145 |   and returns a dict.
146 | - nested h-* classes now parse their "value" based on the property
147 |   they represent (p-*, u-*, dt-*), so for example "p-in-reply-to
148 |   h-cite" would have a name as its value and "u-in-reply-to h-cite"
149 |   will have a URL.
150 | 
151 | ## 0.2.4 - 2015-02-13
152 | - Add rel=bookmark to backward compat parsing rules based (translated
153 |   to u-url in mf2)
154 | - Parser constructor now takes explicit named arguments instead of
155 |   **kwargs, for saner behavior when called with unnamed arguments.
156 | - Bugfix: Empty href="" attributes are now properly interpreted as
157 |   the current document's URL.
158 | 
159 | ## 0.2.3 - 2015-02-07
160 | - Minor Py3 compatibility fix
161 | - Correct typo `test_requires` -> `tests_require` in setup.py
162 | 
163 | ## 0.2.2 - 2015-02-05
164 | - Started keeping a changelog!
165 | - Use a better method for extracting HTML for an e-* property
166 | - Correct BeautifulSoup4 dependency in setup.py to fix error with
167 |   installation from PyPI.
168 | - Buffed up docstrings for public methods.
169 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Contributing to mf2py 🛠️
 2 | 
 3 | Thank you for your interest in contributing to mf2py! In this document, we outline all you need to know to submit a contribution to the mf2py project.
 4 | 
 5 | ## Contribution Guidelines
 6 | 
 7 | We welcome contributions to:
 8 | 
 9 | 1. Add new parsing features to the library (see *Adding Parsing Features* below for more information).
10 | 2. Improve documentation.
11 | 3. Report bugs and issues.
12 | 4. Submit a request for a new feature.
13 | 5. Add more test cases.
14 | 
15 | We encourage you to submit an Issue to discuss any new feature requests before submitting a PR.
16 | 
17 | ### Adding Parsing Features
18 | 
19 | As the official `mf2py` parser, we will only implement new parsing features after they have been ratified into the Microformats parsing standard, or have achieved sufficient support among contributors to parsing discussions and requires implementations. The [microformats2 parsing](https://github.com/microformats/microformats2-parsing/issues) repository is the offical home to all issues pertaining to microformats2 parsing.
20 | 
21 | ## How to Contribute Changes
22 | 
23 | First, fork this repository to your own GitHub account. Create a new branch that describes your changes (i.e. `line-counter-docs`). Push your changes to the branch on your fork and then submit a pull request to this repository.
24 | 
25 | When creating new functions, please ensure you have the following:
26 | 
27 | 1. Docstrings for the function and all parameters.
28 | 2. Unit tests for the function.
29 | 3. Examples in the documentation for any public functions.
30 | 4. Created an entry in our docs to autogenerate the documentation for the function, if the function is public.
31 | 
32 | All pull requests will be reviewed by the maintainers of the project. We will provide feedback and ask for changes if necessary.
33 | 
34 | PRs must pass all tests and linting requirements before they can be merged.
35 | 
36 | ## Code Quality
37 | 
38 | Before you submit a PR to `mf2py`, run the following command in the base directory of the project:
39 | 
40 | ```bash
41 | make lint
42 | ```
43 | 
44 | This will format your code using the linters configured with the project.
45 | 
46 | ## Tests
47 | 
48 | Before you submit a PR, please run the `mf2py` test suite on your code. You can do so using the following command:
49 | 
50 | ```bash
51 | make test
52 | ```
53 | 
54 | ## Join the Microformats Community
55 | 
56 | Have a question about microformats or mf2py? Join the `#microformats` disussion on Slack, Discord, or IRC. Guidance on how to join the community is available on the [IndieWeb wiki](https://indieweb.org/discuss).
57 | 


--------------------------------------------------------------------------------
/CONTRIBUTORS.md:
--------------------------------------------------------------------------------
 1 | # Contributors
 2 | 
 3 | - Tom Morris <tom@tommorris.org> https://tommorris.org
 4 | - Barnaby Walters https://waterpigs.co.uk
 5 | - Kartik Prabhu <me@kartikprabhu.com> https://kartikprabhu.com
 6 | - Kyle Mahan https://github.com/kylewm
 7 | - Kevin Marks <kevinmarks@gmail.com> https://www.kevinmarks.com
 8 | - James <jamesg@jamesg.blog> https://jamesg.blog
 9 | - Angelo Gladding <angelo@ragt.ag> https://ragt.ag
10 | - Paweł Miech https://pawelmhm.github.io
11 | - Sven Knebel https://www.svenknebel.de
12 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyight (c) 2013, 2014 Tom Morris and contributors
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining a copy
 4 | of this software and associated documentation files (the "Software"), to deal
 5 | in the Software without restriction, including without limitation the rights
 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 7 | copies of the Software, and to permit persons to whom the Software is
 8 | furnished to do so, subject to the following conditions:
 9 | 
10 | The above copyright notice and this permission notice shall be included in
11 | all copies or substantial portions of the Software.
12 | 
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | THE SOFTWARE.
20 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | install:
 2 | 	poetry install
 3 | tests:
 4 | 	poetry run pytest -s -vv --doctest-modules --doctest-glob README*
 5 | lint:
 6 | 	poetry run black .
 7 | 	poetry run isort .
 8 | docs_dev:
 9 | 	poetry run mkdocs serve
10 | docs_deploy:
11 | 	poetry run mkdocs gh-deploy
12 | publish:
13 | 	poetry publish --build
14 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | ![mf2py banner](https://microformats.github.io/mf2py/banner.png)
  2 | 
  3 | [![version](https://badge.fury.io/py/mf2py.svg?)](https://badge.fury.io/py/mf2py)
  4 | [![downloads](https://img.shields.io/pypi/dm/mf2py)](https://pypistats.org/packages/mf2py)
  5 | [![license](https://img.shields.io/pypi/l/mf2py?)](https://github.com/microformats/mf2py/blob/main/LICENSE)
  6 | [![python-version](https://img.shields.io/pypi/pyversions/mf2py)](https://badge.fury.io/py/mf2py)
  7 | 
  8 | ## Welcome 👋
  9 | 
 10 | `mf2py` is a Python [microformats](https://microformats.org/wiki/microformats) parser with full support for `microformats2`, backwards-compatible support for `microformats1` and experimental support for `metaformats`.
 11 | 
 12 | ## Installation 💻
 13 | 
 14 | To install `mf2py` run the following command:
 15 | 
 16 | ```bash
 17 | $ pip install mf2py
 18 | 
 19 | ```
 20 | 
 21 | ## Quickstart 🚀
 22 | 
 23 | Import the library:
 24 | 
 25 | ```pycon
 26 | >>> import mf2py
 27 | 
 28 | ```
 29 | 
 30 | ### Parse an HTML Document from a file or string
 31 | 
 32 | ```pycon
 33 | >>> with open("test/examples/eras.html") as fp:
 34 | ...     mf2json = mf2py.parse(doc=fp)
 35 | >>> mf2json
 36 | {'items': [{'type': ['h-entry'],
 37 |             'properties': {'name': ['Excited for the Taylor Swift Eras Tour'],
 38 |                            'author': [{'type': ['h-card'],
 39 |                                        'properties': {'name': ['James'],
 40 |                                                       'url': ['https://example.com/']},
 41 |                                        'value': 'James',
 42 |                                        'lang': 'en-us'}],
 43 |                            'published': ['2023-11-30T19:08:09'],
 44 |                            'featured': [{'value': 'https://example.com/eras.jpg',
 45 |                                          'alt': 'Eras tour poster'}],
 46 |                            'content': [{'value': "I can't decide which era is my favorite.",
 47 |                                         'lang': 'en-us',
 48 |                                         'html': "<p>I can't decide which era is my favorite.</p>"}],
 49 |                            'category': ['music', 'Taylor Swift']},
 50 |             'lang': 'en-us'}],
 51 |  'rels': {'webmention': ['https://example.com/mentions']},
 52 |  'rel-urls': {'https://example.com/mentions': {'text': '',
 53 |                                                'rels': ['webmention']}},
 54 |  'debug': {'description': 'mf2py - microformats2 parser for python',
 55 |            'source': 'https://github.com/microformats/mf2py',
 56 |            'version': '2.0.1',
 57 |            'markup parser': 'html5lib'}}
 58 | 
 59 | ```
 60 | 
 61 | ```pycon
 62 | >>> mf2json = mf2py.parse(doc="<a class=h-card href=https://example.com>James</a>")
 63 | >>> mf2json["items"]
 64 | [{'type': ['h-card'],
 65 |   'properties': {'name': ['James'],
 66 |                  'url': ['https://example.com']}}]
 67 | 
 68 | ```
 69 | 
 70 | ### Parse an HTML Document from a URL
 71 | 
 72 | ```pycon
 73 | >>> mf2json = mf2py.parse(url="https://events.indieweb.org")
 74 | >>> mf2json["items"][0]["type"]
 75 | ['h-feed']
 76 | >>> mf2json["items"][0]["children"][0]["type"]
 77 | ['h-event']
 78 | 
 79 | ```
 80 | 
 81 | ## Experimental Options
 82 | 
 83 | The following options can be invoked via keyword arguments to `parse()` and `Parser()`.
 84 | 
 85 | ### `expose_dom`
 86 | 
 87 | Use `expose_dom=True` to expose the DOM of embedded properties.
 88 | 
 89 | ### `metaformats`
 90 | 
 91 | Use `metaformats=True` to include any [metaformats](https://microformats.org/wiki/metaformats)
 92 | found.
 93 | 
 94 | ### `filter_roots`
 95 | 
 96 | Use `filter_roots=True` to filter known conflicting user names (e.g. Tailwind).
 97 | Otherwise provide a custom list to filter instead.
 98 | 
 99 | ## Advanced Usage
100 | 
101 | `parse` is a convenience function for `Parser`. More sophisticated behaviors are
102 | available by invoking the parser object directly.
103 | 
104 | ```pycon
105 | >>> with open("test/examples/festivus.html") as fp:
106 | ...     mf2parser = mf2py.Parser(doc=fp)
107 | 
108 | ```
109 | 
110 | #### Filter by Microformat Type
111 | 
112 | ```pycon
113 | >>> mf2json = mf2parser.to_dict()
114 | >>> len(mf2json["items"])
115 | 7
116 | >>> len(mf2parser.to_dict(filter_by_type="h-card"))
117 | 3
118 | >>> len(mf2parser.to_dict(filter_by_type="h-entry"))
119 | 4
120 | 
121 | ```
122 | 
123 | #### JSON Output
124 | 
125 | ```pycon
126 | >>> json = mf2parser.to_json()
127 | >>> json_cards = mf2parser.to_json(filter_by_type="h-card")
128 | 
129 | ```
130 | 
131 | ## Breaking Changes in `mf2py` 2.0
132 | 
133 | - Image `alt` support is now on by default.
134 | 
135 | ## Notes 📝
136 | 
137 | - If you pass a BeautifulSoup document it may be modified.
138 | - A hosted version of `mf2py` is available at [python.microformats.io](https://python.microformats.io).
139 | 
140 | ## Contributing 🛠️
141 | 
142 | We welcome contributions and bug reports via GitHub.
143 | 
144 | This project follows the [IndieWeb code of conduct](https://indieweb.org/code-of-conduct). Please be respectful of other contributors and forge a spirit of positive co-operation without discrimination or disrespect.
145 | 
146 | ## License 🧑‍⚖️
147 | 
148 | `mf2py` is licensed under an MIT License.
149 | 


--------------------------------------------------------------------------------
/docs/banner.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microformats/mf2py/555f53e42a47c4b471dfbfb52fa964a9dcd674f3/docs/banner.png


--------------------------------------------------------------------------------
/docs/changelog.md:
--------------------------------------------------------------------------------
1 | ../CHANGELOG.md


--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 | ../README.md


--------------------------------------------------------------------------------
/docs/parser.md:
--------------------------------------------------------------------------------
1 | # Parser Object
2 | 
3 | :::mf2py.Parser
4 | 


--------------------------------------------------------------------------------
/docs/stylesheets/styles.css:
--------------------------------------------------------------------------------
1 | :root {
2 |     --md-primary-fg-color:        #6CA300;
3 | }


--------------------------------------------------------------------------------
/mf2py/__init__.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Microformats2 is a general way to mark up any HTML document with
 3 | classes and propeties. This library parses structured data from
 4 | a microformatted HTML document and returns a well-formed JSON
 5 | dictionary.
 6 | """
 7 | 
 8 | from .mf_helpers import get_url
 9 | from .parser import Parser, parse
10 | from .version import __version__
11 | 
12 | __all__ = ["Parser", "parse", "get_url", "__version__"]
13 | 


--------------------------------------------------------------------------------
/mf2py/backcompat-rules/adr.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": [
 3 |         "h-adr"
 4 |     ], 
 5 |     "properties": {
 6 |         "locality": [
 7 |             "p-locality"
 8 |         ], 
 9 |         "region": [
10 |             "p-region"
11 |         ], 
12 |         "extended-address": [
13 |             "p-extended-address"
14 |         ], 
15 |         "post-office-box": [
16 |             "p-post-office-box"
17 |         ], 
18 |         "street-address": [
19 |             "p-street-address"
20 |         ], 
21 |         "postal-code": [
22 |             "p-postal-code"
23 |         ], 
24 |         "country-name": [
25 |             "p-country-name"
26 |         ]
27 |     }
28 | }


--------------------------------------------------------------------------------
/mf2py/backcompat-rules/geo.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": [
 3 |         "h-geo"
 4 |     ], 
 5 |     "properties": {
 6 |         "latitude": [
 7 |             "p-latitude"
 8 |         ], 
 9 |         "longitude": [
10 |             "p-longitude"
11 |         ]
12 |     }
13 | }


--------------------------------------------------------------------------------
/mf2py/backcompat-rules/hentry.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": [
 3 |         "h-entry"
 4 |     ], 
 5 |     "properties": {
 6 |         "category": [
 7 |             "p-category"
 8 |         ], 
 9 |         "entry-title": [
10 |             "p-name"
11 |         ], 
12 |         "published": [
13 |             "dt-published"
14 |         ],
15 |         "entry-content": [
16 |             "e-content"
17 |         ], 
18 |         "entry-summary": [
19 |             "p-summary"
20 |         ], 
21 |         "author": [
22 |             "p-author", 
23 |             "h-card"
24 |         ], 
25 |         "geo": [
26 |             "p-geo", 
27 |             "h-geo"
28 |         ], 
29 |         "updated": [
30 |             "dt-updated"
31 |         ]
32 |     },
33 |     "rels": {
34 |         "bookmark": [
35 |             "u-url"
36 |         ],
37 |         "tag": [
38 |             "p-category"
39 |         ]
40 |     }
41 | }
42 | 


--------------------------------------------------------------------------------
/mf2py/backcompat-rules/hfeed.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": [
 3 |         "h-feed"
 4 |     ], 
 5 |     "properties": {
 6 |         "category": [
 7 |             "p-category"
 8 |         ], 
 9 |         "site-description": [
10 |             "p-summary"
11 |         ], 
12 |         "description": [
13 |             "p-summary"
14 |         ], 
15 |         "site-title": [
16 |             "p-name"
17 |         ], 
18 |         "title": [
19 |             "p-name"
20 |         ]
21 |     },
22 |     "rels": {
23 |         "tag": [
24 |             "p-category"
25 |         ]
26 |     }
27 | }
28 | 


--------------------------------------------------------------------------------
/mf2py/backcompat-rules/hproduct.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": [
 3 |         "h-product"
 4 |     ], 
 5 |     "properties": {
 6 |         "category": [
 7 |             "p-category"
 8 |         ], 
 9 |         "price": [
10 |             "p-price"
11 |         ], 
12 |         "description": [
13 |             "p-description"
14 |         ], 
15 |         "url": [
16 |             "u-url"
17 |         ], 
18 |         "photo": [
19 |             "u-photo"
20 |         ], 
21 |         "brand": [
22 |             "p-brand"
23 |         ], 
24 |         "identifier": [
25 |             "u-identifier"
26 |         ], 
27 |         "review": [
28 |             "p-review", 
29 |             "h-review" 
30 |         ], 
31 |         "fn": [
32 |             "p-name"
33 |         ]
34 |     }
35 | }
36 | 


--------------------------------------------------------------------------------
/mf2py/backcompat-rules/hrecipe.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": [
 3 |         "h-recipe"
 4 |     ], 
 5 |     "properties": {
 6 |         "nutrition": [
 7 |             "p-nutrition"
 8 |         ], 
 9 |         "yield": [
10 |             "p-yield"
11 |         ], 
12 |         "author": [
13 |             "p-author", 
14 |             "h-card"
15 |         ], 
16 |         "duration": [
17 |             "dt-duration"
18 |         ], 
19 |         "photo": [
20 |             "u-photo"
21 |         ], 
22 |         "instructions": [
23 |             "e-instructions"
24 |         ], 
25 |         "summary": [
26 |             "p-summary"
27 |         ], 
28 |         "fn": [
29 |             "p-name"
30 |         ], 
31 |         "ingredient": [
32 |             "p-ingredient"
33 |         ],
34 |         "category": [
35 |             "p-category"
36 |         ]
37 |     },
38 |     "rels": {
39 |         "tag": [
40 |             "p-category"
41 |         ]
42 |     }
43 | }
44 | 


--------------------------------------------------------------------------------
/mf2py/backcompat-rules/hresume.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": [
 3 |         "h-resume"
 4 |     ], 
 5 |     "properties": {
 6 |         "experience": [
 7 |             "h-event", 
 8 |             "p-experience"
 9 |         ], 
10 |         "summary": [
11 |             "p-summary"
12 |         ], 
13 |         "affiliation": [
14 |             "p-affiliation", 
15 |             "h-card"
16 |         ], 
17 |         "contact": [
18 |             "h-card", 
19 |             "p-contact"
20 |         ], 
21 |         "skill": [
22 |             "p-skill"
23 |         ], 
24 |         "education": [
25 |             "h-event", 
26 |             "p-education"
27 |         ]
28 |     }
29 | }


--------------------------------------------------------------------------------
/mf2py/backcompat-rules/hreview-aggregate.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": [
 3 |         "h-review-aggregate"
 4 |     ], 
 5 |     "properties": {
 6 |         "rating": [
 7 |             "p-rating"
 8 |         ], 
 9 |         "description": [
10 |             "p-description"
11 |         ], 
12 |         "photo": [
13 |             "u-photo"
14 |         ], 
15 |         "worst": [
16 |             "p-worst"
17 |         ], 
18 |         "reviewer": [
19 |             "p-reviewer", 
20 |             "p-author", 
21 |             "h-card"
22 |         ], 
23 |         "best": [
24 |             "p-best"
25 |         ], 
26 |         "count": [
27 |             "p-count"
28 |         ], 
29 |         "votes": [
30 |             "p-votes"
31 |         ], 
32 |         "dtreviewed": [
33 |             "dt-reviewed"
34 |         ], 
35 |         "url": [
36 |             "u-url"
37 |         ], 
38 |         "summary": [
39 |             "p-name"
40 |         ], 
41 |         "fn": [
42 |             "p-item", 
43 |             "h-item", 
44 |             "p-name"
45 |         ]
46 |     }
47 | }


--------------------------------------------------------------------------------
/mf2py/backcompat-rules/hreview.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": [
 3 |         "h-review"
 4 |     ], 
 5 |     "properties": {
 6 |         "rating": [
 7 |             "p-rating"
 8 |         ], 
 9 |         "worst": [
10 |             "p-worst"
11 |         ], 
12 |         "dtreviewed": [
13 |             "dt-reviewed"
14 |         ], 
15 |         "reviewer": [
16 |             "p-author", 
17 |             "h-card"
18 |         ], 
19 |         "url": [
20 |             "p-item", 
21 |             "h-item", 
22 |             "u-url"
23 |         ], 
24 |         "photo": [
25 |             "p-item", 
26 |             "h-item", 
27 |             "u-photo"
28 |         ], 
29 |         "best": [
30 |             "p-best"
31 |         ], 
32 |         "description": [
33 |             "p-description"
34 |         ], 
35 |         "fn": [
36 |             "p-item", 
37 |             "h-item", 
38 |             "p-name"
39 |         ], 
40 |         "summary": [
41 |             "p-name"
42 |         ],
43 |         "item vcard": [
44 |             "p-item",
45 |             "vcard"
46 |         ],
47 |         "item vevent": [
48 |             "p-item",
49 |             "vevent"
50 |         ],
51 |         "item hproduct": [
52 |             "p-item",
53 |             "hproduct"
54 |         ]
55 |     },
56 |     "rels": {
57 |         "self bookmark": [
58 |             "u-url"
59 |         ],
60 |         "tag": [
61 |             "p-category"
62 |         ]
63 |     }
64 | }
65 | 


--------------------------------------------------------------------------------
/mf2py/backcompat-rules/vcard.json:
--------------------------------------------------------------------------------
  1 | {
  2 |     "type": [
  3 |         "h-card"
  4 |     ], 
  5 |     "properties": {
  6 |         "tel": [
  7 |             "p-tel"
  8 |         ], 
  9 |         "honorific-suffix": [
 10 |             "p-honorific-suffix"
 11 |         ], 
 12 |         "family-name": [
 13 |             "p-family-name"
 14 |         ], 
 15 |         "photo": [
 16 |             "u-photo"
 17 |         ], 
 18 |         "logo": [
 19 |             "u-logo"
 20 |         ], 
 21 |         "postal-code": [
 22 |             "p-postal-code"
 23 |         ], 
 24 |         "country-name": [
 25 |             "p-country-name"
 26 |         ], 
 27 |         "uid": [
 28 |             "u-uid"
 29 |         ], 
 30 |         "category": [
 31 |             "p-category"
 32 |         ], 
 33 |         "adr": [
 34 |             "p-adr", 
 35 |             "h-adr"
 36 |         ], 
 37 |         "locality": [
 38 |             "p-locality"
 39 |         ], 
 40 |         "nickname": [
 41 |             "p-nickname"
 42 |         ], 
 43 |         "label": [
 44 |             "p-label"
 45 |         ], 
 46 |         "note": [
 47 |             "p-note"
 48 |         ], 
 49 |         "street-address": [
 50 |             "p-street-address"
 51 |         ], 
 52 |         "latitude": [
 53 |             "p-latitude"
 54 |         ], 
 55 |         "email": [
 56 |             "u-email"
 57 |         ], 
 58 |         "bday": [
 59 |             "dt-bday"
 60 |         ], 
 61 |         "extended-address": [
 62 |             "p-extended-address"
 63 |         ], 
 64 |         "additional-name": [
 65 |             "p-additional-name"
 66 |         ], 
 67 |         "organization-unit": [
 68 |             "p-organization-unit"
 69 |         ], 
 70 |         "given-name": [
 71 |             "p-given-name"
 72 |         ], 
 73 |         "key": [
 74 |             "u-key"
 75 |         ], 
 76 |         "org": [
 77 |             "p-org"
 78 |         ], 
 79 |         "honorific-prefix": [
 80 |             "p-honorific-prefix"
 81 |         ], 
 82 |         "geo": [
 83 |             "p-geo", 
 84 |             "h-geo"
 85 |         ], 
 86 |         "fn": [
 87 |             "p-name"
 88 |         ], 
 89 |         "url": [
 90 |             "u-url"
 91 |         ], 
 92 |         "region": [
 93 |             "p-region"
 94 |         ], 
 95 |         "longitude": [
 96 |             "p-longitude"
 97 |         ], 
 98 |         "organization-name": [
 99 |             "p-organization-name"
100 |         ],
101 |         "title": [
102 |             "p-job-title"
103 |         ],
104 |         "role": [
105 |             "p-role"
106 |         ]
107 |     }
108 | }
109 | 


--------------------------------------------------------------------------------
/mf2py/backcompat-rules/vevent.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": [
 3 |         "h-event"
 4 |     ], 
 5 |     "properties": {
 6 |         "attendee": [
 7 |             "p-attendee"
 8 |         ], 
 9 |         "description": [
10 |             "p-description"
11 |         ], 
12 |         "duration": [
13 |             "dt-duration"
14 |         ], 
15 |         "dtend": [
16 |             "dt-end"
17 |         ], 
18 |         "dtstart": [
19 |             "dt-start"
20 |         ], 
21 |         "geo": [
22 |             "p-location h-geo"
23 |         ], 
24 |         "organizer": [
25 |             "p-organizer"
26 |         ], 
27 |         "category": [
28 |             "p-category"
29 |         ], 
30 |         "url": [
31 |             "u-url"
32 |         ], 
33 |         "summary": [
34 |             "p-name"
35 |         ], 
36 |         "contact": [
37 |             "p-contact"
38 |         ], 
39 |         "location": [
40 |             "p-location"
41 |         ]
42 |     }
43 | }


--------------------------------------------------------------------------------
/mf2py/backcompat.py:
--------------------------------------------------------------------------------
  1 | """Looks for classic microformats class names and augments them with
  2 | microformats2 names. Ported and adapted from php-mf2.
  3 | 
  4 | NOTE: functions in this module modify DOM elements. For this copies of the source tree are created, but BS4's copy
  5 | function doesn't create copy of all sub-elements: most notably, the values of the `attrs` dictionary are not copied,
  6 | and thus reference the same list objects as in the original tree. Thus, special care has to be taken when modifying the
  7 | tree so changes do not accidentally propagate.
  8 | 
  9 | a) adding new/removing children is safe
 10 | 
 11 | b) attributes (e.g. `class`) should only by changed by assigning a *copy* of the original value, not by modifying it in
 12 | place.
 13 | 
 14 | DO NOT:
 15 | child_classes = child.get('class', [])
 16 | child_classes.append('p-kittens')
 17 | child['class'] = child_classes
 18 | 
 19 | DO:
 20 | child_classes = child.get('class', [])[:] ###<------- COPY CREATED HERE
 21 | child_classes.append('p-kittens')
 22 | child['class'] = child_classes
 23 | 
 24 | """
 25 | 
 26 | import codecs
 27 | import copy
 28 | import json
 29 | import os
 30 | from urllib.parse import unquote
 31 | 
 32 | import bs4
 33 | 
 34 | from . import mf2_classes
 35 | from .dom_helpers import get_children
 36 | from .mf_helpers import unordered_list
 37 | 
 38 | # Classic map
 39 | _CLASSIC_MAP = {}
 40 | 
 41 | # populate backcompat rules from JSON files
 42 | 
 43 | _RULES_LOC = os.path.join(
 44 |     os.path.dirname(os.path.abspath(__file__)), "backcompat-rules"
 45 | )
 46 | 
 47 | for filename in os.listdir(_RULES_LOC):
 48 |     file_path = os.path.join(_RULES_LOC, filename)
 49 |     root = os.path.splitext(filename)[0]
 50 |     with codecs.open(file_path, "r", "utf-8") as f:
 51 |         rules = json.load(f)
 52 | 
 53 |     _CLASSIC_MAP[root] = rules
 54 | 
 55 | 
 56 | def _make_classes_rule(old_classes, new_classes):
 57 |     """Builds a rule for augmenting an mf1 class with its mf2
 58 |     equivalent(s).
 59 |     """
 60 | 
 61 |     def f(child, **kwargs):
 62 |         child_original = child.original or copy.copy(child)
 63 |         child_classes = child.get("class", [])[:]
 64 |         if all(cl in child_classes for cl in old_classes):
 65 |             child_classes.extend([cl for cl in new_classes if cl not in child_classes])
 66 |             child["class"] = child_classes
 67 | 
 68 |             # if any new class is e-* attach original to parse originally authored HTML
 69 |             if mf2_classes.has_embedded_class(child_classes) and child.original is None:
 70 |                 child.original = child_original
 71 | 
 72 |     return f
 73 | 
 74 | 
 75 | def _rel_tag_to_category_rule(child, html_parser, **kwargs):
 76 |     """rel=tag converts to p-category using a special transformation (the
 77 |     category becomes the tag href's last path segment). This rule adds a new data tag so that
 78 |     <a rel="tag" href="http://example.com/tags/cat"></a> gets replaced with
 79 |     <data class="p-category" value="cat"></data>
 80 |     """
 81 | 
 82 |     href = child.get("href", "")
 83 |     rels = child.get("rel", [])
 84 |     if "tag" in rels and href:
 85 |         segments = [seg for seg in href.split("/") if seg]
 86 |         if segments:
 87 |             if html_parser:
 88 |                 soup = bs4.BeautifulSoup("", features=html_parser)
 89 |             else:
 90 |                 soup = bs4.BeautifulSoup("")
 91 | 
 92 |             data = soup.new_tag("data")
 93 |             # this does not use what's given in the JSON
 94 |             # but that is not a problem currently
 95 |             # use mf1 class so it doesn't get removed later
 96 |             data["class"] = ["p-category"]
 97 |             data["value"] = unquote(segments[-1])
 98 |             child.insert_before(data)
 99 |             # remove tag from rels to avoid repeat
100 |             # new list created, original not modifed -> safe on incomplete copy
101 |             child["rel"] = [r for r in rels if r != "tag"]
102 | 
103 | 
104 | def _make_rels_rule(old_rels, new_classes, html_parser):
105 |     """Builds a rule for augmenting an mf1 rel with its mf2 class equivalent(s)."""
106 | 
107 |     # need to special case rel=tag as it operates differently
108 | 
109 |     def f(child, **kwargs):
110 |         child_rels = child.get("rel", [])
111 |         child_classes = child.get("class", [])[:]
112 |         if all(r in child_rels for r in old_rels):
113 |             if "tag" in old_rels:
114 |                 _rel_tag_to_category_rule(child, html_parser, **kwargs)
115 |             else:
116 |                 child_classes.extend(
117 |                     [cl for cl in new_classes if cl not in child_classes]
118 |                 )
119 |                 child["class"] = child_classes
120 | 
121 |     return f
122 | 
123 | 
124 | def _get_rules(old_root, html_parser):
125 |     """for given mf1 root get the rules as a list of functions to act on children"""
126 | 
127 |     class_rules = [
128 |         _make_classes_rule(old_classes.split(), new_classes)
129 |         for old_classes, new_classes in _CLASSIC_MAP[old_root]
130 |         .get("properties", {})
131 |         .items()
132 |     ]
133 |     rel_rules = [
134 |         _make_rels_rule(old_rels.split(), new_classes, html_parser)
135 |         for old_rels, new_classes in _CLASSIC_MAP[old_root].get("rels", {}).items()
136 |     ]
137 | 
138 |     return class_rules + rel_rules
139 | 
140 | 
141 | def root(classes):
142 |     """get all backcompat root classnames"""
143 |     return unordered_list([c for c in classes if c in _CLASSIC_MAP])
144 | 
145 | 
146 | def apply_rules(el, html_parser, filtered_roots):
147 |     """add modern classnames for older mf1 classnames
148 | 
149 |     returns a copy of el and does not modify the original
150 |     """
151 | 
152 |     el_copy = copy.copy(el)
153 | 
154 |     def apply_prop_rules_to_children(parent, rules):
155 |         for child in get_children(parent):
156 |             classes = child.get("class", [])[:]
157 |             # find existing mf2 properties if any and delete them
158 |             child["class"] = [
159 |                 cl for cl in classes if not mf2_classes.is_property_class(cl)
160 |             ]
161 | 
162 |             # apply rules to change mf1 to mf2
163 |             for rule in rules:
164 |                 rule(child)
165 | 
166 |             # recurse if it's not a nested mf1 or mf2 root
167 |             if not (mf2_classes.root(classes, filtered_roots) or root(classes)):
168 |                 apply_prop_rules_to_children(child, rules)
169 | 
170 |     # add mf2 root equivalent
171 |     classes = el_copy.get("class", [])[:]
172 |     old_roots = root(classes)
173 |     for old_root in old_roots:
174 |         new_roots = _CLASSIC_MAP[old_root]["type"]
175 |         classes.extend(new_roots)
176 |     el_copy["class"] = classes
177 | 
178 |     # add mf2 prop equivalent to descendents and remove existing mf2 props
179 |     rules = []
180 |     for old_root in old_roots:
181 |         rules.extend(_get_rules(old_root, html_parser))
182 | 
183 |     apply_prop_rules_to_children(el_copy, rules)
184 | 
185 |     return el_copy
186 | 


--------------------------------------------------------------------------------
/mf2py/datetime_helpers.py:
--------------------------------------------------------------------------------
 1 | """helper functions to deal wit datetime strings"""
 2 | 
 3 | import re
 4 | from datetime import datetime
 5 | 
 6 | # REGEX!
 7 | 
 8 | DATE_RE = r"(\d{4}-\d{2}-\d{2})|(\d{4}-\d{3})"
 9 | SEC_RE = r"(:(?P<second>\d{2})(\.\d+)?)"
10 | RAWTIME_RE = r"(?P<hour>\d{1,2})(:(?P<minute>\d{2})%s?)?" % (SEC_RE)
11 | AMPM_RE = r"am|pm|a\.m\.|p\.m\.|AM|PM|A\.M\.|P\.M\."
12 | TIMEZONE_RE = r"Z|[+-]\d{1,2}:?\d{2}?"
13 | TIME_RE = r"(?P<rawtime>%s)( ?(?P<ampm>%s))?( ?(?P<tz>%s))?" % (
14 |     RAWTIME_RE,
15 |     AMPM_RE,
16 |     TIMEZONE_RE,
17 | )
18 | DATETIME_RE = r"(?P<date>%s)(?P<separator>[T ])(?P<time>%s)" % (DATE_RE, TIME_RE)
19 | 
20 | 
21 | def normalize_datetime(dtstr, match=None):
22 |     """Try to normalize a datetime string.
23 |     1. Convert 12-hour time to 24-hour time
24 | 
25 |     pass match in if we have already calculated it to avoid rework
26 |     """
27 |     match = match or (dtstr and re.match(DATETIME_RE + "$", dtstr))
28 |     if match:
29 |         datestr = match.group("date")
30 |         hourstr = match.group("hour")
31 |         minutestr = match.group("minute") or "00"
32 |         secondstr = match.group("second")
33 |         ampmstr = match.group("ampm")
34 |         separator = match.group("separator")
35 | 
36 |         # convert ordinal date YYYY-DDD to YYYY-MM-DD
37 |         try:
38 |             datestr = datetime.strptime(datestr, "%Y-%j").strftime("%Y-%m-%d")
39 |         except ValueError:
40 |             # datestr was not in YYYY-DDD format
41 |             pass
42 | 
43 |         # 12 to 24 time conversion
44 |         if ampmstr:
45 |             hourstr = match.group("hour")
46 |             hourint = int(hourstr)
47 | 
48 |             if (ampmstr.startswith("a") or ampmstr.startswith("A")) and hourint == 12:
49 |                 hourstr = "00"
50 | 
51 |             if (ampmstr.startswith("p") or ampmstr.startswith("P")) and hourint < 12:
52 |                 hourstr = hourint + 12
53 | 
54 |         dtstr = "%s%s%s:%s" % (datestr, separator, hourstr, minutestr)
55 | 
56 |         if secondstr:
57 |             dtstr += ":" + secondstr
58 | 
59 |         tzstr = match.group("tz")
60 |         if tzstr:
61 |             dtstr += tzstr.replace(":", "")
62 |     return dtstr
63 | 


--------------------------------------------------------------------------------
/mf2py/dom_helpers.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | from urllib.parse import urljoin
  3 | 
  4 | import bs4
  5 | from bs4.element import Comment, NavigableString, Tag
  6 | 
  7 | _whitespace_to_space_regex = re.compile(r"[\n\t\r]+")
  8 | _reduce_spaces_regex = re.compile(r" {2,}")
  9 | 
 10 | 
 11 | def try_urljoin(base, url, allow_fragments=True):
 12 |     """attempts urljoin, on ValueError passes through url. Shortcuts http(s):// urls"""
 13 |     if url.startswith(("https://", "http://")):
 14 |         return url
 15 |     try:
 16 |         url = urljoin(base, url, allow_fragments=allow_fragments)
 17 |     except ValueError:
 18 |         pass
 19 |     return url
 20 | 
 21 | 
 22 | def get_attr(el, attr, check_name=None):
 23 |     """Get the attribute of an element if it exists.
 24 | 
 25 |     Args:
 26 |       el (bs4.element.Tag): a DOM element
 27 |       attr (string): the attribute to get
 28 |       check_name (string or list, optional): a list/tuple of strings or single
 29 |         string, that must match the element's tag name
 30 | 
 31 |     Returns:
 32 |       string: the attribute's value
 33 |     """
 34 |     if check_name is None:
 35 |         return el.get(attr)
 36 |     if isinstance(check_name, str) and el.name == check_name:
 37 |         return el.get(attr)
 38 |     if isinstance(check_name, (tuple, list)) and el.name in check_name:
 39 |         return el.get(attr)
 40 | 
 41 | 
 42 | def parse_srcset(srcset, base_url):
 43 |     """Return a dictionary of sources found in srcset."""
 44 |     sources = {}
 45 |     for url, descriptor in re.findall(
 46 |         r"(\S+)\s*([\d.]+[xw])?\s*,?\s*",
 47 |         srcset,
 48 |         re.MULTILINE,
 49 |     ):
 50 |         if not descriptor:
 51 |             descriptor = "1x"
 52 |         if descriptor not in sources:
 53 |             sources[descriptor] = try_urljoin(base_url, url.strip(","))
 54 |     return sources
 55 | 
 56 | 
 57 | def get_img(img, base_url):
 58 |     """Return a dictionary with src and alt/srcset if present, else just string src."""
 59 |     src = get_attr(img, "src", check_name="img")
 60 |     if src is None:
 61 |         return
 62 |     src = try_urljoin(base_url, src)
 63 |     alt = get_attr(img, "alt", check_name="img")
 64 |     srcset = get_attr(img, "srcset", check_name="img")
 65 |     if alt is not None or srcset:
 66 |         prop_value = {"value": src}
 67 |         if alt is not None:
 68 |             prop_value["alt"] = alt
 69 |         if srcset:
 70 |             prop_value["srcset"] = parse_srcset(srcset, base_url)
 71 |         return prop_value
 72 |     else:
 73 |         return src
 74 | 
 75 | 
 76 | def get_children(node):
 77 |     """An iterator over the immediate children tags of this tag"""
 78 |     for child in node.children:
 79 |         if isinstance(child, bs4.Tag) and child.name != "template":
 80 |             yield child
 81 | 
 82 | 
 83 | def get_descendents(node):
 84 |     """An iterator over the all children tags (descendants) of this tag"""
 85 |     for desc in node.descendants:
 86 |         if isinstance(desc, bs4.Tag) and desc.name != "template":
 87 |             yield desc
 88 | 
 89 | 
 90 | def get_textContent(el, replace_img=False, img_to_src=True, base_url=""):
 91 |     """Get the text content of an element, replacing images by alt or src"""
 92 | 
 93 |     DROP_TAGS = ("script", "style", "template")
 94 |     PRE_TAGS = ("pre",)
 95 |     P_BREAK_BEFORE = 1
 96 |     P_BREAK_AFTER = 0
 97 |     PRE_BEFORE = 2
 98 |     PRE_AFTER = 3
 99 | 
100 |     def text_collection(el, replace_img=False, img_to_src=True, base_url=""):
101 |         # returns array of strings or integers
102 | 
103 |         items = []
104 | 
105 |         # drops the tags defined above and comments
106 |         if el.name in DROP_TAGS or isinstance(el, Comment):
107 |             items = []
108 | 
109 |         elif isinstance(el, NavigableString):
110 |             value = el
111 |             # replace \t \n \r by space
112 |             value = _whitespace_to_space_regex.sub(" ", value)
113 |             # replace multiple spaces with one space
114 |             items = [_reduce_spaces_regex.sub(" ", value)]
115 | 
116 |         # don't do anything special for PRE-formatted tags defined above
117 |         elif el.name in PRE_TAGS:
118 |             items = [PRE_BEFORE, el.get_text(), PRE_AFTER]
119 | 
120 |         elif el.name == "img" and replace_img:
121 |             value = el.get("alt")
122 |             if value is None and img_to_src:
123 |                 value = el.get("src")
124 |                 if value is not None:
125 |                     value = try_urljoin(base_url, value)
126 | 
127 |             if value is not None:
128 |                 items = [" ", value, " "]
129 | 
130 |         elif el.name == "br":
131 |             items = ["\n"]
132 | 
133 |         else:
134 |             for child in el.children:
135 |                 child_items = text_collection(child, replace_img, img_to_src, base_url)
136 |                 items.extend(child_items)
137 | 
138 |             if el.name == "p":
139 |                 items = [P_BREAK_BEFORE] + items + [P_BREAK_AFTER, "\n"]
140 | 
141 |         return items
142 | 
143 |     results = [
144 |         t for t in text_collection(el, replace_img, img_to_src, base_url) if t != ""
145 |     ]
146 | 
147 |     if results:
148 |         # remove <space> if it is first and last or if it is preceded by a <space> or <p> open/close
149 |         length = len(results)
150 |         for i in range(0, length):
151 |             if results[i] == " " and (
152 |                 i == 0
153 |                 or i == length - 1
154 |                 or results[i - 1] == " "
155 |                 or results[i - 1] in (P_BREAK_BEFORE, P_BREAK_AFTER)
156 |                 or results[i + 1] == " "
157 |                 or results[i + 1] in (P_BREAK_BEFORE, P_BREAK_AFTER)
158 |             ):
159 |                 results[i] = ""
160 | 
161 |     if results:
162 |         # remove leading whitespace and <int> i.e. next lines
163 |         while (
164 |             isinstance(results[0], str) and (results[0] == "" or results[0].isspace())
165 |         ) or results[0] in (P_BREAK_BEFORE, P_BREAK_AFTER):
166 |             results.pop(0)
167 |             if not results:
168 |                 break
169 | 
170 |     if results:
171 |         # remove trailing whitespace and <int> i.e. next lines
172 |         while (
173 |             isinstance(results[-1], str)
174 |             and (results[-1] == "" or results[-1].isspace())
175 |         ) or results[-1] in (P_BREAK_BEFORE, P_BREAK_AFTER):
176 |             results.pop(-1)
177 |             if not results:
178 |                 break
179 | 
180 |     # trim leading and trailing non-<pre> whitespace
181 |     if results:
182 |         if isinstance(results[0], str):
183 |             results[0] = results[0].lstrip()
184 |         if isinstance(results[-1], str):
185 |             results[-1] = results[-1].rstrip()
186 | 
187 |     # create final string by concatenating replacing consecutive sequence of <int> by largest value number of \n
188 |     text = ""
189 |     count = 0
190 |     last = None
191 |     for t in results:
192 |         if t in (P_BREAK_BEFORE, P_BREAK_AFTER):
193 |             count = max(t, count)
194 |         elif t == PRE_BEFORE:
195 |             text = text.rstrip(" ")
196 |         elif not isinstance(t, int):
197 |             if count or last == "\n":
198 |                 t = t.lstrip(" ")
199 |             text = "".join([text, "\n" * count, t])
200 |             count = 0
201 |         last = t
202 | 
203 |     return text
204 | 


--------------------------------------------------------------------------------
/mf2py/implied_properties.py:
--------------------------------------------------------------------------------
  1 | from . import mf2_classes
  2 | from .dom_helpers import get_attr, get_children, get_img, get_textContent, try_urljoin
  3 | 
  4 | 
  5 | def name(el, base_url, filtered_roots):
  6 |     """Find an implied name property
  7 | 
  8 |     Args:
  9 |       el (bs4.element.Tag): a DOM element
 10 | 
 11 |     Returns:
 12 |       string: the implied name value
 13 |     """
 14 | 
 15 |     def non_empty(val):
 16 |         """If alt or title is empty, we don't want to use it as the implied
 17 |         name"""
 18 |         return val is not None and val != ""
 19 | 
 20 |     # if image or area use alt text if not empty
 21 |     prop_value = get_attr(el, "alt", check_name=("img", "area"))
 22 |     if non_empty(prop_value):
 23 |         return prop_value
 24 | 
 25 |     # if abbreviation use the title if not empty
 26 |     prop_value = get_attr(el, "title", check_name="abbr")
 27 |     if non_empty(prop_value):
 28 |         return prop_value
 29 | 
 30 |     # find candidate child or grandchild
 31 |     poss_child = None
 32 |     children = list(get_children(el))
 33 |     if len(children) == 1:
 34 |         poss_child = children[0]
 35 | 
 36 |         # ignore if mf2 root
 37 |         if mf2_classes.root(poss_child.get("class", []), filtered_roots):
 38 |             poss_child = None
 39 | 
 40 |         # if it is not img, area, abbr then find grandchild
 41 |         if poss_child and poss_child.name not in ("img", "area", "abbr"):
 42 |             grandchildren = list(get_children(poss_child))
 43 |             # if only one grandchild
 44 |             if len(grandchildren) == 1:
 45 |                 poss_child = grandchildren[0]
 46 |                 # if it is not img, area, abbr or is mf2 root then no possible child
 47 |                 if poss_child.name not in ("img", "area", "abbr") or mf2_classes.root(
 48 |                     poss_child.get("class", []), filtered_roots
 49 |                 ):
 50 |                     poss_child = None
 51 | 
 52 |     # if a possible child was found
 53 |     if poss_child is not None:
 54 |         # use alt if possible child is img or area
 55 |         prop_value = get_attr(poss_child, "alt", check_name=("img", "area"))
 56 |         if non_empty(prop_value):
 57 |             return prop_value
 58 | 
 59 |         # use title if possible child is abbr
 60 |         prop_value = get_attr(poss_child, "title", check_name="abbr")
 61 |         if non_empty(prop_value):
 62 |             return prop_value
 63 | 
 64 |     # use text if all else fails
 65 |     # replace images with alt but not with src in implied name
 66 |     # proposal: https://github.com/microformats/microformats2-parsing/issues/35#issuecomment-393615508
 67 |     return get_textContent(el, replace_img=True, img_to_src=False, base_url=base_url)
 68 | 
 69 | 
 70 | def photo(el, base_url, filtered_roots):
 71 |     """Find an implied photo property
 72 | 
 73 |     Args:
 74 |       el (bs4.element.Tag): a DOM element
 75 |       base_url (string): the base URL to use, to reconcile relative URLs
 76 | 
 77 |     Returns:
 78 |       string or dictionary: the implied photo value or implied photo as a dictionary with alt value
 79 |     """
 80 | 
 81 |     def get_photo_child(children):
 82 |         "take a list of children and finds a valid child for photo property"
 83 | 
 84 |         # if element has one image child use source if exists and img is
 85 |         # not root class
 86 |         poss_imgs = [c for c in children if c.name == "img"]
 87 |         if len(poss_imgs) == 1:
 88 |             poss_img = poss_imgs[0]
 89 |             if not mf2_classes.root(poss_img.get("class", []), filtered_roots):
 90 |                 return poss_img
 91 | 
 92 |         # if element has one object child use data if exists and object is
 93 |         # not root class
 94 |         poss_objs = [c for c in children if c.name == "object"]
 95 |         if len(poss_objs) == 1:
 96 |             poss_obj = poss_objs[0]
 97 |             if not mf2_classes.root(poss_obj.get("class", []), filtered_roots):
 98 |                 return poss_obj
 99 | 
100 |     def resolve_relative_url(prop_value):
101 |         if isinstance(prop_value, dict):
102 |             prop_value["value"] = try_urljoin(base_url, prop_value["value"])
103 |         else:
104 |             prop_value = try_urljoin(base_url, prop_value)
105 |         return prop_value
106 | 
107 |     # if element is an img use source if exists
108 |     if prop_value := get_img(el, base_url):
109 |         return resolve_relative_url(prop_value)
110 | 
111 |     # if element is an object use data if exists
112 |     if prop_value := get_attr(el, "data", check_name="object"):
113 |         return resolve_relative_url(prop_value)
114 | 
115 |     # find candidate child or grandchild
116 |     poss_child = None
117 |     children = list(get_children(el))
118 | 
119 |     poss_child = get_photo_child(children)
120 | 
121 |     # if no possible child found then look for grandchild if only one child which is not not mf2 root
122 |     if (
123 |         poss_child is None
124 |         and len(children) == 1
125 |         and not mf2_classes.root(children[0].get("class", []), filtered_roots)
126 |     ):
127 |         grandchildren = list(get_children(children[0]))
128 |         poss_child = get_photo_child(grandchildren)
129 | 
130 |     # if a possible child was found parse
131 |     if poss_child is not None:
132 |         # img get src
133 |         if prop_value := get_img(poss_child, base_url):
134 |             return resolve_relative_url(prop_value)
135 | 
136 |         # object get data
137 |         if prop_value := get_attr(poss_child, "data", check_name="object"):
138 |             return resolve_relative_url(prop_value)
139 | 
140 | 
141 | def url(el, base_url, filtered_roots):
142 |     """Find an implied url property
143 | 
144 |     Args:
145 |       el (bs4.element.Tag): a DOM element
146 |       base_url (string): the base URL to use, to reconcile relative URLs
147 | 
148 |     Returns:
149 |       string: the implied url value
150 |     """
151 | 
152 |     def get_url_child(children):
153 |         "take a list of children and finds a valid child for url property"
154 | 
155 |         # if element has one <a> child use if not root class
156 |         poss_as = [c for c in children if c.name == "a"]
157 |         if len(poss_as) == 1:
158 |             poss_a = poss_as[0]
159 |             if not mf2_classes.root(poss_a.get("class", []), filtered_roots):
160 |                 return poss_a
161 | 
162 |         # if element has one area child use if not root class
163 |         poss_areas = [c for c in children if c.name == "area"]
164 |         if len(poss_areas) == 1:
165 |             poss_area = poss_areas[0]
166 |             if not mf2_classes.root(poss_area.get("class", []), filtered_roots):
167 |                 return poss_area
168 | 
169 |     # if element is a <a> or area use its href if exists
170 |     prop_value = get_attr(el, "href", check_name=("a", "area"))
171 |     if prop_value is not None:  # an empty href is valid
172 |         return try_urljoin(base_url, prop_value)
173 | 
174 |     # find candidate child or grandchild
175 |     poss_child = None
176 |     children = list(get_children(el))
177 | 
178 |     poss_child = get_url_child(children)
179 | 
180 |     # if no possible child found then look for grandchild if only one child which is not mf2 root
181 |     if (
182 |         poss_child is None
183 |         and len(children) == 1
184 |         and not mf2_classes.root(children[0].get("class", []), filtered_roots)
185 |     ):
186 |         grandchildren = list(get_children(children[0]))
187 |         poss_child = get_url_child(grandchildren)
188 | 
189 |     # if a possible child was found parse
190 |     if poss_child is not None:
191 |         prop_value = get_attr(poss_child, "href", check_name=("a", "area"))
192 |         if prop_value is not None:  # an empty href is valid
193 |             return try_urljoin(base_url, prop_value)
194 | 


--------------------------------------------------------------------------------
/mf2py/metaformats.py:
--------------------------------------------------------------------------------
 1 | """Metaformats parser.
 2 | 
 3 | https://microformats.org/wiki/metaformats
 4 | 
 5 | TODO:
 6 | * explicit mf2 classes on meta tags
 7 |   https://microformats.org/wiki/metaformats#parsing_an_element_for_properties
 8 | """
 9 | from .dom_helpers import try_urljoin
10 | from .mf2_classes import filter_classes
11 | 
12 | METAFORMAT_TO_MF2 = [
13 |     # in priority order, descending
14 |     # OGP
15 |     ("property", "article:author", "author"),
16 |     ("property", "article:published_time", "published"),
17 |     ("property", "article:modified_time", "updated"),
18 |     ("property", "og:audio", "audio"),
19 |     ("property", "og:description", "summary"),
20 |     ("property", "og:image", "photo"),
21 |     ("property", "og:title", "name"),
22 |     ("property", "og:video", "video"),
23 |     # Twitter
24 |     ("name", "twitter:title", "name"),
25 |     ("name", "twitter:description", "summary"),
26 |     ("name", "twitter:image", "photo"),
27 |     # HTML standard meta names
28 |     # https://developer.mozilla.org/en-US/docs/Web/HTML/Element/meta/name
29 |     ("name", "description", "summary"),
30 | ]
31 | OGP_TYPE_TO_MF2 = {
32 |     "article": "h-entry",
33 |     "movie": "h-cite",
34 |     "music": "h-cite",
35 |     "profile": "h-card",
36 | }
37 | URL_PROPERTIES = {
38 |     "article:author",
39 |     "og:audio",
40 |     "og:image",
41 |     "og:video",
42 |     "twitter:image",
43 | }
44 | 
45 | 
46 | def parse(soup, url=None):
47 |     """Extracts and returns a metaformats item from a BeautifulSoup parse tree.
48 | 
49 |     Args:
50 |       soup (bs4.BeautifulSoup): parsed HTML
51 |       url (str): URL of document
52 | 
53 |     Returns:
54 |       dict: mf2 item, or None if the input is not eligible for metaformats
55 |     """
56 |     if not soup.head:
57 |         return None
58 | 
59 |     # Is there a microformat2 root class on the html element?
60 |     if filter_classes(soup.get("class", []))["h"]:
61 |         return None
62 | 
63 |     parsed = {"properties": {}, "source": "metaformats"}
64 |     props = parsed["properties"]
65 | 
66 |     # Properties
67 |     for attr, meta, mf2 in METAFORMAT_TO_MF2:
68 |         if val := soup.head.find("meta", attrs={attr: meta}):
69 |             if content := val.get("content"):
70 |                 if meta in URL_PROPERTIES:
71 |                     content = try_urljoin(url, content)
72 |                 props.setdefault(mf2, [content])
73 | 
74 |     if soup.head.title:
75 |         if text := soup.head.title.text:
76 |             props.setdefault("name", [text])
77 | 
78 |     if not props:
79 |         # No OGP or Twitter properties
80 |         return None
81 | 
82 |     # type from OGP or default to h-entry
83 |     parsed["type"] = ["h-entry"]
84 |     if ogp_type := soup.head.find("meta", property="og:type"):
85 |         if content := ogp_type.get("content"):
86 |             if mf2_type := OGP_TYPE_TO_MF2.get(content.split(".")[0]):
87 |                 parsed["type"] = [mf2_type]
88 | 
89 |     return parsed
90 | 


--------------------------------------------------------------------------------
/mf2py/mf2_classes.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | _mf2_classes_re = re.compile("(p|e|u|dt|h)-((:?[a-z0-9]+-)?[a-z]+(:?-[a-z]+)*)$")
 4 | _mf2_roots_re = re.compile("h-(:?[a-z0-9]+-)?[a-z]+(:?-[a-z]+)*$")
 5 | _mf2_properties_re = re.compile("(p|e|u|dt)-(:?[a-z0-9]+-)?[a-z]+(:?-[a-z]+)*$")
 6 | _mf2_e_properties_re = re.compile("e-(:?[a-z0-9]+-)?[a-z]+(:?-[a-z]+)*$")
 7 | 
 8 | CONFLICTING_ROOTS_TAILWIND = {"auto", "fit", "full", "max", "min", "px", "screen"}
 9 | 
10 | 
11 | def filter_classes(classes, regex=_mf2_classes_re):
12 |     """detect classes that are valid names for mf2, sort in dictionary by prefix"""
13 | 
14 |     types = {x: set() for x in ("u", "p", "dt", "e", "h")}
15 |     for c in classes:
16 |         match = regex.match(c)
17 |         if match:
18 |             if c[0] == "h":
19 |                 types["h"].add(c)
20 |             else:
21 |                 types[match.group(1)].add(match.group(2))
22 |     return types
23 | 
24 | 
25 | def root(classes, filtered_roots):
26 |     return {
27 |         c for c in classes if _mf2_roots_re.match(c) and c[2:] not in filtered_roots
28 |     }
29 | 
30 | 
31 | def is_property_class(class_):
32 |     return _mf2_properties_re.match(class_)
33 | 
34 | 
35 | def has_embedded_class(classes):
36 |     return any(_mf2_e_properties_re.match(c) for c in classes)
37 | 


--------------------------------------------------------------------------------
/mf2py/mf_helpers.py:
--------------------------------------------------------------------------------
 1 | # don't need anymore defer to mf2util instead (mf2util does not have this functionality)
 2 | 
 3 | 
 4 | def get_url(mf):
 5 |     """Given a property value that may be a list of simple URLs or complex
 6 |     h-* dicts (with a url property), extract a list of URLs. This is useful
 7 |     when parsing e.g., in-reply-to.
 8 | 
 9 |     Args:
10 |       mf (string or dict): URL or h-cite-style dict
11 | 
12 |     Returns:
13 |       list: a list of URLs
14 |     """
15 | 
16 |     urls = []
17 |     for item in mf:
18 |         if isinstance(item, str):
19 |             urls.append(item)
20 |         elif isinstance(item, dict) and any(
21 |             x.startswith("h-") for x in item.get("type", [])
22 |         ):
23 |             urls.extend(item.get("properties", {}).get("url", []))
24 | 
25 |     return urls
26 | 
27 | 
28 | def unordered_list(l):
29 |     """given a list, returns another list with unique and alphabetically sorted elements.
30 |     use for HTML attributes that have no semantics to their order e.g. class, rel.
31 |     """
32 |     return sorted(set(l))
33 | 


--------------------------------------------------------------------------------
/mf2py/parse_property.py:
--------------------------------------------------------------------------------
  1 | """functions to parse the properties of elements"""
  2 | 
  3 | import re
  4 | 
  5 | from . import value_class_pattern
  6 | from .datetime_helpers import DATETIME_RE, TIME_RE, normalize_datetime
  7 | from .dom_helpers import get_attr, get_img, get_textContent, try_urljoin
  8 | 
  9 | 
 10 | def text(el, base_url=""):
 11 |     """Process p-* properties"""
 12 | 
 13 |     # handle value-class-pattern
 14 |     prop_value = value_class_pattern.text(el)
 15 |     if prop_value is not None:
 16 |         return prop_value
 17 | 
 18 |     prop_value = get_attr(el, "title", check_name=("abbr", "link"))
 19 |     if prop_value is None:
 20 |         prop_value = get_attr(el, "value", check_name=("data", "input"))
 21 |     if prop_value is None:
 22 |         prop_value = get_attr(el, "alt", check_name=("img", "area"))
 23 |     if prop_value is None:
 24 |         prop_value = get_textContent(el, replace_img=True, base_url=base_url)
 25 | 
 26 |     return prop_value
 27 | 
 28 | 
 29 | def url(el, base_url=""):
 30 |     """Process u-* properties"""
 31 | 
 32 |     prop_value = get_attr(el, "href", check_name=("a", "area", "link"))
 33 |     if prop_value is None:
 34 |         prop_value = get_img(el, base_url)
 35 |         if prop_value is not None:
 36 |             return prop_value
 37 |     if prop_value is None:
 38 |         prop_value = get_attr(
 39 |             el, "src", check_name=("audio", "video", "source", "iframe")
 40 |         )
 41 |     if prop_value is None:
 42 |         prop_value = get_attr(el, "poster", check_name="video")
 43 |     if prop_value is None:
 44 |         prop_value = get_attr(el, "data", check_name="object")
 45 | 
 46 |     if prop_value is None:
 47 |         # handle value-class-pattern
 48 |         prop_value = value_class_pattern.text(el)
 49 | 
 50 |     if prop_value is None:
 51 |         prop_value = get_attr(el, "title", check_name="abbr")
 52 |     if prop_value is None:
 53 |         prop_value = get_attr(el, "value", check_name=("data", "input"))
 54 |     if prop_value is None:
 55 |         prop_value = get_textContent(el)
 56 | 
 57 |     return try_urljoin(base_url, prop_value)
 58 | 
 59 | 
 60 | def datetime(el, default_date=None):
 61 |     """Process dt-* properties
 62 | 
 63 |     Args:
 64 |       el (bs4.element.Tag): Tag containing the dt-value
 65 | 
 66 |     Returns:
 67 |       a tuple (string string): a tuple of two strings, (datetime, date)
 68 |     """
 69 | 
 70 |     # handle value-class-pattern
 71 |     prop_value = value_class_pattern.datetime(el, default_date)
 72 |     if prop_value is not None:
 73 |         return prop_value
 74 | 
 75 |     prop_value = get_attr(el, "datetime", check_name=("time", "ins", "del"))
 76 |     if prop_value is None:
 77 |         prop_value = get_attr(el, "title", check_name="abbr")
 78 |     if prop_value is None:
 79 |         prop_value = get_attr(el, "value", check_name=("data", "input"))
 80 |     if prop_value is None:
 81 |         prop_value = get_textContent(el)
 82 | 
 83 |     # if this is just a time, augment with default date
 84 |     match = re.match(TIME_RE + "$", prop_value)
 85 |     if match and default_date:
 86 |         prop_value = "%s %s" % (default_date, prop_value)
 87 |         return normalize_datetime(prop_value), default_date
 88 | 
 89 |     # otherwise, treat it as a full date
 90 |     match = re.match(DATETIME_RE + "$", prop_value)
 91 |     return (
 92 |         normalize_datetime(prop_value, match=match),
 93 |         match and match.group("date"),
 94 |     )
 95 | 
 96 | 
 97 | def embedded(el, base_url, root_lang, document_lang, expose_dom):
 98 |     """Process e-* properties"""
 99 |     for tag in el.find_all():
100 |         for attr in ("href", "src", "cite", "data", "poster"):
101 |             if attr in tag.attrs:
102 |                 tag.attrs[attr] = try_urljoin(base_url, tag.attrs[attr])
103 |     prop_value = {
104 |         "value": get_textContent(el, replace_img=True, base_url=base_url),
105 |     }
106 |     if lang := el.attrs.get("lang"):
107 |         prop_value["lang"] = lang
108 |     elif root_lang:
109 |         prop_value["lang"] = root_lang
110 |     elif document_lang:
111 |         prop_value["lang"] = document_lang
112 |     if expose_dom:
113 |         prop_value["dom"] = el
114 |     else:
115 |         prop_value["html"] = el.decode_contents().strip()
116 |     return prop_value
117 | 


--------------------------------------------------------------------------------
/mf2py/parser.py:
--------------------------------------------------------------------------------
  1 | import copy
  2 | import json
  3 | from urllib.parse import urlparse
  4 | 
  5 | import requests
  6 | from bs4 import BeautifulSoup, FeatureNotFound
  7 | from bs4.element import Tag
  8 | 
  9 | from . import (
 10 |     backcompat,
 11 |     implied_properties,
 12 |     metaformats,
 13 |     mf2_classes,
 14 |     parse_property,
 15 |     temp_fixes,
 16 | )
 17 | from .dom_helpers import get_attr, get_children, get_descendents, try_urljoin
 18 | from .mf_helpers import unordered_list
 19 | from .version import __version__
 20 | 
 21 | 
 22 | def parse(
 23 |     doc=None,
 24 |     url=None,
 25 |     html_parser=None,
 26 |     expose_dom=False,
 27 |     metaformats=False,
 28 |     filter_roots=False,
 29 | ):
 30 |     """
 31 |     Parse a document or URL for microformats and return a dictionary in mf2json format.
 32 | 
 33 |     Args:
 34 |       doc (file, string or BeautifulSoup doc): file handle, text of content
 35 |         to parse, or BeautifulSoup document. If None it will be fetched from
 36 |         given URL.
 37 |       url (string): URL of the file to be processed. If None it will be
 38 |         extracted from the `<base>` element of given doc.
 39 |       html_parser (string): optional, select a specific HTML parser. Valid options
 40 |         from the BeautifulSoup documentation are: "html", "xml","html5", "lxml",
 41 |         "html5lib", and "html.parser".
 42 |       expose_dom (boolean): optional, expose the DOM of embedded properties.
 43 |       metaformats (boolean): optional, include metaformats extracted from OGP
 44 |         and Twitter card data: https://microformats.org/wiki/metaformats
 45 |       filter_roots (boolean or list): optional, filter root class names. Use
 46 |         True to filter known conflicting classes, otherwise filter given list.
 47 | 
 48 |     Return: a mf2json dict representing the structured data in the document
 49 | 
 50 |     """
 51 |     return Parser(
 52 |         doc,
 53 |         url,
 54 |         html_parser,
 55 |         expose_dom=expose_dom,
 56 |         metaformats=metaformats,
 57 |         filter_roots=filter_roots,
 58 |     ).to_dict()
 59 | 
 60 | 
 61 | class Parser(object):
 62 |     """
 63 |     Parser to parse a document or URL for microformats and output in various formats.
 64 | 
 65 |     Args:
 66 |       doc (file, string or BeautifulSoup doc): file handle, text of content
 67 |         to parse, or BeautifulSoup document. If None it will be fetched from
 68 |         given URL.
 69 |       url (string): URL of the file to be processed. If None it will be
 70 |         extracted from the `<base>` element of given doc.
 71 |       html_parser (string): optional, select a specific HTML parser. Valid options
 72 |         from the BeautifulSoup documentation are: "html", "xml","html5", "lxml",
 73 |         "html5lib", and "html.parser".
 74 |       expose_dom (boolean): optional, expose the DOM of embedded properties.
 75 |       metaformats (boolean): optional, include metaformats extracted from OGP
 76 |         and Twitter card data: https://microformats.org/wiki/metaformats
 77 |       filter_roots (boolean or list): optional, filter root class names. Use
 78 |         True to filter known conflicting classes, otherwise filter given list.
 79 | 
 80 |     Attributes:
 81 |       useragent (string): the User-Agent string for the Parser
 82 | 
 83 |     """
 84 | 
 85 |     ua_desc = "mf2py - microformats2 parser for python"
 86 |     ua_url = "https://github.com/microformats/mf2py"
 87 |     useragent = "{0} - version {1} - {2}".format(ua_desc, __version__, ua_url)
 88 | 
 89 |     def __init__(
 90 |         self,
 91 |         doc=None,
 92 |         url=None,
 93 |         html_parser=None,
 94 |         expose_dom=False,
 95 |         metaformats=False,
 96 |         filter_roots=False,
 97 |     ):
 98 |         self.__url__ = None
 99 |         self.__doc__ = None
100 |         self._preserve_doc = False
101 |         self.__parsed__ = {
102 |             "items": [],
103 |             "rels": {},
104 |             "rel-urls": {},
105 |             "debug": {
106 |                 "description": self.ua_desc,
107 |                 "source": self.ua_url,
108 |                 "version": __version__,
109 |             },
110 |         }
111 |         self.lang = None
112 |         self.expose_dom = expose_dom
113 |         self.__metaformats = metaformats
114 |         try:
115 |             self.filtered_roots = set(filter_roots)
116 |         except TypeError:
117 |             if filter_roots:
118 |                 self.filtered_roots = mf2_classes.CONFLICTING_ROOTS_TAILWIND
119 |             else:
120 |                 self.filtered_roots = []
121 | 
122 |         # use default parser if none specified
123 |         self.__html_parser__ = html_parser or "html5lib"
124 | 
125 |         if url is not None:
126 |             self.__url__ = url
127 | 
128 |             if doc is None:
129 |                 data = requests.get(
130 |                     self.__url__,
131 |                     headers={
132 |                         "User-Agent": self.useragent,
133 |                     },
134 |                 )
135 | 
136 |                 # update to final URL after redirects
137 |                 self.__url__ = data.url
138 | 
139 |                 # HACK: check for character encodings and use 'correct' data
140 |                 if "charset" in data.headers.get("content-type", ""):
141 |                     doc = data.text
142 |                 else:
143 |                     doc = data.content
144 | 
145 |         if doc is not None:
146 |             if isinstance(doc, BeautifulSoup) or isinstance(doc, Tag):
147 |                 self.__doc__ = doc
148 |                 self._preserve_doc = True
149 |             else:
150 |                 try:
151 |                     # try the user-given html parser or default html5lib
152 |                     self.__doc__ = BeautifulSoup(doc, features=self.__html_parser__)
153 |                 except FeatureNotFound:
154 |                     # maybe raise a warning?
155 |                     # else switch to default use
156 |                     self.__doc__ = BeautifulSoup(doc)
157 | 
158 |         # update actual parser used
159 |         # uses builder.NAME from BeautifulSoup
160 |         if isinstance(self.__doc__, BeautifulSoup) and self.__doc__.builder is not None:
161 |             self.__html_parser__ = self.__doc__.builder.NAME
162 |         else:
163 |             self.__html_parser__ = None
164 | 
165 |         # check for <base> tag
166 |         if self.__doc__:
167 |             poss_base = next(
168 |                 (el for el in get_descendents(self.__doc__) if el.name == "base"), None
169 |             )
170 |             if poss_base:
171 |                 poss_base_url = poss_base.get("href")  # try to get href
172 |                 if poss_base_url:
173 |                     if urlparse(poss_base_url).netloc:
174 |                         # base specifies an absolute path
175 |                         self.__url__ = poss_base_url
176 |                     elif self.__url__:
177 |                         # base specifies a relative path
178 |                         self.__url__ = try_urljoin(self.__url__, poss_base_url)
179 | 
180 |         if self.__doc__ is not None:
181 |             if document := self.__doc__.find("html"):
182 |                 self.lang = document.attrs.get("lang")
183 |             # parse!
184 |             self._parse()
185 | 
186 |     def _parse(self):
187 |         """Does the work of actually parsing the document. Done automatically
188 |         on initialization.
189 |         """
190 |         self._default_date = None
191 |         # _default_date exists to provide implementation for rules described
192 |         # in legacy value-class-pattern. basically, if you have two dt-
193 |         # properties and one does not have the full date, it can use the
194 |         # existing date as a template.
195 |         # see value-class-pattern#microformats2_parsers on wiki.
196 |         # see also the implied_relative_datetimes testcase.
197 | 
198 |         def handle_microformat(
199 |             root_class_names,
200 |             el,
201 |             value_property=None,
202 |             simple_value=None,
203 |             backcompat_mode=False,
204 |         ):
205 |             """Handles a (possibly nested) microformat, i.e. h-*"""
206 |             properties = {}
207 |             children = []
208 |             self._default_date = None
209 |             # for processing implied properties: collects if property types (p, e, u, d(t)) or children (h) have been processed
210 |             parsed_types_aggregation = set()
211 | 
212 |             if backcompat_mode:
213 |                 el = backcompat.apply_rules(
214 |                     el, self.__html_parser__, self.filtered_roots
215 |                 )
216 |                 root_class_names = mf2_classes.root(
217 |                     el.get("class", []), self.filtered_roots
218 |                 )
219 | 
220 |             root_lang = el.attrs.get("lang")
221 | 
222 |             # parse for properties and children
223 |             for child in get_children(el):
224 |                 (
225 |                     child_props,
226 |                     child_children,
227 |                     child_parsed_types_aggregation,
228 |                 ) = parse_props(child, root_lang)
229 |                 for key, new_value in child_props.items():
230 |                     prop_value = properties.get(key, [])
231 |                     prop_value.extend(new_value)
232 |                     properties[key] = prop_value
233 |                 children.extend(child_children)
234 |                 parsed_types_aggregation.update(child_parsed_types_aggregation)
235 | 
236 |             # complex h-* objects can take their "value" from the
237 |             # first explicit property ("name" for p-* or "url" for u-*)
238 |             if value_property and value_property in properties:
239 |                 simple_value = properties[value_property][0]
240 | 
241 |             # if some properties not already found find in implied ways unless in backcompat mode
242 |             if not backcompat_mode:
243 |                 # stop implied name if any p-*, e-*, h-* is already found
244 |                 if "name" not in properties and parsed_types_aggregation.isdisjoint(
245 |                     "peh"
246 |                 ):
247 |                     properties["name"] = [
248 |                         implied_properties.name(el, self.__url__, self.filtered_roots)
249 |                     ]
250 | 
251 |                 if "photo" not in properties and parsed_types_aggregation.isdisjoint(
252 |                     "uh"
253 |                 ):
254 |                     x = implied_properties.photo(el, self.__url__, self.filtered_roots)
255 |                     if x is not None:
256 |                         properties["photo"] = [x]
257 | 
258 |                 # stop implied url if any u-* or h-* is already found
259 |                 if "url" not in properties and parsed_types_aggregation.isdisjoint(
260 |                     "uh"
261 |                 ):
262 |                     x = implied_properties.url(el, self.__url__, self.filtered_roots)
263 |                     if x is not None:
264 |                         properties["url"] = [x]
265 | 
266 |             # build microformat with type and properties
267 |             microformat = {
268 |                 "type": [class_name for class_name in sorted(root_class_names)],
269 |                 "properties": properties,
270 |             }
271 |             if el.name == "area":
272 |                 shape = get_attr(el, "shape")
273 |                 if shape is not None:
274 |                     microformat["shape"] = shape
275 | 
276 |                 coords = get_attr(el, "coords")
277 |                 if coords is not None:
278 |                     microformat["coords"] = coords
279 | 
280 |             # insert children if any
281 |             if children:
282 |                 microformat["children"] = children
283 | 
284 |             Id = get_attr(el, "id")
285 |             if Id:
286 |                 microformat["id"] = Id
287 | 
288 |             # simple value is the parsed property value if it were not
289 |             # an h-* class
290 |             if simple_value is not None:
291 |                 if isinstance(simple_value, dict):
292 |                     # for e-* properties, the simple value will be
293 |                     # {"html":..., "value":...}  which we should fold
294 |                     # into the microformat object
295 |                     # details: https://github.com/microformats/mf2py/issues/35
296 |                     microformat.update(simple_value)
297 |                 else:
298 |                     microformat["value"] = simple_value
299 | 
300 |             if root_lang:
301 |                 microformat["lang"] = root_lang
302 |             elif self.lang:
303 |                 microformat["lang"] = self.lang
304 |             return microformat
305 | 
306 |         def parse_props(el, root_lang):
307 |             """Parse the properties from a single element"""
308 |             props = {}
309 |             children = []
310 |             # for processing implied properties: collects if property types (p, e, u, d(t)) or children (h) have been processed
311 |             parsed_types_aggregation = set()
312 | 
313 |             classes = el.get("class", [])
314 |             filtered_classes = mf2_classes.filter_classes(classes)
315 |             # Is this element a microformat2 root?
316 |             root_class_names = filtered_classes["h"]
317 |             backcompat_mode = False
318 | 
319 |             # Is this element a microformat1 root?
320 |             if not root_class_names:
321 |                 root_class_names = backcompat.root(classes)
322 |                 backcompat_mode = True
323 | 
324 |             if root_class_names:
325 |                 parsed_types_aggregation.add("h")
326 | 
327 |             # Is this a property element (p-*, u-*, etc.) flag
328 |             # False is default
329 |             is_property_el = False
330 | 
331 |             # Parse plaintext p-* properties.
332 |             p_value = None
333 |             for prop_name in filtered_classes["p"]:
334 |                 is_property_el = True
335 |                 parsed_types_aggregation.add("p")
336 |                 prop_value = props.setdefault(prop_name, [])
337 | 
338 |                 # if value has not been parsed then parse it
339 |                 if p_value is None:
340 |                     p_value = parse_property.text(el, base_url=self.__url__)
341 | 
342 |                 if root_class_names:
343 |                     prop_value.append(
344 |                         handle_microformat(
345 |                             root_class_names,
346 |                             el,
347 |                             value_property="name",
348 |                             simple_value=p_value,
349 |                             backcompat_mode=backcompat_mode,
350 |                         )
351 |                     )
352 |                 else:
353 |                     prop_value.append(p_value)
354 | 
355 |             # Parse URL u-* properties.
356 |             u_value = None
357 |             for prop_name in filtered_classes["u"]:
358 |                 is_property_el = True
359 |                 parsed_types_aggregation.add("u")
360 |                 prop_value = props.setdefault(prop_name, [])
361 | 
362 |                 # if value has not been parsed then parse it
363 |                 if u_value is None:
364 |                     u_value = parse_property.url(el, base_url=self.__url__)
365 | 
366 |                 if root_class_names:
367 |                     prop_value.append(
368 |                         handle_microformat(
369 |                             root_class_names,
370 |                             el,
371 |                             value_property="url",
372 |                             simple_value=u_value,
373 |                             backcompat_mode=backcompat_mode,
374 |                         )
375 |                     )
376 |                 else:
377 |                     if isinstance(u_value, dict):
378 |                         prop_value.append(u_value)
379 |                     else:
380 |                         prop_value.append(u_value)
381 | 
382 |             # Parse datetime dt-* properties.
383 |             dt_value = None
384 |             for prop_name in filtered_classes["dt"]:
385 |                 is_property_el = True
386 |                 parsed_types_aggregation.add("d")
387 |                 prop_value = props.setdefault(prop_name, [])
388 | 
389 |                 # if value has not been parsed then parse it
390 |                 if dt_value is None:
391 |                     dt_value, new_date = parse_property.datetime(el, self._default_date)
392 |                     # update the default date
393 |                     if new_date:
394 |                         self._default_date = new_date
395 | 
396 |                 if root_class_names:
397 |                     stops_implied_name = True
398 |                     prop_value.append(
399 |                         handle_microformat(
400 |                             root_class_names,
401 |                             el,
402 |                             simple_value=dt_value,
403 |                             backcompat_mode=backcompat_mode,
404 |                         )
405 |                     )
406 |                 else:
407 |                     if dt_value is not None:
408 |                         prop_value.append(dt_value)
409 | 
410 |             # Parse embedded markup e-* properties.
411 |             e_value = None
412 |             for prop_name in filtered_classes["e"]:
413 |                 is_property_el = True
414 |                 parsed_types_aggregation.add("e")
415 |                 prop_value = props.setdefault(prop_name, [])
416 | 
417 |                 # if value has not been parsed then parse it
418 |                 if e_value is None:
419 |                     # send original element for parsing backcompat
420 |                     if el.original is None:
421 |                         embedded_el = el
422 |                     else:
423 |                         embedded_el = el.original
424 |                     if self._preserve_doc:
425 |                         embedded_el = copy.copy(embedded_el)
426 |                     temp_fixes.rm_templates(embedded_el)
427 |                     e_value = parse_property.embedded(
428 |                         embedded_el, self.__url__, root_lang, self.lang, self.expose_dom
429 |                     )
430 | 
431 |                 if root_class_names:
432 |                     stops_implied_name = True
433 |                     prop_value.append(
434 |                         handle_microformat(
435 |                             root_class_names,
436 |                             el,
437 |                             simple_value=e_value,
438 |                             backcompat_mode=backcompat_mode,
439 |                         )
440 |                     )
441 |                 else:
442 |                     prop_value.append(e_value)
443 | 
444 |             # if this is not a property element, but it is a h-* microformat,
445 |             # add it to our list of children
446 |             if not is_property_el and root_class_names:
447 |                 children.append(
448 |                     handle_microformat(
449 |                         root_class_names, el, backcompat_mode=backcompat_mode
450 |                     )
451 |                 )
452 |             # parse child tags, provided this isn't a microformat root-class
453 |             if not root_class_names:
454 |                 for child in get_children(el):
455 |                     (
456 |                         child_properties,
457 |                         child_microformats,
458 |                         child_parsed_types_aggregation,
459 |                     ) = parse_props(child, root_lang)
460 |                     for prop_name in child_properties:
461 |                         v = props.get(prop_name, [])
462 |                         v.extend(child_properties[prop_name])
463 |                         props[prop_name] = v
464 |                     children.extend(child_microformats)
465 |                     parsed_types_aggregation.update(child_parsed_types_aggregation)
466 |             return props, children, parsed_types_aggregation
467 | 
468 |         def parse_rels(el):
469 |             """Parse an element for rel microformats"""
470 |             rel_attrs = get_attr(el, "rel")
471 |             # if rel attributes exist
472 |             if rel_attrs is not None:
473 |                 # find the url and normalise it
474 |                 url = try_urljoin(self.__url__, el.get("href", ""))
475 |                 value_dict = self.__parsed__["rel-urls"].get(url, {})
476 | 
477 |                 # 1st one wins
478 |                 if "text" not in value_dict:
479 |                     value_dict["text"] = el.get_text().strip()
480 | 
481 |                 url_rels = value_dict.get("rels", [])
482 |                 value_dict["rels"] = url_rels
483 | 
484 |                 for knownattr in ("media", "hreflang", "type", "title"):
485 |                     x = get_attr(el, knownattr)
486 |                     # 1st one wins
487 |                     if x is not None and knownattr not in value_dict:
488 |                         value_dict[knownattr] = x
489 | 
490 |                 self.__parsed__["rel-urls"][url] = value_dict
491 | 
492 |                 for rel_value in rel_attrs:
493 |                     value_list = self.__parsed__["rels"].get(rel_value, [])
494 |                     if url not in value_list:
495 |                         value_list.append(url)
496 |                     if rel_value not in url_rels:
497 |                         url_rels.append(rel_value)
498 | 
499 |                     self.__parsed__["rels"][rel_value] = value_list
500 |                 if "alternate" in rel_attrs:
501 |                     alternate_list = self.__parsed__.get("alternates", [])
502 |                     alternate_dict = {}
503 |                     alternate_dict["url"] = url
504 |                     x = " ".join([r for r in rel_attrs if not r == "alternate"])
505 |                     if x != "":
506 |                         alternate_dict["rel"] = x
507 |                     alternate_dict["text"] = el.get_text().strip()
508 |                     for knownattr in ("media", "hreflang", "type", "title"):
509 |                         x = get_attr(el, knownattr)
510 |                         if x is not None:
511 |                             alternate_dict[knownattr] = x
512 |                     alternate_list.append(alternate_dict)
513 |                     self.__parsed__["alternates"] = alternate_list
514 | 
515 |         def parse_el(el, ctx):
516 |             """Parse an element for microformats"""
517 |             classes = el.get("class", [])
518 | 
519 |             # find potential microformats in root classnames h-*
520 |             potential_microformats = mf2_classes.root(classes, self.filtered_roots)
521 | 
522 |             # if potential microformats found parse them
523 |             if potential_microformats:
524 |                 result = handle_microformat(potential_microformats, el)
525 |                 ctx.append(result)
526 |             else:
527 |                 # find backcompat root classnames
528 |                 potential_microformats = backcompat.root(classes)
529 |                 if potential_microformats:
530 |                     result = handle_microformat(
531 |                         potential_microformats, el, backcompat_mode=True
532 |                     )
533 |                     ctx.append(result)
534 |                 else:
535 |                     # parse child tags
536 |                     for child in get_children(el):
537 |                         parse_el(child, ctx)
538 | 
539 |         ctx = []
540 | 
541 |         if self.__metaformats:
542 |             # extract out a metaformats item, if available
543 |             self.__metaformats_item = metaformats.parse(self.__doc__, url=self.__url__)
544 | 
545 |         # start parsing at root element of the document
546 |         parse_el(self.__doc__, ctx)
547 |         self.__parsed__["items"] = ctx
548 |         if self.__metaformats and self.__metaformats_item:
549 |             self.__parsed__["items"].append(self.__metaformats_item)
550 | 
551 |         # parse for rel values
552 |         for el in get_descendents(self.__doc__):
553 |             if el.name in ("a", "area", "link") and el.has_attr("rel"):
554 |                 parse_rels(el)
555 | 
556 |         # sort the rels array in rel-urls since this should be unordered set
557 |         for url in self.__parsed__["rel-urls"]:
558 |             if "rels" in self.__parsed__["rel-urls"][url]:
559 |                 rels = self.__parsed__["rel-urls"][url]["rels"]
560 |                 self.__parsed__["rel-urls"][url]["rels"] = unordered_list(rels)
561 | 
562 |         # add actual parser used to debug
563 |         # uses builder.NAME from BeautifulSoup
564 |         if self.__html_parser__:
565 |             self.__parsed__["debug"]["markup parser"] = self.__html_parser__
566 |         else:
567 |             self.__parsed__["debug"]["markup parser"] = "unknown"
568 | 
569 |     def to_dict(self, filter_by_type=None):
570 |         """Get a dictionary version of the parsed microformat document.
571 | 
572 |         Args:
573 |           filter_by_type (string, optional): only include top-level items of
574 |             the given h-* type. Defaults to None.
575 | 
576 |         Returns:
577 |             dict: representation of the parsed microformats document
578 |         """
579 |         if filter_by_type is None:
580 |             return self.__parsed__
581 |         else:
582 |             return [x for x in self.__parsed__["items"] if filter_by_type in x["type"]]
583 | 
584 |     def to_json(self, pretty_print=False, filter_by_type=None):
585 |         """Get a json-encoding string version of the parsed microformats document
586 | 
587 |         Args:
588 |           pretty_print (bool, optional): Encode the json document with
589 |             linebreaks and indents to improve readability. Defaults to False.
590 |           filter_by_type (bool, optional): only include top-level items of
591 |             the given h-* type
592 | 
593 |         Returns:
594 |             string: a json-encoded string
595 |         """
596 | 
597 |         if pretty_print:
598 |             return json.dumps(
599 |                 self.to_dict(filter_by_type), indent=4, separators=(", ", ": ")
600 |             )
601 |         else:
602 |             return json.dumps(self.to_dict(filter_by_type))
603 | 


--------------------------------------------------------------------------------
/mf2py/temp_fixes.py:
--------------------------------------------------------------------------------
1 | def rm_templates(doc):
2 |     for el in doc.find_all("template"):
3 |         el.extract()
4 | 


--------------------------------------------------------------------------------
/mf2py/value_class_pattern.py:
--------------------------------------------------------------------------------
 1 | """functions to parse the properties of elements according to the value class pattern http://microformats.org/wiki/value-class-pattern """
 2 | 
 3 | import re
 4 | 
 5 | from .datetime_helpers import (
 6 |     DATE_RE,
 7 |     DATETIME_RE,
 8 |     TIME_RE,
 9 |     TIMEZONE_RE,
10 |     normalize_datetime,
11 | )
12 | from .dom_helpers import get_children
13 | 
14 | 
15 | def _get_vcp_value(el):
16 |     if "value-title" in el.get("class", []):
17 |         return el.get("title")
18 |     return el.get_text()
19 | 
20 | 
21 | def _get_vcp_children(el):
22 |     return [
23 |         c
24 |         for c in get_children(el)
25 |         if c.has_attr("class")
26 |         and ("value" in c["class"] or "value-title" in c["class"])
27 |     ]
28 | 
29 | 
30 | def text(el):
31 |     value_els = _get_vcp_children(el)
32 |     if value_els:
33 |         return "".join(_get_vcp_value(el) for el in value_els)
34 | 
35 | 
36 | def datetime(el, default_date=None):
37 |     value_els = _get_vcp_children(el)
38 |     if value_els:
39 |         date_parts = []
40 |         for value_el in value_els:
41 |             if "value-title" in value_el.get("class", []):
42 |                 title = el.get("title")
43 |                 if title:
44 |                     date_parts.append(title.strip())
45 |             elif value_el.name in ("img", "area"):
46 |                 alt = value_el.get("alt") or value_el.get_text()
47 |                 if alt:
48 |                     date_parts.append(alt.strip())
49 |             elif value_el.name == "data":
50 |                 val = value_el.get("value") or value_el.get_text()
51 |                 if val:
52 |                     date_parts.append(val.strip())
53 |             elif value_el.name == "abbr":
54 |                 title = value_el.get("title") or value_el.get_text()
55 |                 if title:
56 |                     date_parts.append(title.strip())
57 |             elif value_el.name in ("del", "ins", "time"):
58 |                 dt = value_el.get("datetime") or value_el.get_text()
59 |                 if dt:
60 |                     date_parts.append(dt.strip())
61 |             else:
62 |                 val = value_el.get_text()
63 |                 if val:
64 |                     date_parts.append(val.strip())
65 | 
66 |         date_part = time_part = tz_part = None
67 | 
68 |         for part in date_parts:
69 |             match = re.match(DATETIME_RE + "$", part)
70 |             if match:
71 |                 # if it's a full datetime, then we're done
72 |                 date_part = match.group("date")
73 |                 return normalize_datetime(part, match=match), date_part
74 | 
75 |             # only use first found value
76 |             if re.match(TIME_RE + "$", part) and time_part is None:
77 |                 time_part = part
78 |             elif re.match(DATE_RE + "$", part) and date_part is None:
79 |                 date_part = part
80 |             elif re.match(TIMEZONE_RE + "$", part) and tz_part is None:
81 |                 tz_part = part
82 | 
83 |         # use default date
84 |         if date_part is None:
85 |             date_part = default_date
86 | 
87 |         if date_part and time_part:
88 |             date_time_value = "%s %s" % (date_part, time_part)
89 |         else:
90 |             date_time_value = date_part or time_part
91 | 
92 |         if tz_part:
93 |             date_time_value += tz_part
94 | 
95 |         return normalize_datetime(date_time_value), date_part
96 | 


--------------------------------------------------------------------------------
/mf2py/version.py:
--------------------------------------------------------------------------------
1 | # Define the version number. This class is exec'd by setup.py to read
2 | # the value without loading mf2py (loading mf2py is bad if its dependencies
3 | # haven't been installed yet, which is common during setup)
4 | 
5 | import importlib.metadata
6 | 
7 | __version__ = importlib.metadata.metadata("mf2py")["Version"]
8 | 


--------------------------------------------------------------------------------
/mkdocs.yml:
--------------------------------------------------------------------------------
 1 | site_name: mf2py
 2 | site_url: https://microformats.github.io/mf2py
 3 | site_author: Microformats
 4 | site_description: A microformats2 parser written in Python.
 5 | repo_name: microformats/mf2py
 6 | repo_url: https://github.com/microformats/mf2py
 7 | edit_uri: https://github.com/microformats/mf2py/tree/main/docs
 8 | copyright: MIT License
 9 | 
10 | extra_css:
11 |   - stylesheets/styles.css
12 | 
13 | nav:
14 |   - Home: index.md
15 |   - Parser Object: parser.md
16 |   - Change Log: changelog.md
17 | 
18 | theme:
19 |   name: 'material'
20 |   logo: https://microformats.org/microformats-logo.png
21 |   favicon: https://microformats.org/microformats-logo.png
22 |   palette:
23 |     # Palette for light mode
24 |     - scheme: default
25 |       primary: 'custom'
26 |       toggle:
27 |         icon: material/brightness-7
28 |         name: Switch to dark mode
29 |   font:
30 |     text: Roboto
31 |     code: Roboto Mono
32 | 
33 | plugins:
34 |   - mkdocstrings
35 |   - search
36 | 
37 | markdown_extensions:
38 |   - admonition
39 |   - pymdownx.details
40 |   - pymdownx.superfences
41 |   - attr_list
42 |   - md_in_html
43 |   - pymdownx.tabbed:
44 |       alternate_style: true
45 |   - toc:
46 |       permalink: true
47 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [tool.poetry]
 2 | name = "mf2py"
 3 | version = "2.0.1"
 4 | description = "Microformats parser"
 5 | readme = "README.md"
 6 | authors = ["Tom Morris <tom@tommorris.org>"]
 7 | license = "MIT"
 8 | classifiers = [
 9 |     "Intended Audience :: Developers",
10 |     "License :: OSI Approved :: MIT License",
11 |     "Programming Language :: Python :: 3",
12 |     "Topic :: Text Processing :: Markup :: HTML"
13 | ]
14 | 
15 | [tool.poetry.dependencies]
16 | python = ">=3.8"
17 | html5lib = "^1.1"
18 | requests = "^2.28.2"
19 | beautifulsoup4 = "^4.11.1"
20 | 
21 | [tool.poetry.group.dev.dependencies]
22 | lxml = "^4.9.2"
23 | pytest = "^7.2.1"
24 | black = "^23.3.0"
25 | isort = "^5.12.0"
26 | mkdocs = "^1.5.3"
27 | mkdocs-material = "^9.5.0"
28 | mkdocstrings = {extras = ["python"], version = "^0.24.0"}
29 | 
30 | [tool.pytest.ini_options]
31 | doctest_optionflags = "NORMALIZE_WHITESPACE IGNORE_EXCEPTION_DETAIL"
32 | 
33 | [tool.isort]
34 | profile = "black"
35 | 
36 | [build-system]
37 | requires = ["poetry-core>=1.0.0"]
38 | build-backend = "poetry.core.masonry.api"
39 | 


--------------------------------------------------------------------------------
/test/examples/area.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE HTML>
 2 | <html>
 3 | <head>
 4 |   <meta http-equiv="content-type" content="text/html; charset=utf-8" />
 5 |   <title>hCard Area Example</title>
 6 | </head>
 7 | <body>
 8 |   <p>Hey Brian.</p>
 9 |   <img src="https://farm4.staticflickr.com/3836/14964945447_f750df4901.jpg" border="0" width="333" height="500" usemap="#sudamap" alt="Brian Suda at dConstruct" />
10 |   <map id="sudamap" name="sudamap">
11 |     <area alt="Brian Suda" title="Brian Suda" href="http://suda.co.uk" shape="poly" coords="83,195,116,169,160,97,215,108,214,144,193,191,194,237,196,294,236,313,229,341,207,351,195,373,205,496,78,498,70,365" style="outline:none;" class="h-card" />
12 |   </map>
13 | </body>
14 | </html>
15 | 


--------------------------------------------------------------------------------
/test/examples/backcompat/feed_with_rel_bookmark.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 |   <head>
 4 |     <title>Backcompat test for hEntry with nested rel=bookmark</title>
 5 |     <!-- This should not affect parsing elsewhere -->
 6 |     <link rel="bookmark" href="/about">
 7 |   </head>
 8 |   <body>
 9 |     <!-- This should not affect parsing elsewhere -->
10 |     <a rel="bookmark" href="/"></a>
11 | 
12 |     <article class="hentry">
13 |       <span class="author">Lee Adama</span>
14 |       <span class="entry-title">Jumping Rope for Weight Loss</span>
15 |       <div class="entry-content">Some Content</div>
16 |       <a rel="bookmark" href="/2014/11/24/jump-rope">Nov 24, 2014</a>
17 |     </article>
18 | 
19 |     <article class="hentry">
20 |       <span class="author">Kara Thrace</span>
21 |       <span class="entry-title">Abstract Art in Graffiti</span>
22 |       <div class="entry-content">More Content</div>
23 |       <a rel="bookmark" href="/2014/11/23/graffiti">Nov 23, 2014</a>
24 |     </article>
25 | 
26 |     <article class="hentry">
27 |       <span class="author">President Roslyn</span>
28 |       <span class="entry-title">Dreams of Earth</span>
29 |       <div class="entry-content">Additional Content</div>
30 |       <a rel="bookmark" href="/2014/11/21/earth">Nov 21, 2014</a>
31 |     </article>
32 | 
33 |     <article class="hentry">
34 |       <span class="author">Chief Tyrrol</span>
35 |       <span class="entry-title">Organized Labor in Mining Colonies</span>
36 |       <div class="entry-content">More Content</div>
37 |       <a rel="bookmark" href="/2014/11/19/labor">Nov 19, 2014</a>
38 |     </article>
39 | 
40 |   </body>
41 | </html>
42 | 


--------------------------------------------------------------------------------
/test/examples/backcompat/hentry.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 | <head>
 4 |   <meta http-equiv="content-type" content="text/html; charset=utf-8">
 5 |   <title>Backcompat Properties</title>
 6 | </head>
 7 | <body>
 8 | 
 9 |   <article class="hentry">
10 |     <span class="author">Tom Morris</span>
11 |     <span class="entry-title">A Title</span>
12 |     <div class="entry-content">Some Content</div>
13 |   </article>
14 | 
15 | </body>
16 | </html>
17 | 


--------------------------------------------------------------------------------
/test/examples/backcompat/hentry_content_html.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 | <head>
 4 |   <meta http-equiv="content-type" content="text/html; charset=utf-8">
 5 |   <title>Backcompat Properties</title>
 6 | </head>
 7 | <body>
 8 | 
 9 | <article class="hentry">
10 |     <section class="entry-content">
11 |         <p class="entry-summary">This is a summary</p> 
12 |         <p>This is <a href="/tags/mytag" rel="tag">mytag</a> inside content. </p>
13 |     </section>
14 | </article>
15 | 
16 | </body>
17 | </html>
18 | 


--------------------------------------------------------------------------------
/test/examples/backcompat/hentry_with_rel_bookmark.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 |   <head>
 4 |     <meta http-equiv="content-type" content="text/html; charset=utf-8">
 5 |     <title>Backcompat rel=bookmark</title>
 6 |   </head>
 7 |   <body>
 8 |     <article class="hentry">
 9 |       <a rel="bookmark" href="https://example.com/bookmark">rhinoceros</a>
10 |       <a rel="bookmark u-url" href="https://example.com/bookmark-url">rhinoceros</a>
11 |       <a rel="u-url" href="https://example.com/url">rhinoceros</a>
12 |     </article>
13 |   </body>
14 | </html>
15 | 


--------------------------------------------------------------------------------
/test/examples/backcompat/hentry_with_rel_tag.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 |   <head>
 4 |     <meta http-equiv="content-type" content="text/html; charset=utf-8">
 5 |     <title>Backcompat rel=tag on hentry</title>
 6 |   </head>
 7 |   <body>
 8 |     <article class="hentry">
 9 |       <a rel="tag" href="https://example.com/tags/cat">rhinoceros</a>
10 |       <a rel="tag" href="https://example.com/tags/dog/">giraffe</a>
11 |       <a rel="tag" href="https://example.com/tags/mountain%20lion/">hyena</a>
12 |       <!-- throw some bad input at the parser -->
13 |       <a rel="tag"></a>
14 |       <a rel="tag" href=""></a>
15 |       <!-- throw some mf2 at the parser to ignore -->
16 |       <a rel="tag" class="p-category" href="https://example.com/tags/mouse">elephant</a>
17 |       <a rel="tag" class="u-category" href="https://example.com/tags/meerkat">dinosaur</a>
18 |     </article>
19 |   </body>
20 | </html>
21 | 


--------------------------------------------------------------------------------
/test/examples/backcompat/hentry_with_rel_tag_entry_title.html:
--------------------------------------------------------------------------------
1 | <article class="hentry">
2 |       <a rel="tag" class="entry-title" href="https://example.com/tags/cat">rhinoceros</a>
3 | </article>
4 | 


--------------------------------------------------------------------------------
/test/examples/backcompat/hfeed_with_rel_tag.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 |   <head>
 4 |     <meta http-equiv="content-type" content="text/html; charset=utf-8">
 5 |     <title>Backcompat rel=tag on hfeed</title>
 6 |   </head>
 7 |   <body>
 8 |     <article class="hfeed">
 9 |       <a rel="tag" href="https://example.com/tags/cat">rhinoceros</a>
10 |       <a rel="tag" href="https://example.com/tags/dog/">giraffe</a>
11 |       <a rel="tag" href="https://example.com/tags/mountain%20lion/">hyena</a>
12 |       <!-- throw some bad input at the parser -->
13 |       <a rel="tag"></a>
14 |       <a rel="tag" href=""></a>
15 |       <!-- throw some mf2 at the parser to ignore -->
16 |       <a rel="tag" class="p-category" href="https://example.com/tags/mouse">elephant</a>
17 |       <a rel="tag" class="u-category" href="https://example.com/tags/meerkat">dinosaur</a>
18 |     </article>
19 |   </body>
20 | </html>
21 | 


--------------------------------------------------------------------------------
/test/examples/backcompat/hproduct.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE HTML>
 2 | <html>
 3 | <head>
 4 | <title>hProduct Backcompat Test</title>
 5 | </head>
 6 | <body>
 7 | <div class="hproduct">
 8 |   <h1 class="fn">Tom's Magical Quack Tincture</h1>
 9 |   <dl>
10 |     <dt>Brand</dt>
11 |     <dd class="brand">Quacktastic Products</dd>
12 |     <dt>Category</dt>
13 |     <dd class="category">bullshit</dd>
14 |     <dt>Price</dt>
15 |     <dd class="price">£299.99</dd>
16 |     <dt>Description</dt>
17 |     <dd class="description">Magical tasty sugar pills that don't do anything.</dd>
18 |     <dt>Identifier</dt>
19 |     <dd><a href="#BULLSHIT-001" class="identifier">BULLSHIT-001</a></dd>
20 |   </dl>
21 |   <p class="review">
22 |     I'm a gullible idiot and I love giving money to random developers on the internet
23 |     who are pretending to be quacks for a laugh while writing tests. I love this product!
24 |   </p>
25 | </div>
26 | </body>
27 | </html>
28 | 


--------------------------------------------------------------------------------
/test/examples/backcompat/hproduct_hreview_nested.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE HTML>
 2 | <html>
 3 | <head>
 4 | <title>hProduct Backcompat Test</title>
 5 | </head>
 6 | <body>
 7 | <div class="hproduct">
 8 |   <h1 class="fn">Tom's Magical Quack Tincture</h1>
 9 |   <dl>
10 |     <dt>Brand</dt>
11 |     <dd class="brand">Quacktastic Products</dd>
12 |     <dt>Category</dt>
13 |     <dd class="category">bullshit</dd>
14 |     <dt>Price</dt>
15 |     <dd class="price">£299.99</dd>
16 |     <dt>Description</dt>
17 |     <dd class="description">Magical tasty sugar pills that don't do anything.</dd>
18 |     <dt>Identifier</dt>
19 |     <dd><a href="#BULLSHIT-001" class="identifier">BULLSHIT-001</a></dd>
20 |   </dl>
21 |   <p class="hreview">
22 |     I'm a gullible idiot and I love giving money to random developers on the internet
23 |     who are pretending to be quacks for a laugh while writing tests. I love this product!
24 |   </p>
25 | </div>
26 | </body>
27 | </html>
28 | 
29 | 


--------------------------------------------------------------------------------
/test/examples/backcompat/hrecipe_with_rel_tag.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 |   <head>
 4 |     <meta http-equiv="content-type" content="text/html; charset=utf-8">
 5 |     <title>Backcompat rel=tag on hrecipe</title>
 6 |   </head>
 7 |   <body>
 8 |     <article class="hrecipe">
 9 |       <a rel="tag" href="https://example.com/tags/cat">rhinoceros</a>
10 |       <a rel="tag" href="https://example.com/tags/dog/">giraffe</a>
11 |       <a rel="tag" href="https://example.com/tags/mountain%20lion/">hyena</a>
12 |       <!-- throw some bad input at the parser -->
13 |       <a rel="tag"></a>
14 |       <a rel="tag" href=""></a>
15 |       <!-- throw some mf2 at the parser to ignore -->
16 |       <a rel="tag" class="p-category" href="https://example.com/tags/mouse">elephant</a>
17 |       <a rel="tag" class="u-category" href="https://example.com/tags/meerkat">dinosaur</a>
18 |     </article>
19 |   </body>
20 | </html>
21 | 


--------------------------------------------------------------------------------
/test/examples/backcompat/hreview_hentry_with_rel_tag_bookmark.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 |   <head>
 4 |     <meta http-equiv="content-type" content="text/html; charset=utf-8">
 5 |     <title>Backcompat rel=tag and rel=bookmark on hreview and hentry</title>
 6 |   </head>
 7 |   <body>
 8 |     <article class="hreview hentry">
 9 |       <a rel="tag" href="https://example.com/tags/cat">rhinoceros</a>
10 |       <a rel="tag" href="https://example.com/tags/dog/">giraffe</a>
11 |       <a rel="tag" href="https://example.com/tags/mountain%20lion/">hyena</a>
12 |       <!-- throw some bad input at the parser -->
13 |       <a rel="tag"></a>
14 |       <a rel="tag" href=""></a>
15 |       <!-- throw some mf2 at the parser to ignore -->
16 |       <a rel="tag" class="p-category" href="https://example.com/tags/mouse">elephant</a>
17 |       <a rel="tag" class="u-category" href="https://example.com/tags/meerkat">dinosaur</a>
18 |       <a rel="bookmark" href="https://example.com/bookmark">rhinoceros</a>
19 |       <a rel="bookmark u-url" href="https://example.com/bookmark-url">rhinoceros</a>
20 |       <a rel="u-url" href="https://example.com/url">rhinoceros</a>
21 |     </article>
22 |   </body>
23 | </html>
24 | 


--------------------------------------------------------------------------------
/test/examples/backcompat/hreview_nested_card_event_product.html:
--------------------------------------------------------------------------------
 1 | <article class="hreview">
 2 |     <div class="item vevent">
 3 |         <span class="summary">event name</span>
 4 |         <a class="url" href="http://example.com/event-url">event url</a>
 5 |     </div>
 6 | 
 7 |     <div class="item vcard">
 8 |         <span class="fn">card name</span>
 9 |         <a class="url" href="http://example.com/card-url">card url</a>
10 |     </div>
11 | 
12 |     <div class="item hproduct">
13 |         <span class="fn">product name</span>
14 |         <a class="url" href="http://example.com/product-url">product url</a>
15 |     </div>
16 | </article>
17 | 


--------------------------------------------------------------------------------
/test/examples/backcompat/hreview_with_rel_tag_bookmark.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 |   <head>
 4 |     <meta http-equiv="content-type" content="text/html; charset=utf-8">
 5 |     <title>Backcompat rel=tag and rel=bookmark on hreview</title>
 6 |   </head>
 7 |   <body>
 8 |     <article class="hreview">
 9 |       <a rel="tag" href="https://example.com/tags/cat">rhinoceros</a>
10 |       <a rel="tag" href="https://example.com/tags/dog/">giraffe</a>
11 |       <a rel="tag" href="https://example.com/tags/mountain%20lion/">hyena</a>
12 |       <!-- throw some bad input at the parser -->
13 |       <a rel="tag"></a>
14 |       <a rel="tag" href=""></a>
15 |       <!-- throw some mf2 at the parser to ignore -->
16 |       <a rel="tag" class="p-category" href="https://example.com/tags/mouse">elephant</a>
17 |       <a rel="tag" class="u-category" href="https://example.com/tags/meerkat">dinosaur</a>
18 |       <a rel="bookmark" href="https://example.com/not-bookmark">rhinoceros</a>
19 |       <a rel="self bookmark" href="https://example.com/bookmark">rhinoceros</a>
20 |       <a rel="bookmark" href="https://example.com/not-bookmark">rhinoceros</a>
21 |       <a rel="self bookmark u-url" href="https://example.com/bookmark-url">rhinoceros</a>
22 |       <a rel="u-url" href="https://example.com/url">rhinoceros</a>
23 |     </article>
24 |   </body>
25 | </html>
26 | 


--------------------------------------------------------------------------------
/test/examples/backcompat/ignore_mf1_properties_in_mf2_root.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE HTML>
 2 | <html>
 3 | <head>
 4 | <title>Ignore mf2 properties for mf1 roots</title>
 5 | </head>
 6 | <body>
 7 | <div class="h-entry">
 8 | <span class="p-name">Correct name</span>
 9 | <span class="entry-title">Wrong name</span>
10 | 
11 | <span class="p-summary">Correct summary</span>
12 | <span class="entry-summary">Wrong summary</span>
13 | </div>
14 | </body>
15 | </html>
16 | 
17 | 


--------------------------------------------------------------------------------
/test/examples/backcompat/ignore_mf1_root_if_mf2_present.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE HTML>
 2 | <html>
 3 | <head>
 4 | <title>Ignore mf1 root class if mf2 root present test</title>
 5 | </head>
 6 | <body>
 7 | <div class="hentry h-event">
 8 |   <h1 class="p-name">My awesome event</h1>
 9 | <time class="dt-published dt-updated" datetime="2018-03-09T12:15:40-05:00">at 12:15 on 09 Mar 2018 (-0500)</time>
10 | <p class="p-summary">This will be an awesome event</p>
11 | </div>
12 | </body>
13 | </html>
14 | 
15 | 


--------------------------------------------------------------------------------
/test/examples/backcompat/ignore_mf2_properties_in_mf1_root.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE HTML>
 2 | <html>
 3 | <head>
 4 | <title>Ignore mf2 properties for mf1 roots</title>
 5 | </head>
 6 | <body>
 7 | <div class="hentry">
 8 | <span class="p-name">Wrong name</span>
 9 | <span class="entry-title">Correct name</span>
10 | 
11 | <span class="p-summary">Wrong summary</span>
12 | <span class="entry-summary">Correct summary</span>
13 | </div>
14 | </body>
15 | </html>
16 | 
17 | 


--------------------------------------------------------------------------------
/test/examples/backcompat/nested_mf1_in_mf2.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE HTML>
 2 | <html>
 3 | <head>
 4 | <title>Nested mf1 in mf2</title>
 5 | </head>
 6 | <body class="h-feed">
 7 | <div class="hentry">
 8 | <span class="entry-title">Correct name</span>
 9 | 
10 | <span class="entry-summary">Correct summary</span>
11 | </div>
12 | </body>
13 | </html>
14 | 
15 | 


--------------------------------------------------------------------------------
/test/examples/backcompat/nested_mf1_in_mf2_e_content.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE HTML>
 2 | <html>
 3 | <head>
 4 | <title>Nested mf1 in mf2 e-content</title>
 5 | </head>
 6 | <body class="h-entry">
 7 | <div class="e-content">
 8 | <div class="hentry">
 9 | <span class="entry-title">Correct name</span>
10 | 
11 | <span class="entry-summary">Correct summary</span>
12 | </div>
13 | </div>
14 | </body>
15 | </html>
16 | 
17 | 


--------------------------------------------------------------------------------
/test/examples/backcompat/nested_mf2_in_mf1.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE HTML>
 2 | <html>
 3 | <head>
 4 | <title>Nested mf2 in mf1</title>
 5 | </head>
 6 | <body class="hfeed">
 7 | <div class="h-entry">
 8 | <span class="p-name">Correct name</span>
 9 | 
10 | <span class="p-summary">Correct summary</span>
11 | </div>
12 | </body>
13 | </html>
14 | 
15 | 


--------------------------------------------------------------------------------
/test/examples/backcompat/no_implied_properties_mf1_root.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE HTML>
 2 | <html>
 3 | <head>
 4 | <title>No implied properties for mf1 roots</title>
 5 | </head>
 6 | <body>
 7 | <div class="hentry">
 8 | This should not be in the name property
 9 | <a href="http://example.com">http://example.com should not be in url property</a>
10 | <img src="http://example.com/photo">http://example.com/photo should not be in photo property
11 | </div>
12 | </body>
13 | </html>
14 | 
15 | 


--------------------------------------------------------------------------------
/test/examples/base.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 | <head>
 4 |   <meta http-equiv="content-type" content="text/html; charset=utf-8">
 5 |   <title>Hello World</title>
 6 |   <base href="http://tantek.com/" />
 7 | </head>
 8 | <body>
 9 |   <p>Hello world!</p>  
10 | </body>
11 | </html>
12 | 


--------------------------------------------------------------------------------
/test/examples/broken_url.html:
--------------------------------------------------------------------------------
 1 | <html>
 2 | <head>
 3 |   <title>Hello World</title>
 4 | </head>
 5 | <body>
 6 | <article class="h-entry">
 7 |     <h1><a  class="p-name">urls with broken domains</a></h1>
 8 |     <a class="u-url" href="http://www.[w3.org/">Should not change: http://www.[w3.org/</a>
 9 |     <a class="u-relative" href="foo.html">Should be relative to base url</a>
10 |     <img src="http://www.w3].org/20[08/site/images/logo-w3c-mobile-lg" class="u-photo">
11 | </article>
12 | </body>
13 | 


--------------------------------------------------------------------------------
/test/examples/class_names_format.html:
--------------------------------------------------------------------------------
1 | <article class="h-x-test h-p3k-entry h-feed h-Entry h-p3k-fEed h--d h-test-">
2 |     <a class="u-url u-Url u-p3k-url u--url u-url-" href="example.com" >URL </a>
3 |     <span class="p-name p-nAme p-p3k-name p--name p-name-" >name</span>
4 | </article>
5 | 
6 | 


--------------------------------------------------------------------------------
/test/examples/complex_e_content.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 | <head>
 4 |   <meta http-equiv="content-type" content="text/html; charset=utf-8">
 5 |   <title>Complex e-content test</title>
 6 | </head>
 7 | <body>
 8 | <div class="h-entry">
 9 | <div class="h-card e-content"><p>Hello</p></div></div>
10 | </body>
11 | </html>
12 | 


--------------------------------------------------------------------------------
/test/examples/datetimes.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html>
  2 | <html>
  3 | <head>
  4 |  <meta http-equiv="content-type" content="text/html; charset=utf-8">
  5 |  <title>Hello World</title>
  6 | </head>
  7 | <body>
  8 |  <div class="h-event">
  9 |   <h1 class="p-name">Microformats Hootenanny</h1>
 10 |   Arrive <span class="dt-start">2014-01-01T12:00:00+00:00</span>, stay until
 11 |   <time class="dt-end" datetime="3014-01-01T18:00:00+00:00">six pee em GMT on the first day of the first month, year three thousand, four and ten.</time>
 12 | 
 13 |   That’s right, the microformats <del class="dt-updated" datetime="2011-08-26T00:01:21+00:00">party</del><ins class="dt-updated" datetime="2011-08-26T00:01:21+00:00">hootenanny</ins> lasts for <abbr class="dt-duration" title="P1000Y">ONE THOUSAND YEARS</abbr>.
 14 |  </div>
 15 | 
 16 |  <div class="h-entry">
 17 |    <span class="dt-published">
 18 |      <img class="value" alt="3014-01-01" src="onethousandyears.png"/> at precisely <data class="value" value="01:21Z">1:21 am</data>
 19 |    </span>
 20 |  </div>
 21 | 
 22 |  <div class="h-entry">
 23 |    <span class="dt-updated">
 24 |      <abbr class="value" title="2014-03-11 09:55">3/11 9:55</abbr>
 25 |    </span>
 26 |  </div>
 27 | 
 28 |  <div class="h-entry">
 29 |    <!-- a la Tantek -->
 30 |    <span class="dt-published published dt-updated updated">
 31 |      <time class="value">3:28pm</time> on <time class="value">2014-01-30</time>
 32 |    </span>
 33 |  </div>
 34 | 
 35 |  <div class="h-entry">
 36 |    <span class="dt-published">
 37 |      <b class="value">9999-01-14T11:52+08:00</b>
 38 |    </span>
 39 |  </div>
 40 | 
 41 |  <div class="h-entry">
 42 |    <!-- wiki event with UTC offset -->
 43 |    <span class="dt-published">
 44 |      <span class="value" title="June 1, 2014">2014-06-01</span>
 45 |      <span class="value" title="12:30">12:30<span style="display: none;">-06:00</span></span>
 46 |    </span>
 47 |  </div>
 48 | 
 49 |  <div class="h-event">
 50 |    <h1 class="p-name">Implied Date wo Timezone</h1>
 51 | 
 52 |    This test case and the next are courtesy of event templates on
 53 |    http://indiewebcamp.com/User:Gregorlove.com/sandbox
 54 | 
 55 |    <p> When:
 56 |      <span class="dt-start">
 57 |        <span class="value" title="May 21, 2014">2014-05-21</span>
 58 |        <span class="value" title="18:30">18:30</span>
 59 |        –
 60 |        <span class="dt-end">19:30</span>
 61 |      </span> (local time)
 62 |    </p>
 63 |  </div>
 64 | 
 65 |  <div class="h-event">
 66 |    <h1 class="p-name">Implied Date w/ Timezone</h1>
 67 | 
 68 |    <p> When:
 69 |      <span class="dt-start">
 70 |        <span class="value" title="June 1, 2014">2014-06-01</span>
 71 |        <span class="value" title="12:30">12:30<span style="display: none;">-06:00</span></span>
 72 |        –
 73 |        <span class="dt-end">19:30<span style="display: none;">-06:00</span></span>
 74 |      </span> (-06:00 <abbr>UTC</abbr>)
 75 |    </p>
 76 | 
 77 |  </div>
 78 | 
 79 |  <div class="h-event">
 80 |    <h1 class="p-name">Implied Date w/ separate Timezone</h1>
 81 | 
 82 |    <p> When:
 83 |      <span class="dt-start">
 84 |        <span class="value" title="June 1, 2014">2014-06-01</span>
 85 |        <span class="value" title="12:30">12:30</span>
 86 |        (UTC<span class="value">-06:00</span>)
 87 |        –
 88 |        <span class="dt-end">19:30</span>
 89 |    </p>
 90 | 
 91 |  </div>
 92 | 
 93 |  <div class="h-event">
 94 |    <h1 class="p-name">Multiple date and time values</h1>
 95 | 
 96 |    <p> When:
 97 |      <span class="dt-start">
 98 |        <span class="value" title="June 1, 2014">2014-06-01</span>
 99 |        <span class="value" title="June 1, 3014">3014-06-01</span>
100 |        <span class="value" title="12:30">12:30</span>
101 |        (UTC<span class="value">-06:00</span>)
102 |        <span class="value" title="23:00">23:00</span>
103 |        (UTC<span class="value">+01:00</span>)
104 |        –
105 |        <span class="dt-end">19:30</span>
106 |    </p>
107 | 
108 |  </div>
109 | 
110 |  <div class="h-event">
111 |    <h1 class="p-name">AM PM conversion</h1>
112 | 
113 |    <p> When:
114 |      <span class="dt-start">
115 |        <span class="value" title="June 1, 2014">2014-06-01</span>
116 |        <span class="value">12:30am</span>
117 |        (UTC<span class="value">-06:00</span>)
118 |        –
119 |        <span class="dt-end">12:15p.m.</span>
120 |    </p>
121 | 
122 |    <p> When:
123 |      <span class="dt-start">
124 |        <span class="value" title="June 1, 2014">2014-06-01</span>
125 |        <span class="value">12:30AM</span>
126 |        (UTC<span class="value">-06:00</span>)
127 |        –
128 |        <span class="dt-end">12:15P.M.</span>
129 |    </p>
130 |  </div>
131 | 
132 |  <div class="h-event">
133 |    <h1 class="p-name">Ordinal date</h1>
134 | 
135 |    <p> When:
136 |      <span class="dt-start">
137 |        <span class="value">2016-062</span>
138 |        <span class="value">12:30AM</span>
139 |        (UTC<span class="value">-06:00</span>)
140 |    </p>
141 |  </div>
142 | 
143 |  <div class="h-entry">
144 |    <!-- wiki event with UTC offset without leading zero -->
145 |    <span class="dt-start">
146 |      <span class="value" title="June 1, 2014">2014-06-01</span>
147 |      <span class="value" title="12:30">12:30<span style="display: none;">-6:00</span></span>
148 |    </span>
149 |  </div>
150 | 
151 |  <div class="h-entry">
152 |    <!-- wiki event with positive UTC offset without leading zero-->
153 |    <span class="dt-start">
154 |      <span class="value" title="June 1, 2014">2014-06-01</span>
155 |      <span class="value" title="12:30">12:30<span style="display: none;">+6:00</span></span>
156 |    </span>
157 |  </div>
158 | 
159 | <div class="h-entry">
160 |    <!-- wiki event with Zulu time-->
161 |    <span class="dt-start">
162 |      <span class="value" title="June 1, 2014">2014-06-01</span>
163 |      <span class="value" title="12:30">12:30<span style="display: none;">Z</span></span>
164 |    </span>
165 |  </div>
166 | 
167 |  <div class="h-entry">
168 |    <!-- with UTC offset without leading zero in extra value part -->
169 |    <span class="dt-start">
170 |      <span class="value" title="June 1, 2014">2014-06-01</span>
171 |      <span class="value" title="12:30">12:30</span><span class="value">-6:00</span></span>
172 |    </span>
173 |  </div>
174 | 
175 | 
176 | </body>
177 | </html>
178 | 


--------------------------------------------------------------------------------
/test/examples/embedded.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 | <head>
 4 |   <meta http-equiv="content-type" content="text/html; charset=utf-8">
 5 |   <title>Hello World</title>
 6 | </head>
 7 | <body>
 8 |  <div class="h-entry">
 9 |   <h1 class="p-name">A post with embedded markup in</h1>
10 |   
11 |   <div class="e-content">
12 |    <p>Blah blah blah blah blah.</p>
13 |    <p>Blah.</p>
14 |    <p>Blah blah blah.</p>
15 |   </div>
16 |  </div>
17 | </body>
18 | </html>
19 | 


--------------------------------------------------------------------------------
/test/examples/empty.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 | <head>
 4 |   <meta http-equiv="content-type" content="text/html; charset=utf-8">
 5 |   <title>Hello World</title>
 6 | </head>
 7 | <body>
 8 |   <p>Hello world!</p>  
 9 | </body>
10 | </html>
11 | 


--------------------------------------------------------------------------------
/test/examples/eras.html:
--------------------------------------------------------------------------------
 1 | <!doctype html>
 2 | <html lang=en-us>
 3 | <head>
 4 | <base href=https://example.com>
 5 | <link rel=webmention href=/mentions>
 6 | </head>
 7 | <body>
 8 | <div class=h-entry>
 9 |   <h1 class=p-name>Excited for the Taylor Swift Eras Tour</h1>
10 |   <p>Published by <a class="p-author h-card" href=/>James</a>
11 |     <time class=dt-published datetime=2023-11-30T19:08:09>Thursday, November 30</time></p>
12 |   <div><img class=u-featured alt="Eras tour poster" src=/eras.jpg></div>
13 |   <div class=e-content><p>I can't decide which era is my favorite.</p></div>
14 |   <p><span class=p-category>music</span>, <span class=p-category>Taylor Swift</span></p>
15 | </div>
16 | </body>
17 | </html>
18 | 


--------------------------------------------------------------------------------
/test/examples/festivus.html:
--------------------------------------------------------------------------------
 1 | <!doctype html>
 2 | <h2 class=h-card>Jerry</h2>
 3 | <p class=h-entry><q class=p-content>Happy Festivus!</q></p>
 4 | <h2 class=h-card>Frank</h2>
 5 | <p class=h-entry><q class=p-content>It's time for the Festivus feats of
 6 | strength.</q></p>
 7 | <p class=h-entry><q class=p-content>The tradition of Festivus begins with
 8 | the airing of grievances.</q></p>
 9 | <h2 class=h-card>Cosmo</h2>
10 | <p class=h-entry><q class=p-content>It's a Festivus miracle!</q></p>
11 | 


--------------------------------------------------------------------------------
/test/examples/filter_roots.html:
--------------------------------------------------------------------------------
 1 | <h2>Tailwind root filter</h2>
 2 | <div class=h-card>fnord</div>
 3 | <div class=h-auto>fnord</div>
 4 | <div class=h-fit>fnord</div>
 5 | <div class=h-full>fnord</div>
 6 | <div class=h-max>fnord</div>
 7 | <div class=h-min>fnord</div>
 8 | <div class=h-px>fnord</div>
 9 | <div class=h-screen>fnord</div>
10 | 


--------------------------------------------------------------------------------
/test/examples/filter_roots_custom.html:
--------------------------------------------------------------------------------
1 | <h2>Custom root filter</h2>
2 | <div class=h-card>fnord</div>
3 | <div class=h-foo>fnord</div>
4 | <div class=h-bar>fnord</div>
5 | <div class=h-bat>fnord</div>
6 | <div class=h-baz>fnord</div>
7 | 


--------------------------------------------------------------------------------
/test/examples/hcard_with_empty_url.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 |   <head>
 4 |     <title></title>
 5 |   </head>
 6 |   <body>
 7 |     <div class="h-card">
 8 |       <span class="p-name">Cheri Oteri</span>
 9 |       <a class="u-url" href="">Blank URL</a>
10 |     </div>
11 |     <div class="h-card">
12 |       <span class="p-name">Maya Rudolph</span>
13 |       <a class="u-url" href=""></a>
14 |     </div>
15 |   </body>
16 | </html>
17 | 


--------------------------------------------------------------------------------
/test/examples/hfeed_on_html_tag.html:
--------------------------------------------------------------------------------
 1 | <html class="h-feed" lang="en">
 2 |   <head>
 3 |     <title>html tag with class h-feed</title>
 4 |   </head>
 5 |   <body>
 6 | 
 7 |     <article class="h-entry">
 8 |       <p class="name">entry1</p>
 9 |     </article>
10 | 
11 |     <article class="h-entry">
12 |       <p class="name">entry2</p>
13 |     </article>
14 | 
15 |   </body>
16 | </html>
17 | 


--------------------------------------------------------------------------------
/test/examples/img_with_alt.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 | <head>
 4 |   <meta http-equiv="content-type" content="text/html; charset=utf-8">
 5 |   <title>Photo with alt attribute</title>
 6 | </head>
 7 | <body>
 8 | <article class="h-entry">
 9 |   <span class="p-name">name</span>
10 |   <img class="u-photo" src="/photo.jpg"/>
11 | </article>
12 | 
13 | <article class="h-entry">
14 |   <span class="p-name">name</span>
15 |   <img class="u-url" src="/photo.jpg" alt="alt text" />
16 | </article>
17 | 
18 | <article class="h-entry">
19 |   <span class="p-name">name</span>
20 |   <img class="u-in-reply-to" src="/photo.jpg" alt="" />
21 | </article>
22 | 
23 | <article class="h-entry">
24 |   <span class="p-name">name</span>
25 |   <img class="u-in-reply-to h-cite" src="/photo.jpg"/>
26 | </article>
27 | 
28 | <article class="h-entry">
29 |   <span class="p-name">name</span>
30 |   <img class="u-in-reply-to h-cite" src="/photo.jpg" alt="alt text" />
31 | </article>
32 | 
33 | <article class="h-entry">
34 |   <span class="p-name">name</span>
35 |   <img class="u-in-reply-to h-cite" src="/photo.jpg" alt="" />
36 | </article>
37 | </body>
38 | </html>
39 | 


--------------------------------------------------------------------------------
/test/examples/img_with_srcset.html:
--------------------------------------------------------------------------------
 1 | <div class="h-entry">
 2 |   <img class="u-photo"
 3 |        srcset="elva-fairy-480w.jpg 480w,
 4 |                elva-fairy-800w.jpg 800w"
 5 |        sizes="(max-width: 600px) 480px,
 6 |               800px"
 7 |        src="elva-fairy-800w.jpg"
 8 |        alt="Elva dressed as a fairy">
 9 | </div>
10 | 
11 | <div class="h-entry">
12 |   <img class="u-photo"
13 |        srcset="elva-fairy-320w.jpg,
14 |                elva-fairy-480w.jpg 1.5x,
15 |                elva-fairy-640w.jpg 2x,
16 |                elva-fairy-1.5w.jpg 1.5x,
17 |                elva-fairy-2w.jpg 2x"
18 |        src="elva-fairy-320w.jpg"
19 |        alt="Elva dressed as a fairy">
20 | </div>
21 | 
22 | <div class="h-entry">
23 |   <img class="u-photo"
24 |        srcset="elva-fairy,320w.jpg, elva-fairy,480w.jpg 1.5x"
25 |        src="elva-fairy-320w.jpg"
26 |        alt="Elva dressed as a fairy">
27 | </div>
28 | 
29 | <div class="h-entry">
30 |   <img class="u-photo"
31 |        srcset="elva-fairy,320w.jpg ,elva-fairy,480w.jpg 1.5x"
32 |        src="elva-fairy-320w.jpg"
33 |        alt="Elva dressed as a fairy">
34 | </div>
35 | 
36 | <div class="h-entry">
37 |   <img class="u-photo"
38 |        srcset="elva-fairy,320w.jpg 1x,elva-fairy,480w.jpg 1.5x"
39 |        src="elva-fairy-320w.jpg"
40 |        alt="Elva dressed as a fairy">
41 | </div>
42 | 
43 | <div class="h-entry">
44 |   <img class="u-photo"
45 |        srcset="elva-fairy,320w.jpg 1x ,elva-fairy,480w.jpg 1.5x"
46 |        src="elva-fairy-320w.jpg"
47 |        alt="Elva dressed as a fairy">
48 | </div>
49 | 
50 | <div class="h-entry">
51 |   <img class="u-photo"
52 |        srcset="elva-fairy,320w.jpg 1x , elva-fairy,480w.jpg 1.5x"
53 |        src="elva-fairy-320w.jpg"
54 |        alt="Elva dressed as a fairy">
55 | </div>
56 | 
57 | <div class="h-entry">
58 |   <img class="u-photo"
59 |        srcset="elva-fairy,320w.jpg"
60 |        src="elva-fairy-320w.jpg"
61 |        alt="Elva dressed as a fairy">
62 | </div>
63 | 
64 | <div class="h-entry">
65 |   <img class="u-photo"
66 |        srcset="
67 |            elva-fairy,320w.jpg,
68 |                elva-fairy,480w.jpg
69 |        1.5x,
70 |                elva-fairy,640w.jpg 2x   ,
71 | 
72 |                elva-fairy,1.5w.jpg     1.5x     ,
73 |                elva-fairy,2w.jpg 2x
74 |        "
75 |        src="elva-fairy-320w.jpg"
76 |        alt="Elva dressed as a fairy">
77 | </div>
78 | 


--------------------------------------------------------------------------------
/test/examples/img_with_srcset_with_base.html:
--------------------------------------------------------------------------------
 1 | <base href="https://example.com">
 2 | 
 3 | <div class="h-entry">
 4 |   <img class="u-photo"
 5 |        srcset="elva-fairy-480w.jpg 480w,
 6 |                elva-fairy-800w.jpg 800w"
 7 |        sizes="(max-width: 600px) 480px,
 8 |               800px"
 9 |        src="elva-fairy-800w.jpg"
10 |        alt="Elva dressed as a fairy">
11 | </div>
12 | 


--------------------------------------------------------------------------------
/test/examples/implied_properties/implied_name_alt.html:
--------------------------------------------------------------------------------
1 | <article class="h-entry">
2 |   <div class="author h-card">
3 |     <img alt="Avatar of" class="u-photo" src="avatar.jpg" /> <span>Stephen</span>
4 |   </div>
5 | </article>
6 | 


--------------------------------------------------------------------------------
/test/examples/implied_properties/implied_name_empty_alt.html:
--------------------------------------------------------------------------------
1 | <a class="h-card" href="https://twitter.com/kylewmahan">
2 |   <img src="https://example.org/test.jpg" alt="">
3 |   @kylewmahan
4 | </a>
5 | 


--------------------------------------------------------------------------------
/test/examples/implied_properties/implied_photo.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html>
  2 | <html>
  3 | <head>
  4 |   <meta http-equiv="content-type" content="text/html; charset=utf-8">
  5 |   <title>Implied Photo</title>
  6 | </head>
  7 | <body>
  8 | 
  9 | <!--if img.h-x[src], then use src for photo-->
 10 | <img class="h-entry" src="http://example.com/photo.jpg"/>
 11 | 
 12 | <!--else if object.h-x[data] then use data for photo-->
 13 | <object class="h-entry" data="http://example.com/photo.jpg"></object>
 14 | 
 15 | <!--else if .h-x>img[src]:only-of-type:not[.h-*] then use that img src for photo-->
 16 | <article class="h-entry">
 17 |     <img src="http://example.com/photo.jpg"/>
 18 | </article>
 19 | 
 20 | <article class="h-entry">
 21 |     <img src="http://example.com/photo.jpg"/>
 22 |     <span> test </span>
 23 | </article>
 24 | 
 25 | <!--else if .h-x>object[data]:only-of-type:not[.h-*] then use that object’s data for photo-->
 26 | <article class="h-entry">
 27 |     <object data="http://example.com/photo.jpg"></object>
 28 | </article>
 29 | 
 30 | <article class="h-entry">
 31 |     <object data="http://example.com/photo.jpg"></object>
 32 |     <span> test </span>
 33 | </article>
 34 | 
 35 | <!--else if .h-x>:only-child:not[.h-*]>img[src]:only-of-type:not[.h-*], then use that img’s src for photo-->
 36 | <article class="h-entry">
 37 |     <div>
 38 |         <img src="http://example.com/photo.jpg"/>
 39 |     </div>
 40 | </article>
 41 | 
 42 | <article class="h-entry">
 43 |     <div>
 44 |         <img src="http://example.com/photo.jpg"/>
 45 |         <span>test</span>
 46 |     </div>
 47 | </article>
 48 | 
 49 | <!--else if .h-x>:only-child:not[.h-*]>object[data]:only-of-type:not[.h-*], then use that object’s data for photo -->
 50 | 
 51 | <article class="h-entry">
 52 |     <div>
 53 |         <object data="http://example.com/photo.jpg"></object>
 54 |     </div>
 55 | </article>
 56 | 
 57 | <article class="h-entry">
 58 |     <div>
 59 |         <object data="http://example.com/photo.jpg"></object>
 60 |         <span>test</span>
 61 |     </div>
 62 | </article>
 63 | 
 64 | <!--priority tests; prefer img over object-->
 65 | 
 66 | <article class="h-entry">
 67 |     <object data="http://example.com/photo2.jpg"></object>
 68 |     <img src="http://example.com/photo.jpg"/>
 69 | </article>
 70 | 
 71 | <article class="h-entry">
 72 |     <div>
 73 |         <object data="http://example.com/photo2.jpg"></object>
 74 |         <img src="http://example.com/photo.jpg"/>
 75 |     </div>
 76 | </article>
 77 | 
 78 | <!-- no photo tests -->
 79 | <article class="h-entry">
 80 |     <img src="http://example.com/photo.jpg"/>
 81 |     <img src="http://example.com/photo2.jpg"/>
 82 | </article>
 83 | 
 84 | <article class="h-entry">
 85 |     <img class="h-cite" src="http://example.com/photo.jpg"/>
 86 | </article>
 87 | 
 88 | <article class="h-entry">
 89 |     <object data="http://example.com/photo.jpg"></object>
 90 |     <object data="http://example.com/photo2.jpg"></object>
 91 | </article>
 92 | 
 93 | <article class="h-entry">
 94 |     <object class="h-cite" data="http://example.com/photo.jpg"></object>
 95 | </article>
 96 | 
 97 | 
 98 | <article class="h-entry">
 99 |     <div>
100 |         <img src="http://example.com/photo.jpg"/>
101 |     </div>
102 |     <span>test</span>
103 | </article>
104 | 
105 | <article class="h-entry">
106 |     <div class="h-cite">
107 |         <img src="http://example.com/photo.jpg"/>
108 |     </div>
109 | </article>
110 | 
111 | <article class="h-entry">
112 |     <div>
113 |         <img src="http://example.com/photo.jpg"/>
114 |         <img src="http://example.com/photo2.jpg"/>
115 |     </div>
116 | </article>
117 | 
118 | <article class="h-entry">
119 |     <div class="h-cite">
120 |         <img class="h-cite" src="http://example.com/photo.jpg"/>
121 |     </div>
122 | </article>
123 | 
124 | <article class="h-entry">
125 |     <div>
126 |         <object data="http://example.com/photo.jpg"></object>
127 |     </div>
128 |     <span>test</span>
129 | </article>
130 | 
131 | <article class="h-entry">
132 |     <div class="h-cite">
133 |         <object data="http://example.com/photo.jpg"></object>
134 |     </div>
135 | </article>
136 | 
137 | <article class="h-entry">
138 |     <div>
139 |         <object data="http://example.com/photo.jpg"></object>
140 |         <object data="http://example.com/photo2.jpg"></object>
141 |     </div>
142 | </article>
143 | 
144 | </body>
145 | </html>
146 | 


--------------------------------------------------------------------------------
/test/examples/implied_properties/implied_photo_relative_url.html:
--------------------------------------------------------------------------------
1 | <html>
2 | <base href="http://example.com">
3 | <img class="h-card" alt="Jane Doe" src="jane-img.jpeg">
4 | <object class="h-card" data="jane-object.jpeg">Jane Doe</object>
5 | </html>
6 | 


--------------------------------------------------------------------------------
/test/examples/implied_properties/implied_properties.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 | <head>
 4 |   <meta http-equiv="content-type" content="text/html; charset=utf-8">
 5 |   <title>Implied Properties</title>
 6 | </head>
 7 | <body>
 8 | 	<span class="h-card">Tom Morris</span>
 9 | 
10 | 	<a class="h-card" href="http://tommorris.org/">Tom Morris</a>
11 | 
12 | 	<a class="h-card" href="http://tommorris.org/"><img src="http://tommorris.org/photo.png" alt="" />Tom Morris</a>
13 | 
14 | 	<a class="h-card" href="http://tommorris.org/"><img src="http://tommorris.org/photo.png"/>Tom Morris</a>
15 | 
16 | 	<a class="h-card" href="http://tommorris.org/"><img src="http://tommorris.org/photo.png" alt="Tom Morris" /></a>
17 | 
18 | 	<img class="h-card" src="http://tommorris.org/photo.png" alt="Tom Morris" />
19 | 
20 |         <!-- Make sure "" is interpreted as a relative URL -->
21 | 	<a class="h-card" href=""><img src="" />Tom Morris</a>
22 | </body>
23 | </html>
24 | 


--------------------------------------------------------------------------------
/test/examples/implied_properties/implied_properties_silo_pub.html:
--------------------------------------------------------------------------------
 1 | 
 2 | <!DOCTYPE html>
 3 | <html>
 4 |     <head>
 5 |         <meta charset="utf-8">
 6 |         <link rel="authorization_endpoint" href="https://silo.pub/indieauth">
 7 |         <link rel="token_endpoint" href="https://silo.pub/token">
 8 |         <link rel="micropub" href="https://silo.pub/micropub">
 9 |         <link rel="me" href="https://twitter.com/kylewmahan">
10 |         <link href="https://abs.twimg.com/favicons/favicon.ico" rel="shortcut icon">
11 |         <style>img { max-height: 4em; }</style>
12 |     </head>
13 |     <body>
14 |         Micropub proxy for
15 |         <div class="h-x-syndication-target">
16 |           <a class="p-user h-card" href="https://twitter.com/kylewmahan">
17 |             <img src="https://pbs.twimg.com/profile_images/641457114381074432/vUdKopH8_normal.jpg" alt="" />
18 |             @kylewmahan
19 |           </a>
20 |           on
21 |           <a class="p-service h-card" href="https://twitter.com/">
22 |             <img src="https://abs.twimg.com/favicons/favicon.ico" alt="" />
23 |             Twitter
24 |           </a>
25 |         </div>
26 |     </body>
27 | </html>
28 | 


--------------------------------------------------------------------------------
/test/examples/implied_properties/implied_relative_datetimes.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 |   <head>
 4 |     <meta http-equiv="content-type" content="text/html; charset=utf-8">
 5 |     <title>Implied_relative_datetimes</title>
 6 |   </head>
 7 |   <body>
 8 |     <article class="h-entry">
 9 |       <div class="e-content">This is a post.</div> 
10 |       <time class="dt-published" datetime="2015-01-02 03:04:05Z">2015-01-02 03:04</time>
11 |       (updated: <time class="dt-updated">05:06</time>)
12 |     </article>
13 |     <p>
14 |       Explanation: this is to test for the behaviour described
15 |       <a href="http://microformats.org/wiki/value-class-pattern#microformats2_parsers_implied_date">here</a>.
16 |     </p>
17 |   </body>
18 | </html>
19 | 


--------------------------------------------------------------------------------
/test/examples/implied_properties/implied_url.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html>
  2 | <html>
  3 | <head>
  4 |   <meta http-equiv="content-type" content="text/html; charset=utf-8">
  5 |   <title>Implied Photo</title>
  6 | </head>
  7 | <body>
  8 | 
  9 | <!--if a.h-x[href] or area.h-x[href] then use that [href] for url-->
 10 | <a class="h-entry" href="http://example.com">this be link</a>
 11 | 
 12 | <area class="h-entry" href="http://example.com"/>
 13 | 
 14 | <!--else if .h-x>a[href]:only-of-type:not[.h-*], then use that [href] for url-->
 15 | <article class="h-entry">
 16 |     <a href="http://example.com">this be link</a>
 17 | </article>
 18 | 
 19 | <article class="h-entry">
 20 |     <a href="http://example.com">this be link</a>
 21 |     <span> test </span>
 22 | </article>
 23 | 
 24 | <!--else if .h-x>area[href]:only-of-type:not[.h-*], then use that [href] for url-->
 25 | <article class="h-entry">
 26 |     <area href="http://example.com"/>
 27 | </article>
 28 | 
 29 | <article class="h-entry">
 30 |     <area href="http://example.com"/>
 31 |     <span> test </span>
 32 | </article>
 33 | 
 34 | <!--else if .h-x>:only-child:not[.h-*]>a[href]:only-of-type:not[.h-*], then use that [href] for url-->
 35 | <article class="h-entry">
 36 |     <div>
 37 |         <a href="http://example.com">this be link</a>
 38 |     </div>
 39 | </article>
 40 | 
 41 | <article class="h-entry">
 42 |     <div>
 43 |         <a href="http://example.com">this be link</a>
 44 |         <span>test</span>
 45 |     </div>
 46 | </article>
 47 | 
 48 | <!--else if .h-x>:only-child:not[.h-*]>area[href]:only-of-type:not[.h-*], then use that [href] for url -->
 49 | <article class="h-entry">
 50 |     <div>
 51 |         <area href="http://example.com"/>
 52 |     </div>
 53 | </article>
 54 | 
 55 | <article class="h-entry">
 56 |     <div>
 57 |         <area href="http://example.com"/>
 58 |         <span>test</span>
 59 |     </div>
 60 | </article>
 61 | 
 62 | <!--priority tests; prefer <a> over <area>-->
 63 | 
 64 | <article class="h-entry">
 65 |     <area href="http://example.com/not"/>
 66 |     <a href="http://example.com">this be link</a>
 67 | </article>
 68 | 
 69 | <article class="h-entry">
 70 |     <div>
 71 |         <area href="http://example.com/not"/>
 72 |         <a href="http://example.com">this be link</a>
 73 |     </div>
 74 | </article>
 75 | 
 76 | <!-- no url tests -->
 77 | <article class="h-entry">
 78 |     <a href="http://example.com">this be link</a>
 79 |     <a href="http://example.com/2">this be link</a>
 80 | </article>
 81 | 
 82 | <article class="h-entry">
 83 |     <a class="h-cite" href="http://example.com">this be link</a>
 84 | </article>
 85 | 
 86 | <article class="h-entry">
 87 |     <area href="http://example.com"/>
 88 |     <area href="http://example.com/2"/>
 89 | </article>
 90 | 
 91 | <article class="h-entry">
 92 |     <area class="h-cite" href="http://example.com"/>
 93 | </article>
 94 | 
 95 | 
 96 | <article class="h-entry">
 97 |     <div>
 98 |         <a href="http://example.com">this be link</a>
 99 |     </div>
100 |     <span>test</span>
101 | </article>
102 | 
103 | <article class="h-entry">
104 |     <div class="h-cite">
105 |         <a href="http://example.com">this be link</a>
106 |     </div>
107 | </article>
108 | 
109 | <article class="h-entry">
110 |     <div>
111 |         <a href="http://example.com">this be link</a>
112 |         <a href="http://example.com/2">this be link</a>
113 |     </div>
114 | </article>
115 | 
116 | <article class="h-entry">
117 |     <div class="h-cite">
118 |         <a class="h-cite" href="http://example.com">this be link</a>
119 |     </div>
120 | </article>
121 | 
122 | <article class="h-entry">
123 |     <div>
124 |         <area href="http://example.com"/>
125 |     </div>
126 |     <span>test</span>
127 | </article>
128 | 
129 | <article class="h-entry">
130 |     <div class="h-cite">
131 |         <area href="http://example.com"/>
132 |     </div>
133 | </article>
134 | 
135 | <article class="h-entry">
136 |     <div>
137 |         <area href="http://example.com"/>
138 |         <area href="http://example.com/2"/>
139 |     </div>
140 | </article>
141 | 
142 | 
143 | </body>
144 | </html>
145 | 


--------------------------------------------------------------------------------
/test/examples/implied_properties/simple_person_reference_implied.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 | <head>
 4 |   <meta http-equiv="content-type" content="text/html; charset=utf-8">
 5 |   <title>Simple_person_reference</title>
 6 | </head>
 7 | <body>
 8 |   <span class="h-card">Frances Berriman</span>
 9 | </body>
10 | </html>
11 | 


--------------------------------------------------------------------------------
/test/examples/implied_properties/stop_implied_name_e_content.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 | <head>
 4 |   <meta http-equiv="content-type" content="text/html; charset=utf-8">
 5 |   <title>Stop implied name due to e-content</title>
 6 | </head>
 7 | <body>
 8 | <article class="h-entry">
 9 |   <div class="e-content">
10 |     <p>Wanted content.</p>
11 |   </div>
12 |   <footer>
13 |     <p>Footer to be ignored.</p>
14 |   </footer>
15 | </article>
16 | </body>
17 | </html>
18 | 


--------------------------------------------------------------------------------
/test/examples/implied_properties/stop_implied_name_nested_h.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 | <head>
 4 |   <meta http-equiv="content-type" content="text/html; charset=utf-8">
 5 |   <title>Stop implied name due to embedded h-*</title>
 6 | </head>
 7 | <body>
 8 | <article class="h-entry">
 9 |   <div class="u-like-of h-cite">
10 |     <p>I really like <a class="p-name u-url" href="http://microformats.org/">Microformats</a></p>
11 |   </div>
12 |   <footer>
13 |     <p>Footer to be ignored.</p>
14 |   </footer>
15 | </article>
16 | </body>
17 | </html>
18 | 


--------------------------------------------------------------------------------
/test/examples/implied_properties/stop_implied_name_p_content.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 | <head>
 4 |   <meta http-equiv="content-type" content="text/html; charset=utf-8">
 5 |   <title>Stop implied name due to p-content</title>
 6 | </head>
 7 | <body>
 8 | <article class="h-entry">
 9 |   <div class="p-content">
10 |     <p>Wanted content.</p>
11 |   </div>
12 |   <footer>
13 |     <p>Footer to be ignored.</p>
14 |   </footer>
15 | </article>
16 | </body>
17 | </html>
18 | 


--------------------------------------------------------------------------------
/test/examples/implied_properties/stop_implied_url.html:
--------------------------------------------------------------------------------
 1 | <h1>real world example</h1>
 2 | <article class="post h-entry">
 3 |   <h1 class="p-name"><a href="/Waar-te-beginnen-met-Webmentions/">Waar te beginnen met Webmentions</a></h1>
 4 |   <div class="entry e-content">
 5 |     <p>Er zijn van die momenten dat ik het liefste de hele dag ga zitten puzzelen hoe ik nu de <a href="/webmentions/">webmentions</a> op deze site in orde moet maken. Het loopt allemaal nog steeds niet lekker, maar ik weet niet goed welke kant ik op moet denken en werken voor een oplossing.</p>
 6 |     <p>Waar loop ik nog tegen aan?</p>
 7 |     <p>...</p>    
 8 |   </div>
 9 |   <a href="https://news.indieweb.org/nl" class="u-syndication"></a>
10 | </article>
11 | 
12 | <h1>synthetic test cases</h1>
13 | 
14 | <article class="h-entry">
15 |   <h1>u- on only link stops implied url</h1>
16 |   <a href="http://example.com/" class="u-property"></a>
17 | </article>
18 | 
19 | <article class="h-entry">
20 |   <h1><a href="http://example.com/post" class="u-property">u- on second link stops implied url</a></h1>
21 |   <a href="http://example.com/"></a>
22 | </article>
23 | 
24 | <article class="h-entry">
25 |   <h1 class="p-property h-entry">nested object in property stops u-url parsing</h1>
26 |   <a href="http://example.com/"></a>
27 | </article>
28 | 
29 | <article class="h-entry">
30 |   <h1 class="h-entry">nested child object stops u-url parsing</h1>
31 |   <a href="http://example.com/"></a>
32 | </article>
33 | 
34 | <article class="h-entry">
35 |   <div><span><h1 class="h-entry">deeper nested child object stops u-url parsing</h1></span></div>
36 |   <a href="http://example.com/"></a>
37 | </article>
38 | 
39 | <article class="h-entry">
40 |   <h1 class="p-name">p- property doesn't stop implied url parsing</h1>
41 |   <a href="http://example.com/"></a>
42 | </article>
43 | 
44 | <article class="h-entry">
45 |   <p class="e-content">e-property doesn't stop implied url parsing"</p>
46 |   <a href="http://example.com/"></a>
47 | </article>
48 | 
49 | <article class="h-entry">
50 |   <h1>implied u-photo does not stop implied u-url parsing</h1>
51 |   <img src="http://example.com/avatar.png">
52 |   <a href="http://example.com/"></a>
53 | </article>
54 | 
55 | <article class="h-entry">
56 |   <h1>implied u-photo does not stop implied u-url parsing</h1>
57 |   <img src="http://example.com/avatar.png">
58 |   <a href="http://example.com/"></a>
59 | </article>


--------------------------------------------------------------------------------
/test/examples/language.html:
--------------------------------------------------------------------------------
 1 | <html lang="it">
 2 |   <div class="h-card">
 3 |     <h1 class="p-name">Romero</h1>
 4 |   </div>
 5 |   <div class="h-entry">
 6 |     <h1 class="p-name">Un titolo italiano</h1>
 7 |     <div class="e-content" lang="en">With an <em>english</em> summary</div>
 8 |     <div class="e-content">Con un riassunto <em>italiano</em></div>
 9 |   </div>
10 |   <div class="h-entry" lang="sv">
11 |     <h1 class="p-name">En svensk titel</h1>
12 |     <div class="e-content" lang="en">With an <em>english</em> summary</div>
13 |     <div class="e-content">Och <em>svensk</em> huvudtext</div>
14 |   </div>
15 | </html>
16 | 


--------------------------------------------------------------------------------
/test/examples/link-rel-minimal.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <script>var __pbpa = true;</script><script>var translated_warning_string = 'Warning: Never enter your Tumblr password unless \u201chttps://www.tumblr.com/login\u201d\x0ais the address in your web browser.\x0a\x0aYou should also see a green \u201cTumblr, Inc.\u201d identification in the address bar.\x0a\x0aSpammers and other bad guys use fake forms to steal passwords.\x0a\x0aTumblr will never ask you to log in from a user\u2019s blog.\x0a\x0aAre you absolutely sure you want to continue?';</script><script type="text/javascript" language="javascript" src="http://assets.tumblr.com/assets/scripts/pre_tumblelog.js?_v=75ff60d174af47d7ea271d82d4fe1151"></script>
 3 | <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
 4 | <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
 5 | 
 6 | <head prefix="og: http://ogp.me/ns# fb: http://ogp.me/ns/fb# blog: http://ogp.me/ns/blog#">
 7 | <link rel="micropub" href="https://feverdream.herokuapp.com/micropub">
 8 | 
 9 | <link rel="authorization_endpoint" href="https://indieauth.com/auth">
10 | <link rel="token_endpoint" href="https://tokens.indieauth.com/token">
11 | <link rel="alternate" href="android-app://com.tumblr/tumblr/x-callback-url/blog?blogName=kevinmarks" />
12 | <link rel="stylesheet" type="text/css" href="http://assets.tumblr.com/fonts/gibson/stylesheet.css?v=3">
13 | <meta charset="utf-8">
14 | </head>
15 | 
16 | <body>
17 | this should not be inside the links at all
18 | </body>
19 | </html>


--------------------------------------------------------------------------------
/test/examples/link_with_u-url.html:
--------------------------------------------------------------------------------
1 | <!DOCTYPE html>
2 | <html class="h-card">
3 |   <head>
4 |     <title></title>
5 |     <link class="u-url" href="/">
6 |   </head>
7 | </html>
8 | 


--------------------------------------------------------------------------------
/test/examples/metaformats_html_meta.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 | <head>
 4 |   <meta http-equiv="content-type" content="text/html; charset=utf-8">
 5 |   <title>Hello World</title>
 6 |   <base href="http://tantek.com/" />
 7 |   <meta name="description" content="Descrypshun bar" />
 8 | </head>
 9 | <body>
10 |   <p>Hello world!</p>
11 | </body>
12 | </html>
13 | 


--------------------------------------------------------------------------------
/test/examples/metaformats_ogp.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 | <head>
 4 |   <meta http-equiv="content-type" content="text/html; charset=utf-8">
 5 |   <title>Hello World</title>
 6 |   <base href="http://tantek.com/" />
 7 |   <meta property="og:type" content="article" />
 8 |   <meta property="og:title" content="Titull foo" />
 9 |   <meta property="og:description" content="Descrypshun bar" />
10 |   <meta property="og:image" content="http://example.com/baz.jpg" />
11 |   <meta property="og:audio" content="http://example.com/biff.mp3" />
12 |   <meta property="og:video" content="http://example.com/boff.mov" />
13 |   <meta property="article:author" content="/me" />
14 |   <meta property="article:published_time" content="2023-01-02T03:04Z" />
15 |   <meta property="article:modified_time" content="2023-01-02T05:06Z" />
16 | </head>
17 | <body>
18 |   <p>Hello world!</p>
19 | </body>
20 | </html>
21 | 


--------------------------------------------------------------------------------
/test/examples/metaformats_twitter.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 | <head>
 4 |   <meta http-equiv="content-type" content="text/html; charset=utf-8">
 5 |   <title>Hello World</title>
 6 |   <base href="http://tantek.com/" />
 7 |   <meta name="twitter:title" content="Titull foo" />
 8 |   <meta name="twitter:description" content="Descrypshun bar" />
 9 |   <meta name="twitter:image" content="/baz.jpg" />
10 | </head>
11 | <body>
12 |   <p>Hello world!</p>
13 | </body>
14 | </html>
15 | 


--------------------------------------------------------------------------------
/test/examples/nested_complex_values.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 |   <head>
 4 |     <title>Nested complex h-* value parsing test</title>
 5 |   </head>
 6 |   <body>
 7 |     <div class="h-entry">
 8 |       <div class="u-in-reply-to h-cite">
 9 |         <a class="p-author h-card" href="http://example.com">Example Author</a>
10 |         <a class="p-name u-url" href="http://example.com/post">Example Post</a>
11 |       </div>
12 |     </div>
13 |   </body>
14 | </html>
15 | 


--------------------------------------------------------------------------------
/test/examples/nested_hcards.html:
--------------------------------------------------------------------------------
1 | <!--
2 |     Without the 'visited' set, p-name is incorrectly being hoisted to
3 |     the parent h-card
4 |   -->
5 | <div class="post reply h-entry ">
6 |     <p class="p-in-reply-to h-cite"><span class="p-name">KP</span></p>
7 |     <p class="p-author h-card"><span class="p-name">KP1</span></p>
8 | </div>
9 | 


--------------------------------------------------------------------------------
/test/examples/nested_multiple_classnames.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 | <head>
 4 |  <meta http-equiv="content-type" content="text/html; charset=utf-8">
 5 |  <title>Nested + Multiple Classnames</title>
 6 | </head>
 7 | <body>
 8 |  <article class="h-entry h-as-note">
 9 |   <h1 class="p-name">A BLOG POST</h1>
10 |    <div class="p-author p-reviewer h-card">
11 |     <span class="p-name">Tom Morris</span>
12 |     <a href="http://tommorris.org/" class="u-url">tommorris.org</a>
13 |     <p class="p-adr h-adr">
14 |      <span class="p-city">London</span>
15 |     </p>
16 |    </p>
17 |   </div>
18 |   <p class="h-cite">Some Citation</p>
19 |  </article>
20 | </body>
21 | </html>
22 | 


--------------------------------------------------------------------------------
/test/examples/nested_values.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 |   <head>
 4 |     <title>Nested h-* value parsing test</title>
 5 |   </head>
 6 |   <body>
 7 |     <div class="h-entry">
 8 |       <p>
 9 |         <!-- value should be parsed as p-* -->
10 |         <a class="p-author h-card" href="http://about.me/kyle">Kyle</a>
11 |       </p>
12 |       <p>
13 |         <!-- value should be parsed as u-* -->
14 |         Liked <a class="u-like-of h-cite" href="http://example.com/foobar">foobar</a>.
15 |       </p>
16 | 
17 |       Oh by the way I should mention
18 |       <a class="h-card" href="http://people.com/george">George</a>
19 |       who will not have a nested value.
20 |     </div>
21 |   </body>
22 | </html>
23 | 


--------------------------------------------------------------------------------
/test/examples/ordering_dedup.html:
--------------------------------------------------------------------------------
 1 | <article class="h-x-test h-product h-feed h-entry h-feed">
 2 |     <a class="u-url u-url" href="example.com" >URL </a>
 3 |     <span class="p-name p-name" >name</span>
 4 |     <a class="p-name u-url" href="example.com/2" >URL name</a>
 5 |     <a rel="me bookmark" href="example.com/rel">author a</a>
 6 |     <a rel="bookmark author bookmark" href="example.com/rel">author a</a>
 7 |     <a href="example.com/lang" rel="lang" hreflang="de"></a>
 8 |     <a href="example.com/lang" rel="lang" hreflang="en"></a>
 9 | </article>
10 | 
11 | 


--------------------------------------------------------------------------------
/test/examples/parse_id.html:
--------------------------------------------------------------------------------
1 | <div class="h-feed" id="recentArticles">
2 |     <h2 class="p-name">Recent Articles</h2>
3 |     <div class="hentry" id="article">Lorem Ipsum</div>
4 |     <div class="hentry" id="">empty id is invalid and should not be parsed</div>
5 |     <div class="p-author h-card" id="theAuthor">Max Mustermann</div>
6 | </div>


--------------------------------------------------------------------------------
/test/examples/person_with_url.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 | <head>
 4 |   <meta http-equiv="content-type" content="text/html; charset=utf-8">
 5 |   <title>Simple_person_reference</title>
 6 | </head>
 7 | <body>
 8 |   <span class="h-card">
 9 |   <span class="p-name">Tom Morris</span>
10 |   <a href="http://tommorris.org/" class="u-url">tommorris.org</a>
11 |   </span>
12 | </body>
13 | </html>
14 | 


--------------------------------------------------------------------------------
/test/examples/plaintext_img_whitespace.html:
--------------------------------------------------------------------------------
 1 | <base href="https://example.com">
 2 | 
 3 | <div class="h-entry">
 4 |   <div class="e-content">
 5 |     <img src="/photo.jpg" alt="selfie">At some tourist spot
 6 |   </div>
 7 | </div>
 8 | 
 9 | <div class="h-entry">
10 |   <div class="e-content">
11 |     <img src="/photo.jpg" alt="">At another tourist spot
12 |   </div>
13 | </div>
14 | 
15 | <div class="h-entry">
16 |   <div class="e-content">
17 |     <img src="/photo.jpg">At yet another tourist spot
18 |   </div>
19 | </div>
20 | 


--------------------------------------------------------------------------------
/test/examples/plaintext_p_whitespace.html:
--------------------------------------------------------------------------------
 1 | <div class="h-entry">
 2 |   <div class="e-content">
 3 |     <p>foo</p><img src="pic.png" alt="bar">baz
 4 |   </div>
 5 | </div>
 6 | 
 7 | <div class="h-entry">
 8 |   <div class="e-content">
 9 |     <p>foo</p>bar baz
10 |   </div>
11 | </div>
12 | 
13 | <div class="h-entry">
14 |   <div class="e-content">
15 |     foo bar<p>baz</p>
16 |   </div>
17 | </div>
18 | 


--------------------------------------------------------------------------------
/test/examples/rel.html:
--------------------------------------------------------------------------------
1 | <a rel="author" href="http://example.com/a">author a</a>
2 | <a rel="author" href="http://example.com/b">author b</a>
3 | <a rel="in-reply-to" href="http://example.com/1">post 1</a>
4 | <a rel="in-reply-to" href="http://example.com/2">post 2</a>
5 | <a rel="alternate home"
6 |    href="http://example.com/fr"
7 |    media="handheld"
8 |    hreflang="fr">French mobile homepage</a>
9 | 


--------------------------------------------------------------------------------
/test/examples/rel_enclosure.html:
--------------------------------------------------------------------------------
1 | <p><a rel="enclosure" href="http://example.com/movie.mp4" type="video/mpeg">my movie</a></p>
2 | 


--------------------------------------------------------------------------------
/test/examples/relative_url_in_e.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 | <head>
 4 |   <meta http-equiv="content-type" content="text/html; charset=utf-8">
 5 |   <title>Relative URLs in e-content</title>
 6 |   <base href="http://example.com/" />
 7 | </head>
 8 | <body>
 9 | <div class="h-entry">
10 | <div class="e-content"><p><a href=/cat.html>Cat <img src=cat.jpg></a></p></div>
11 | </div>
12 | </body>
13 | </html>
14 | 


--------------------------------------------------------------------------------
/test/examples/rsvp.html:
--------------------------------------------------------------------------------
1 | <article class="h-entry h-as-rsvp">
2 |   <a class="u-in-reply-to" href="https://www.facebook.com/events/224601034410479/"></a>
3 |   <data class="p-rsvp" value="yes">is attending.</data>
4 | </article>
5 | 


--------------------------------------------------------------------------------
/test/examples/simple_person_reference.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 | <head>
 4 |   <meta http-equiv="content-type" content="text/html; charset=utf-8">
 5 |   <title>Simple_person_reference</title>
 6 | </head>
 7 | <body>
 8 |   <span class="h-card">
 9 |   <span class="p-name">Frances Berriman</span>
10 |   </span>
11 | </body>
12 | </html>
13 | 


--------------------------------------------------------------------------------
/test/examples/simple_person_reference_same_element.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 | <head>
 4 |   <meta http-equiv="content-type" content="text/html; charset=utf-8">
 5 |   <title>Simple_person_reference</title>
 6 | </head>
 7 | <body>
 8 |   <span class="h-card p-name">Frances Berriman</span>
 9 | </body>
10 | </html>
11 | 


--------------------------------------------------------------------------------
/test/examples/string_stripping.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 |   <head>
 4 |     <title>String Stripping example</title>
 5 |   </head>
 6 |   <body>
 7 |     <div class="h-card">
 8 |       <span class="p-name">  Tom Morris </span>
 9 |     </div>
10 |   </body>
11 | </html>
12 | 


--------------------------------------------------------------------------------
/test/examples/tag_whitespace_inside_p_value.html:
--------------------------------------------------------------------------------
1 | <article class="h-entry">
2 |   <div class="p-name">foo <span>bar</span></div>
3 | </article>
4 | 


--------------------------------------------------------------------------------
/test/examples/template_tag.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE HTML>
 2 | <html>
 3 | <head>
 4 | <title>Template tag test</title>
 5 | </head>
 6 | <body>
 7 | <template class="h-card">
 8 | <span class="p-name">Tom Morris</span>
 9 | </template>
10 | </body>
11 | </html>
12 | 


--------------------------------------------------------------------------------
/test/examples/template_tag_inside_e_value.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE HTML>
 2 | <html>
 3 | <head>
 4 | <title>Template tag test</title>
 5 | </head>
 6 | <body>
 7 | <div class="h-entry">
 8 |     <div class="e-content">This is a Test with a <code>template</code> tag after this:<template>This should <b>never</b> be in the output</template></div>
 9 | </div>
10 | </body>
11 | </html>
12 | 


--------------------------------------------------------------------------------
/test/examples/test_src_equiv.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE HTML>
 2 | <html>
 3 |   <head>
 4 |     <meta http-equiv="content-type" content="text/html; charset=utf-8" />
 5 | 
 6 |     <title>audio/video/source equiv</title>
 7 |     
 8 |   </head>
 9 |   <body>
10 |     <div class="h-card">
11 |       <h1 class="p-name">Alice</h1>
12 |       <img src="http://example.org/" class="u-x-example" />
13 |     </div>
14 | 
15 |     <div class="h-card">
16 |       <h1 class="p-name">Bob</h1>
17 |       <audio src="http://example.org/" class="u-x-example" />
18 |     </div>
19 | 
20 |     <div class="h-card">
21 |       <h1 class="p-name">Clarissa</h1>
22 |       <video controls src="http://example.org/" class="u-x-example" />
23 |     </div>
24 | 
25 |     <div class="h-card">
26 |       <h1 class="p-name">David</h1>
27 |       <video controls>
28 |         <source src="http://example.org/" class="u-x-example" />
29 |       </video>
30 | 	</div>
31 | 	
32 | 	<div class="h-card">
33 |       <h1 class="p-name">David</h1>
34 |         <iframe src="http://example.org/" class="u-x-example">
35 | 		  <p>Your browser does not support iframes.</p>
36 |         </iframe>
37 | 	</div>
38 |   </body>
39 | </html>
40 | 


--------------------------------------------------------------------------------
/test/examples/u-test.html:
--------------------------------------------------------------------------------
 1 | <html>
 2 | <head>
 3 |   <meta http-equiv="content-type" content="text/html; charset=utf-8">
 4 |   <title>Hello World</title>
 5 |   <base href="http://example.com/" />
 6 | </head>
 7 | <body>
 8 | <article class="h-entry">
 9 |     <h1><a  class="p-name">Expanding URLs within HTML content</a></h1>
10 |     <div class="e-content">
11 |         <ul>
12 |             <li><a class="u-url" href="http://www.w3.org/">Should not change: http://www.w3.org/</a></li>
13 |             <li><a class="u-url" href="http://example.com/">Should not change: http://example.com/</a></li>
14 |             <li><a class="u-url" href="test.html">File relative: test.html = http://example.com/test.html</a></li>
15 |             <li><a class="u-url" href="/test/test.html">Directory relative: /test/test.html = http://example.com/test/test.html</a></li>
16 |             <li><a class="u-url" href="/test.html">Relative to root: /test.html = http://example.com/test.html</a></li>
17 |         </ul>
18 |         <img src="http://www.w3.org/2008/site/images/logo-w3c-mobile-lg" /><img src="/images/test.gif" />
19 |     </div>  
20 | </article>
21 | </body>
22 | 


--------------------------------------------------------------------------------
/test/examples/u_all_cases.html:
--------------------------------------------------------------------------------
 1 | <html>
 2 | <head>
 3 |   <meta http-equiv="content-type" content="text/html; charset=utf-8">
 4 |   <title>Hello World</title>
 5 |   <base href="http://example.com/" />
 6 | </head>
 7 | <body>
 8 | <article class="h-entry">
 9 |     <h1><a  class="p-name">Testing variations of u-properties</a></h1>
10 |         <ul>
11 |             <!--if a.u-x[href] or area.u-x[href] or link.u-x[href], then get the href attribute -->
12 |             <li><a class="u-url" href="/test"></a></li>
13 |             <li><a class="u-url" href="http://example.com/test"></a></li>
14 |             <li><area class="u-url" href="/test"></li>
15 |             <li><area class="u-url" href="http://example.com/test"></li>
16 |             <li><link class="u-url" href="/test"></li>
17 |             <li><link class="u-url" href="http://example.com/test"></li>
18 |             <!--else if img.u-x[src] or audio.u-x[src] or video.u-x[src] or source.u-x[src] or iframe.u-x[src], then get the src attribute-->
19 |             <li><img class="u-url" src="/test"></li>
20 |             <li><img class="u-url" src="http://example.com/test"></li>
21 |             <li><audio class="u-url" src="/test"></audio></li>
22 |             <li><audio class="u-url" src="http://example.com/test"></audio></li>
23 |             <li><video class="u-url" src="/test"><source class="u-url" src="/test"></video></li>
24 |             <li><video class="u-url" src="http://example.com/test"><source class="u-url" src="http://example.com/test"></video></li>
25 |             <!--else if video.u-x[poster], then get the poster attribute-->
26 |             <li><video class="u-url" poster="/test"></video></li>
27 |             <li><video class="u-url" poster="http://example.com/test"></video></li>
28 |             <!--else if object.u-x[data], then get the data attribute-->
29 |             <li><object class="u-url" data="/test"></object></li>
30 |             <li><object class="u-url" data="http://example.com/test"></object></li>
31 |             <!-- else parse the element for the value-class-pattern. If a value is found, get it-->
32 |             <!-- no test case -->
33 |             <!-- else if abbr.u-x[title], then get the title attribute-->
34 |             <li><abbr class="u-url" title="/test">CHEESE</abbr></li>
35 |             <li><abbr class="u-url" title="http://example.com/test">CHEESE</abbr></li>
36 |             <!-- else if data.u-x[value] or input.u-x[value], then get the value attribute-->
37 |             <li><data class="u-url" value="/test"></data></li>
38 |             <li><data class="u-url" value="http://example.com/test"></data></li>
39 |             <li><input class="u-url" value="/test"></li>
40 |             <li><input class="u-url" value="http://example.com/test"></li>
41 |             <!-- else get the textContent of the element after removing all leading/trailing whitespace and nested <script> & <style> elements-->
42 |             <li><p class="u-url">/test</p></li>
43 |             <li><p class="u-url">http://example.com/test</p></li>
44 |             <li><p class="u-url">       /test        </p></li>
45 |             <li><p class="u-url">       /test<script>dsjfskjflkds</script>        </p></li>
46 |         </ul>
47 | </article>
48 | </body>
49 | 


--------------------------------------------------------------------------------
/test/examples/value_class_person.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 | <head>
 4 |   <meta http-equiv="content-type" content="text/html; charset=utf-8">
 5 |   <title>Value Class Pattern</title>
 6 | </head>
 7 | <body>
 8 |   <div class="h-card">
 9 |     <div class="p-name">
10 |       Tom Morris
11 |     </div>
12 |     <div class="p-tel">
13 |       <span class="type">fake</span>
14 |       <span class="value">+44 1234 567890</span>
15 |     </div>
16 | </body>
17 | </html>
18 | 


--------------------------------------------------------------------------------
/test/examples/value_name_whitespace.html:
--------------------------------------------------------------------------------
 1 | <div class="h-entry">
 2 |   <div class="e-content p-name"><p>Hello World</p></div>
 3 | </div>
 4 | 
 5 | <div class="h-entry">
 6 |   <div class="e-content p-name">
 7 |     <p>Hello World</p>
 8 |   </div>
 9 | </div>
10 | 
11 | <div class="h-entry">
12 |   <div class="e-content p-name">Hello
13 | World</div>
14 | </div>
15 | 
16 | <div class="h-entry">
17 |   <div class="e-content p-name"><p>Hello<br>World</p></div>
18 | </div>
19 | 
20 | <div class="h-entry">
21 |   <div class="e-content p-name"><p>Hello<br>
22 | World</p></div>
23 | </div>
24 | 
25 | <div class="h-entry">
26 |   <div class="e-content p-name">Hello<br>
27 |     World</div>
28 | </div>
29 | 
30 | <div class="h-entry">
31 |   <div class="e-content p-name"><br>Hello<br>World<br></div>
32 | </div>
33 | 
34 | <div class="h-entry">
35 |   <div class="e-content p-name"><p>Hello</p><p>World</p></div>
36 | </div>
37 | 
38 | <div class="h-entry">
39 |   <div class="e-content p-name">
40 |     <pre>One
41 | Two
42 | Three</pre>
43 |   </div>
44 | </div>
45 | 
46 | <div class="h-entry">
47 |   <div class="e-content p-name">
48 |     <p>One</p>
49 |     <p>Two</p>
50 |     <p>Three</p>
51 |   </div>
52 | </div>
53 | 
54 | <div class="h-entry">
55 |   <div class="e-content p-name">
56 |     Hello World
57 |     <pre>
58 |       one
59 |       two
60 |       three
61 |     </pre>
62 |   </div>
63 | </div>
64 | 
65 | <div class="h-entry">
66 | <div class="e-content">
67 | <span class="p-name">Correct name</span>
68 | 
69 | <span class="p-summary">Correct summary</span>
70 | </div>
71 | </div>
72 | 


--------------------------------------------------------------------------------
/test/test_dom_addins.py:
--------------------------------------------------------------------------------
 1 | from mf2py.parser import Parser
 2 | 
 3 | 
 4 | def test_getElementsByClassName():
 5 |     p = Parser(doc=open("test/examples/person_with_url.html"))
 6 |     dom = p.__doc__
 7 |     assert len(dom.find_all(class_="u-url")) == 1
 8 |     expected_el = dom.find_all(class_="u-url")[0]
 9 |     assert expected_el["class"] == ["u-url"]
10 | 


--------------------------------------------------------------------------------
/test/test_parser.py:
--------------------------------------------------------------------------------
   1 | import os
   2 | import re
   3 | import sys
   4 | from unittest import TestCase, mock
   5 | 
   6 | import bs4
   7 | from bs4 import BeautifulSoup
   8 | 
   9 | from mf2py import Parser
  10 | 
  11 | TestCase.maxDiff = None
  12 | 
  13 | 
  14 | TEST_DIR = "test/examples/"
  15 | 
  16 | 
  17 | def parse_fixture(path, **kwargs):
  18 |     with open(os.path.join(TEST_DIR, path)) as f:
  19 |         p = Parser(doc=f, html_parser="html5lib", **kwargs)
  20 |         return p.to_dict()
  21 | 
  22 | 
  23 | def test_empty():
  24 |     p = Parser()
  25 |     assert type(p) is not None
  26 |     assert type(p.to_dict()) is dict
  27 | 
  28 | 
  29 | def test_open_file():
  30 |     with open(os.path.join(TEST_DIR, "empty.html")) as f:
  31 |         p = Parser(doc=f)
  32 | 
  33 |     assert p.__doc__ is not None
  34 |     assert type(p) is not None
  35 |     assert type(p.to_dict()) is dict
  36 | 
  37 | 
  38 | def test_doc_tag():
  39 |     # test that strings, BS doc and BS tags are all parsed
  40 |     doc = """<article class="h-entry"></article>"""
  41 |     soup = BeautifulSoup(doc, "html5lib")
  42 |     parse_string = Parser(doc).to_dict()
  43 |     assert "h-entry" in parse_string["items"][0]["type"]
  44 |     parse_doc = Parser(soup).to_dict()
  45 |     assert "h-entry" in parse_doc["items"][0]["type"]
  46 |     parse_tag = Parser(soup.article).to_dict()
  47 |     assert "h-entry" in parse_tag["items"][0]["type"]
  48 | 
  49 | 
  50 | @mock.patch("requests.get")
  51 | def test_user_agent(getter):
  52 |     ua_expect = "mf2py - microformats2 parser for python"
  53 |     assert Parser.useragent.startswith(ua_expect)
  54 | 
  55 |     resp = mock.MagicMock()
  56 |     resp.content = b""
  57 |     resp.text = ""
  58 |     resp.headers = {}
  59 |     getter.return_value = resp
  60 | 
  61 |     Parser(url="http://example.com")
  62 |     getter.assert_called_with(
  63 |         "http://example.com", headers={"User-Agent": Parser.useragent}
  64 |     )
  65 | 
  66 |     Parser.useragent = "something else"
  67 |     assert Parser.useragent == "something else"
  68 |     # set back to default. damn stateful classes
  69 |     Parser.useragent = "mf2py - microformats2 parser for python"
  70 | 
  71 | 
  72 | def test_base():
  73 |     with open(os.path.join(TEST_DIR, "base.html")) as f:
  74 |         p = Parser(doc=f)
  75 | 
  76 |     assert p.__url__ == "http://tantek.com/"
  77 | 
  78 | 
  79 | def test_simple_parse():
  80 |     result = parse_fixture("simple_person_reference.html")
  81 |     assert result["items"][0]["properties"] == {"name": ["Frances Berriman"]}
  82 | 
  83 | 
  84 | def test_simple_person_reference_same_element():
  85 |     result = parse_fixture("simple_person_reference_same_element.html")
  86 |     assert result["items"][0]["properties"] == {"name": ["Frances Berriman"]}
  87 | 
  88 | 
  89 | def test_person_with_url():
  90 |     result = parse_fixture("person_with_url.html")
  91 |     assert result["items"][0]["properties"]["name"] == ["Tom Morris"]
  92 |     assert result["items"][0]["properties"]["url"] == ["http://tommorris.org/"]
  93 | 
  94 | 
  95 | def test_vcp():
  96 |     result = parse_fixture("value_class_person.html")
  97 |     assert result["items"][0]["properties"]["tel"] == ["+44 1234 567890"]
  98 | 
  99 | 
 100 | def test_multiple_root_classnames():
 101 |     result = parse_fixture("nested_multiple_classnames.html")
 102 |     # order does not matter
 103 |     assert len(result["items"]) == 1
 104 |     assert set(result["items"][0]["type"]) == set(["h-entry", "h-as-note"])
 105 | 
 106 | 
 107 | def test_property_nested_microformat():
 108 |     result = parse_fixture("nested_multiple_classnames.html")
 109 | 
 110 |     assert len(result["items"]) == 1
 111 |     assert "author" in result["items"][0]["properties"]
 112 |     assert (
 113 |         result["items"][0]["properties"]["author"][0]["properties"]["name"][0]
 114 |         == "Tom Morris"
 115 |     )
 116 |     assert (
 117 |         result["items"][0]["properties"]["reviewer"][0]["properties"]["name"][0]
 118 |         == "Tom Morris"
 119 |     )
 120 |     assert (
 121 |         result["items"][0]["properties"]["author"][0]["properties"]["adr"][0][
 122 |             "properties"
 123 |         ]["city"][0]
 124 |         == "London"
 125 |     )
 126 | 
 127 | 
 128 | def test_plain_child_microformat():
 129 |     result = parse_fixture("nested_multiple_classnames.html")
 130 | 
 131 |     assert len(result["items"]) == 1
 132 |     assert "children" in result["items"][0]
 133 |     assert len(result["items"][0]["children"]) == 1
 134 |     assert result["items"][0]["children"][0]["properties"]["name"][0] == "Some Citation"
 135 | 
 136 | 
 137 | def test_datetime_parsing():
 138 |     result = parse_fixture("datetimes.html")
 139 |     assert result["items"][0]["properties"]["start"][0] == "2014-01-01T12:00:00+0000"
 140 |     assert result["items"][0]["properties"]["end"][0] == "3014-01-01T18:00:00+0000"
 141 |     assert result["items"][0]["properties"]["duration"][0] == "P1000Y"
 142 |     assert result["items"][0]["properties"]["updated"][0] == "2011-08-26T00:01:21+0000"
 143 |     assert result["items"][0]["properties"]["updated"][1] == "2011-08-26T00:01:21+0000"
 144 | 
 145 | 
 146 | def test_datetime_vcp_parsing():
 147 |     result = parse_fixture("datetimes.html")
 148 |     assert len(result["items"]) == 16
 149 |     assert result["items"][1]["properties"]["published"][0] == "3014-01-01 01:21Z"
 150 |     assert result["items"][2]["properties"]["updated"][0] == "2014-03-11 09:55"
 151 |     assert result["items"][3]["properties"]["published"][0] == "2014-01-30 15:28"
 152 |     assert result["items"][4]["properties"]["published"][0] == "9999-01-14T11:52+0800"
 153 |     assert result["items"][5]["properties"]["published"][0] == "2014-06-01 12:30-0600"
 154 |     assert result["items"][8]["properties"]["start"][0] == "2014-06-01 12:30-0600"
 155 |     assert result["items"][9]["properties"]["start"][0] == "2014-06-01 12:30-0600"
 156 |     assert result["items"][10]["properties"]["start"][0] == "2014-06-01 00:30-0600"
 157 |     assert result["items"][10]["properties"]["end"][0] == "2014-06-01 12:15"
 158 |     assert result["items"][10]["properties"]["start"][1] == "2014-06-01 00:30-0600"
 159 |     assert result["items"][10]["properties"]["end"][1] == "2014-06-01 12:15"
 160 |     assert result["items"][11]["properties"]["start"][0] == "2016-03-02 00:30-0600"
 161 |     assert result["items"][12]["properties"]["start"][0] == "2014-06-01 12:30-600"
 162 |     assert result["items"][13]["properties"]["start"][0] == "2014-06-01 12:30+600"
 163 |     assert result["items"][14]["properties"]["start"][0] == "2014-06-01 12:30Z"
 164 |     assert result["items"][15]["properties"]["start"][0] == "2014-06-01 12:30-600"
 165 | 
 166 | 
 167 | def test_dt_end_implied_date():
 168 |     """Test that events with dt-start and dt-end use the implied date rule
 169 |     http://microformats.org/wiki/value-class-pattern#microformats2_parsers
 170 |     for times without dates"""
 171 |     result = parse_fixture("datetimes.html")
 172 | 
 173 |     event_wo_tz = result["items"][6]
 174 |     assert event_wo_tz["properties"]["start"][0] == "2014-05-21 18:30"
 175 |     assert event_wo_tz["properties"]["end"][0] == "2014-05-21 19:30"
 176 | 
 177 |     event_w_tz = result["items"][7]
 178 |     assert event_w_tz["properties"]["start"][0] == "2014-06-01 12:30-0600"
 179 |     assert event_w_tz["properties"]["end"][0] == "2014-06-01 19:30-0600"
 180 | 
 181 | 
 182 | def test_embedded_parsing():
 183 |     result = parse_fixture("embedded.html")
 184 |     assert (
 185 |         result["items"][0]["properties"]["content"][0]["html"]
 186 |         == "<p>Blah blah blah blah blah.</p>\n   <p>Blah.</p>\n   <p>Blah blah blah.</p>"
 187 |     )
 188 |     assert (
 189 |         result["items"][0]["properties"]["content"][0]["value"]
 190 |         == "Blah blah blah blah blah.\n\nBlah.\n\nBlah blah blah."
 191 |     )
 192 | 
 193 | 
 194 | def test_embedded_exposed_dom():
 195 |     result = parse_fixture("embedded.html", expose_dom=True)
 196 |     content = result["items"][0]["properties"]["content"][0]
 197 |     assert "html" not in content
 198 |     assert isinstance(content["dom"], bs4.element.Tag)
 199 | 
 200 | 
 201 | def test_hoisting_nested_hcard():
 202 |     result = parse_fixture("nested_hcards.html")
 203 |     expected = [
 204 |         {
 205 |             "properties": {
 206 |                 "author": [
 207 |                     {
 208 |                         "properties": {"name": ["KP1"]},
 209 |                         "type": ["h-card"],
 210 |                         "value": "KP1",
 211 |                     }
 212 |                 ],
 213 |                 "in-reply-to": [
 214 |                     {"properties": {"name": ["KP"]}, "type": ["h-cite"], "value": "KP"}
 215 |                 ],
 216 |             },
 217 |             "type": ["h-entry"],
 218 |         }
 219 |     ]
 220 |     assert expected == result["items"]
 221 | 
 222 | 
 223 | def test_html_tag_class():
 224 |     result = parse_fixture("hfeed_on_html_tag.html")
 225 |     assert ["h-feed"] == result["items"][0]["type"]
 226 | 
 227 |     assert ["entry1"] == result["items"][0]["children"][0]["properties"]["name"]
 228 |     assert ["entry2"] == result["items"][0]["children"][1]["properties"]["name"]
 229 | 
 230 | 
 231 | def test_string_strip():
 232 |     result = parse_fixture("string_stripping.html")
 233 |     assert "Tom Morris" == result["items"][0]["properties"]["name"][0]
 234 | 
 235 | 
 236 | def test_template_parse():
 237 |     result = parse_fixture("template_tag.html")
 238 |     assert 0 == len(result["items"])
 239 | 
 240 | 
 241 | def test_template_tag_inside_e_value():
 242 |     result = parse_fixture("template_tag_inside_e_value.html")
 243 |     assert (
 244 |         "This is a Test with a <code>template</code> tag after this:"
 245 |         == result["items"][0]["properties"]["content"][0]["html"]
 246 |     )
 247 |     assert (
 248 |         "This is a Test with a template tag after this:"
 249 |         == result["items"][0]["properties"]["content"][0]["value"]
 250 |     )
 251 | 
 252 | 
 253 | def test_ordering_dedup():
 254 |     """test that classes are dedeuped and alphabetically ordered"""
 255 | 
 256 |     result = parse_fixture("ordering_dedup.html")
 257 |     item = result["items"][0]
 258 |     assert ["h-entry", "h-feed", "h-product", "h-x-test"] == item["type"]
 259 |     assert ["example.com", "example.com/2"] == item["properties"]["url"]
 260 |     assert ["name", "URL name"] == item["properties"]["name"]
 261 |     assert ["author", "bookmark", "me"] == result["rel-urls"]["example.com/rel"]["rels"]
 262 |     assert "de" == result["rel-urls"]["example.com/lang"]["hreflang"]
 263 | 
 264 | 
 265 | def test_class_names_format():
 266 |     """test that only classes with letters and possibly numbers in the vendor prefix part are used"""
 267 | 
 268 |     result = parse_fixture("class_names_format.html")
 269 |     item = result["items"][0]
 270 |     assert ["h-feed", "h-p3k-entry", "h-x-test"] == item["type"]
 271 |     assert "url" in item["properties"]
 272 |     assert "p3k-url" in item["properties"]
 273 |     assert "Url" not in item["properties"]
 274 |     assert "-url" not in item["properties"]
 275 |     assert "url-" not in item["properties"]
 276 | 
 277 |     assert "name" in item["properties"]
 278 |     assert "p3k-name" in item["properties"]
 279 |     assert "nAme" not in item["properties"]
 280 |     assert "-name" not in item["properties"]
 281 |     assert "name-" not in item["properties"]
 282 | 
 283 | 
 284 | def test_area_uparsing():
 285 |     result = parse_fixture("area.html")
 286 |     assert {"url": ["http://suda.co.uk"], "name": ["Brian Suda"]} == result["items"][0][
 287 |         "properties"
 288 |     ]
 289 |     assert "shape" in result["items"][0]
 290 |     assert "coords" in result["items"][0]
 291 | 
 292 | 
 293 | def test_src_equiv():
 294 |     result = parse_fixture("test_src_equiv.html")
 295 |     for item in result["items"]:
 296 |         assert "x-example" in item["properties"]
 297 |         assert "http://example.org/" == item["properties"]["x-example"][0]
 298 | 
 299 | 
 300 | def test_rels():
 301 |     result = parse_fixture("rel.html")
 302 |     assert {
 303 |         "in-reply-to": ["http://example.com/1", "http://example.com/2"],
 304 |         "author": ["http://example.com/a", "http://example.com/b"],
 305 |         "alternate": ["http://example.com/fr"],
 306 |         "home": ["http://example.com/fr"],
 307 |     } == result["rels"]
 308 |     assert {
 309 |         "http://example.com/1": {"text": "post 1", "rels": ["in-reply-to"]},
 310 |         "http://example.com/2": {"text": "post 2", "rels": ["in-reply-to"]},
 311 |         "http://example.com/a": {"text": "author a", "rels": ["author"]},
 312 |         "http://example.com/b": {"text": "author b", "rels": ["author"]},
 313 |         "http://example.com/fr": {
 314 |             "text": "French mobile homepage",
 315 |             "media": "handheld",
 316 |             "rels": ["alternate", "home"],
 317 |             "hreflang": "fr",
 318 |         },
 319 |     } == result["rel-urls"]
 320 | 
 321 | 
 322 | def test_alternates():
 323 |     result = parse_fixture("rel.html")
 324 |     assert [
 325 |         {
 326 |             "url": "http://example.com/fr",
 327 |             "media": "handheld",
 328 |             "text": "French mobile homepage",
 329 |             "rel": "home",
 330 |             "hreflang": "fr",
 331 |         }
 332 |     ] == result["alternates"]
 333 | 
 334 | 
 335 | def test_enclosures():
 336 |     result = parse_fixture("rel_enclosure.html")
 337 |     assert {"enclosure": ["http://example.com/movie.mp4"]} == result["rels"]
 338 |     assert {
 339 |         "http://example.com/movie.mp4": {
 340 |             "rels": ["enclosure"],
 341 |             "text": "my movie",
 342 |             "type": "video/mpeg",
 343 |         }
 344 |     } == result["rel-urls"]
 345 | 
 346 | 
 347 | def test_empty_href():
 348 |     result = parse_fixture("hcard_with_empty_url.html", url="http://foo.com")
 349 | 
 350 |     for hcard in result["items"]:
 351 |         assert ["http://foo.com"] == hcard["properties"]["url"]
 352 | 
 353 | 
 354 | def test_link_with_u_url():
 355 |     result = parse_fixture("link_with_u-url.html", url="http://foo.com")
 356 |     assert {
 357 |         "type": ["h-card"],
 358 |         "properties": {
 359 |             "name": [""],
 360 |             "url": ["http://foo.com/"],
 361 |         },
 362 |     } == result["items"][0]
 363 | 
 364 | 
 365 | def test_broken_url():
 366 |     result = parse_fixture("broken_url.html", url="http://example.com")
 367 |     assert (
 368 |         result["items"][0]["properties"]["relative"][0] == "http://example.com/foo.html"
 369 |     )
 370 |     assert result["items"][0]["properties"]["url"][0] == "http://www.[w3.org/"
 371 |     assert (
 372 |         result["items"][0]["properties"]["photo"][0]
 373 |         == "http://www.w3].org/20[08/site/images/logo-w3c-mobile-lg"
 374 |     )
 375 | 
 376 | 
 377 | def test_complex_e_content():
 378 |     """When parsing h-* e-* properties, we should fold {"value":..., "html":...}
 379 |     into the parsed microformat object, instead of nesting it under an
 380 |     unnecessary second layer of "value":
 381 |     """
 382 |     result = parse_fixture("complex_e_content.html")
 383 | 
 384 |     assert {
 385 |         "type": ["h-entry"],
 386 |         "properties": {
 387 |             "content": [
 388 |                 {
 389 |                     "type": ["h-card"],
 390 |                     "properties": {"name": ["Hello"]},
 391 |                     "html": "<p>Hello</p>",
 392 |                     "value": "Hello",
 393 |                 }
 394 |             ],
 395 |         },
 396 |     } == result["items"][0]
 397 | 
 398 | 
 399 | def test_relative_url_in_e():
 400 |     """When parsing e-* properties, make relative URLs absolute."""
 401 |     result = parse_fixture("relative_url_in_e.html")
 402 | 
 403 |     assert (
 404 |         '<p><a href="http://example.com/cat.html">Cat '
 405 |         '<img src="http://example.com/cat.jpg"/></a></p>'
 406 |     ) == result["items"][0]["properties"]["content"][0]["html"]
 407 | 
 408 | 
 409 | def test_nested_values():
 410 |     """When parsing nested microformats, check that value is the value of
 411 |     the simple property element"""
 412 |     result = parse_fixture("nested_values.html")
 413 |     entry = result["items"][0]
 414 | 
 415 |     assert {
 416 |         "properties": {
 417 |             "name": ["Kyle"],
 418 |             "url": ["http://about.me/kyle"],
 419 |         },
 420 |         "value": "Kyle",
 421 |         "type": ["h-card"],
 422 |     } == entry["properties"]["author"][0]
 423 | 
 424 |     assert {
 425 |         "properties": {
 426 |             "name": ["foobar"],
 427 |             "url": ["http://example.com/foobar"],
 428 |         },
 429 |         "value": "http://example.com/foobar",
 430 |         "type": ["h-cite"],
 431 |     } == entry["properties"]["like-of"][0]
 432 | 
 433 |     assert {
 434 |         "properties": {
 435 |             "name": ["George"],
 436 |             "url": ["http://people.com/george"],
 437 |         },
 438 |         "type": ["h-card"],
 439 |     } == entry["children"][0]
 440 | 
 441 | 
 442 | # implied properties tests
 443 | 
 444 | 
 445 | def test_implied_name():
 446 |     result = parse_fixture("implied_properties/implied_properties.html")
 447 | 
 448 |     for i in range(7):
 449 |         assert result["items"][i]["properties"]["name"][0] == "Tom Morris"
 450 | 
 451 | 
 452 | def test_implied_url():
 453 |     result = parse_fixture(
 454 |         "implied_properties/implied_properties.html", url="http://foo.com/"
 455 |     )
 456 |     assert result["items"][1]["properties"]["url"][0] == "http://tommorris.org/"
 457 |     # img should not have a "url" property
 458 |     assert "url" not in result["items"][4]["properties"]
 459 |     # href="" is relative to the base url
 460 |     assert result["items"][5]["properties"]["url"][0] == "http://foo.com/"
 461 | 
 462 | 
 463 | def test_implied_photo():
 464 |     result = parse_fixture("implied_properties/implied_photo.html")
 465 | 
 466 |     for i in range(12):
 467 |         photos = result["items"][i]["properties"]["photo"]
 468 |         assert len(photos) == 1
 469 |         assert photos[0] == "http://example.com/photo.jpg"
 470 | 
 471 |     # tests for no photo
 472 |     for i in range(12, 23):
 473 |         assert "photo" not in result["items"][i]["properties"]
 474 | 
 475 |     result = parse_fixture("implied_properties/implied_photo_relative_url.html")
 476 | 
 477 |     assert (
 478 |         result["items"][0]["properties"]["photo"][0]["value"]
 479 |         == "http://example.com/jane-img.jpeg"
 480 |     )
 481 |     assert (
 482 |         result["items"][1]["properties"]["photo"][0]
 483 |         == "http://example.com/jane-object.jpeg"
 484 |     )
 485 | 
 486 | 
 487 | def test_implied_url():
 488 |     result = parse_fixture("implied_properties/implied_url.html")
 489 | 
 490 |     for i in range(12):
 491 |         urls = result["items"][i]["properties"]["url"]
 492 |         assert len(urls) == 1
 493 |         assert urls[0] == "http://example.com"
 494 | 
 495 |     # tests for no url
 496 |     for i in range(12, 23):
 497 |         assert "url" not in result["items"][i]["properties"]
 498 | 
 499 | 
 500 | def test_stop_implied_url():
 501 |     """testing that explicit properties case implied url-parsing to be aborted"""
 502 | 
 503 |     result = parse_fixture("implied_properties/stop_implied_url.html")
 504 | 
 505 |     assert "url" not in result["items"][0]["properties"]
 506 |     assert "url" not in result["items"][1]["properties"]
 507 |     assert "url" not in result["items"][2]["properties"]
 508 |     assert "url" not in result["items"][3]["properties"]
 509 |     assert "url" not in result["items"][4]["properties"]
 510 |     assert "url" not in result["items"][5]["properties"]
 511 | 
 512 |     assert result["items"][6]["properties"]["url"] == ["http://example.com/"]
 513 |     assert result["items"][7]["properties"]["url"] == ["http://example.com/"]
 514 |     assert result["items"][8]["properties"]["url"] == ["http://example.com/"]
 515 |     assert result["items"][9]["properties"]["url"] == ["http://example.com/"]
 516 | 
 517 | 
 518 | def test_implied_nested_photo():
 519 |     result = parse_fixture(
 520 |         "implied_properties/implied_properties.html", url="http://bar.org"
 521 |     )
 522 |     assert result["items"][2]["properties"]["photo"][0] == {
 523 |         "alt": "",
 524 |         "value": "http://tommorris.org/photo.png",
 525 |     }
 526 |     assert (
 527 |         result["items"][3]["properties"]["photo"][0] == "http://tommorris.org/photo.png"
 528 |     )
 529 |     assert result["items"][4]["properties"]["photo"][0] == {
 530 |         "alt": "Tom Morris",
 531 |         "value": "http://tommorris.org/photo.png",
 532 |     }
 533 |     # src="" is relative to the base url
 534 |     assert result["items"][6]["properties"]["photo"][0] == "http://bar.org"
 535 | 
 536 | 
 537 | def test_implied_nested_photo_alt_name():
 538 |     result = parse_fixture("implied_properties/implied_properties.html")
 539 |     assert result["items"][3]["properties"]["name"][0] == "Tom Morris"
 540 | 
 541 | 
 542 | def test_implied_image():
 543 |     result = parse_fixture("implied_properties/implied_properties.html")
 544 |     assert result["items"][4]["properties"]["photo"][0] == {
 545 |         "alt": "Tom Morris",
 546 |         "value": "http://tommorris.org/photo.png",
 547 |     }
 548 |     assert result["items"][4]["properties"]["name"][0] == "Tom Morris"
 549 | 
 550 | 
 551 | def test_implied_name_empty_alt():
 552 |     """An empty alt text should not prevent us from including other
 553 |     children in the implied name.
 554 |     """
 555 | 
 556 |     result = parse_fixture("implied_properties/implied_name_empty_alt.html")
 557 |     hcard = result["items"][0]
 558 | 
 559 |     assert {
 560 |         "type": ["h-card"],
 561 |         "properties": {
 562 |             "name": ["@kylewmahan"],
 563 |             "url": ["https://twitter.com/kylewmahan"],
 564 |             "photo": [{"alt": "", "value": "https://example.org/test.jpg"}],
 565 |         },
 566 |     } == hcard
 567 | 
 568 | 
 569 | def test_relative_datetime():
 570 |     result = parse_fixture("implied_properties/implied_relative_datetimes.html")
 571 |     assert result["items"][0]["properties"]["updated"][0] == "2015-01-02 05:06"
 572 | 
 573 | 
 574 | def test_stop_implied_name_nested_h():
 575 |     result = parse_fixture("implied_properties/stop_implied_name_nested_h.html")
 576 |     assert "name" not in result["items"][0]["properties"]
 577 | 
 578 | 
 579 | def test_stop_implied_name_e_content():
 580 |     result = parse_fixture("implied_properties/stop_implied_name_e_content.html")
 581 |     assert "name" not in result["items"][0]["properties"]
 582 | 
 583 | 
 584 | def test_stop_implied_name_p_content():
 585 |     result = parse_fixture("implied_properties/stop_implied_name_p_content.html")
 586 |     assert "name" not in result["items"][0]["properties"]
 587 | 
 588 | 
 589 | def test_implied_properties_silo_pub():
 590 |     result = parse_fixture("implied_properties/implied_properties_silo_pub.html")
 591 |     item = result["items"][0]
 592 | 
 593 |     # implied_name = item['properties']['name'][0]
 594 |     # implied_name = re.sub('\s+', ' ', implied_name).strip()
 595 |     # assert '@kylewmahan on Twitter', implied_name)
 596 | 
 597 |     # no implied name expected under new rules
 598 | 
 599 |     assert "name" not in item["properties"]
 600 | 
 601 | 
 602 | def test_simple_person_reference_implied():
 603 |     result = parse_fixture("implied_properties/simple_person_reference_implied.html")
 604 |     assert result["items"][0]["properties"] == {"name": ["Frances Berriman"]}
 605 | 
 606 | 
 607 | def test_implied_name_alt():
 608 |     result = parse_fixture("implied_properties/implied_name_alt.html")
 609 |     assert result["items"][0]["children"][0] == {
 610 |         "type": ["h-card"],
 611 |         "properties": {
 612 |             "name": ["Avatar of Stephen"],
 613 |             "photo": [{"alt": "Avatar of", "value": "avatar.jpg"}],
 614 |         },
 615 |     }
 616 | 
 617 | 
 618 | def test_value_name_whitespace():
 619 |     result = parse_fixture("value_name_whitespace.html")
 620 | 
 621 |     for i in range(3):
 622 |         assert result["items"][i]["properties"]["content"][0]["value"] == "Hello World"
 623 |         assert result["items"][i]["properties"]["name"][0] == "Hello World"
 624 | 
 625 |     for i in range(3, 7):
 626 |         assert result["items"][i]["properties"]["content"][0]["value"] == "Hello\nWorld"
 627 |         assert result["items"][i]["properties"]["name"][0] == "Hello\nWorld"
 628 | 
 629 |     assert result["items"][7]["properties"]["content"][0]["value"] == "Hello\n\nWorld"
 630 |     assert result["items"][7]["properties"]["name"][0] == "Hello\n\nWorld"
 631 | 
 632 |     assert result["items"][8]["properties"]["content"][0]["value"] == "One\nTwo\nThree"
 633 |     assert result["items"][8]["properties"]["name"][0] == "One\nTwo\nThree"
 634 | 
 635 |     assert (
 636 |         result["items"][9]["properties"]["content"][0]["value"] == "One\n\nTwo\n\nThree"
 637 |     )
 638 |     assert result["items"][9]["properties"]["name"][0] == "One\n\nTwo\n\nThree"
 639 | 
 640 |     assert (
 641 |         result["items"][10]["properties"]["content"][0]["value"]
 642 |         == "Hello World      one\n      two\n      three\n    "
 643 |     )
 644 |     assert (
 645 |         result["items"][10]["properties"]["name"][0]
 646 |         == "Hello World      one\n      two\n      three\n    "
 647 |     )
 648 | 
 649 |     assert (
 650 |         result["items"][11]["properties"]["content"][0]["value"]
 651 |         == "Correct name Correct summary"
 652 |     )
 653 |     assert result["items"][11]["properties"]["name"][0] == "Correct name"
 654 | 
 655 | 
 656 | # backcompat tests
 657 | 
 658 | 
 659 | def test_backcompat_hentry():
 660 |     result = parse_fixture("backcompat/hentry.html")
 661 |     assert "h-entry" in result["items"][0]["type"]
 662 |     assert (
 663 |         "Tom Morris"
 664 |         == result["items"][0]["properties"]["author"][0]["properties"]["name"][0]
 665 |     )
 666 |     assert "A Title" == result["items"][0]["properties"]["name"][0]
 667 |     assert "Some Content" == result["items"][0]["properties"]["content"][0]["value"]
 668 | 
 669 | 
 670 | def test_backcompat_hproduct():
 671 |     result = parse_fixture("backcompat/hproduct.html")
 672 |     assert 1 == len(result["items"])
 673 |     assert ["h-product"] == result["items"][0]["type"]
 674 |     assert ["bullshit"] == result["items"][0]["properties"]["category"]
 675 |     assert ["Quacktastic Products"] == result["items"][0]["properties"]["brand"]
 676 |     assert ["#BULLSHIT-001"] == result["items"][0]["properties"]["identifier"]
 677 |     assert (
 678 |         "Magical tasty sugar pills that don't do anything."
 679 |         == result["items"][0]["properties"]["description"][0]
 680 |     )
 681 |     assert ["Tom's Magical Quack Tincture"] == result["items"][0]["properties"]["name"]
 682 | 
 683 | 
 684 | def test_backcompat_hproduct_nested_hreview():
 685 |     result = parse_fixture("backcompat/hproduct_hreview_nested.html")
 686 |     assert ["h-review"] == result["items"][0]["children"][0]["type"]
 687 | 
 688 | 
 689 | def test_backcompat_hreview_nested_card_event_product():
 690 |     result = parse_fixture("backcompat/hreview_nested_card_event_product.html")
 691 |     assert ["h-review"] == result["items"][0]["type"]
 692 |     items = result["items"][0]["properties"]["item"]
 693 |     assert 3 == len(items)
 694 | 
 695 |     event = items[0]
 696 |     assert ["h-event"] == event["type"]
 697 |     assert ["http://example.com/event-url"] == event["properties"]["url"]
 698 |     assert ["event name"] == event["properties"]["name"]
 699 | 
 700 |     card = items[1]
 701 |     assert ["h-card"] == card["type"]
 702 |     assert ["http://example.com/card-url"] == card["properties"]["url"]
 703 |     assert ["card name"] == card["properties"]["name"]
 704 | 
 705 |     product = items[2]
 706 |     assert ["h-product"] == product["type"]
 707 |     assert ["http://example.com/product-url"] == product["properties"]["url"]
 708 |     assert ["product name"] == product["properties"]["name"]
 709 | 
 710 | 
 711 | def test_backcompat_rel_bookmark():
 712 |     """Confirm that rel=bookmark inside of an h-entry is converted
 713 |     to u-url.
 714 |     """
 715 |     result = parse_fixture("backcompat/feed_with_rel_bookmark.html")
 716 |     for ii, url in enumerate(
 717 |         (
 718 |             "/2014/11/24/jump-rope",
 719 |             "/2014/11/23/graffiti",
 720 |             "/2014/11/21/earth",
 721 |             "/2014/11/19/labor",
 722 |         )
 723 |     ):
 724 |         assert ["h-entry"] == result["items"][ii]["type"]
 725 |         assert [url] == result["items"][ii]["properties"]["url"]
 726 | 
 727 | 
 728 | def test_backcompat_rel_bookmark():
 729 |     """Confirm that rel=bookmark inside of an hentry and hreview is converted
 730 |     to a u-url and original u-url is ignored
 731 |     """
 732 | 
 733 |     tests = [
 734 |         "backcompat/hentry_with_rel_bookmark.html",
 735 |         "backcompat/hreview_with_rel_tag_bookmark.html",
 736 |     ]
 737 | 
 738 |     results = [parse_fixture(x) for x in tests]
 739 | 
 740 |     for result in results:
 741 |         assert [
 742 |             "https://example.com/bookmark",
 743 |             "https://example.com/bookmark-url",
 744 |         ] == result["items"][0]["properties"]["url"]
 745 | 
 746 | 
 747 | def test_backcompat_rel_tag():
 748 |     """Confirm that rel=tag inside of an hentry is converted
 749 |     to a p-category and the last path segment of the href is used.
 750 |     """
 751 | 
 752 |     tests = [
 753 |         "backcompat/hentry_with_rel_tag.html",
 754 |         "backcompat/hfeed_with_rel_tag.html",
 755 |         "backcompat/hrecipe_with_rel_tag.html",
 756 |         "backcompat/hreview_with_rel_tag_bookmark.html",
 757 |     ]
 758 | 
 759 |     results = [parse_fixture(x) for x in tests]
 760 |     for result in results:
 761 |         assert ["cat", "dog", "mountain lion", "mouse", "meerkat"] == result["items"][
 762 |             0
 763 |         ]["properties"]["category"]
 764 | 
 765 | 
 766 | def test_backcompat_rel_tag_entry_title():
 767 |     """Confirm that other backcompat properties on a rel=tag are parsed"""
 768 | 
 769 |     result = parse_fixture("backcompat/hentry_with_rel_tag_entry_title.html")
 770 |     assert ["cat"] == result["items"][0]["properties"]["category"]
 771 |     assert ["rhinoceros"] == result["items"][0]["properties"]["name"]
 772 | 
 773 | 
 774 | def test_backcompat_rel_multiple_root():
 775 |     """Confirm that rel=tag and rel=bookmark inside of an hentry+hreview is parsed correctly"""
 776 | 
 777 |     result = parse_fixture("backcompat/hreview_hentry_with_rel_tag_bookmark.html")
 778 | 
 779 |     assert len(result["items"]) == 1
 780 |     assert "h-entry" in result["items"][0]["type"]
 781 |     assert "h-review" in result["items"][0]["type"]
 782 | 
 783 |     assert ["cat", "dog", "mountain lion", "mouse", "meerkat"] == result["items"][0][
 784 |         "properties"
 785 |     ]["category"]
 786 |     assert [
 787 |         "https://example.com/bookmark",
 788 |         "https://example.com/bookmark-url",
 789 |     ] == result["items"][0]["properties"]["url"]
 790 | 
 791 | 
 792 | def test_backcompat_ignore_mf1_root_if_mf2_present():
 793 |     """Confirm that mf1 root class is ignored if another mf2 root class is present."""
 794 |     result = parse_fixture("backcompat/ignore_mf1_root_if_mf2_present.html")
 795 |     assert "h-entry" not in result["items"][0]["type"]
 796 |     assert "h-event" in result["items"][0]["type"]
 797 | 
 798 | 
 799 | def test_backcompat_no_implied_properties_mf1_root():
 800 |     """Confirm that mf1 root class does not have implied properties"""
 801 |     result = parse_fixture("backcompat/ignore_mf1_root_if_mf2_present.html")
 802 |     assert "h-entry" not in result["items"][0]["properties"]
 803 |     assert "name" not in result["items"][0]["type"]
 804 |     assert "url" not in result["items"][0]["properties"]
 805 |     assert "photo" not in result["items"][0]["properties"]
 806 | 
 807 | 
 808 | def test_backcompat_ignore_mf2_properties_in_mf1_root():
 809 |     """Confirm that mf2 properties are ignored in mf1 root class"""
 810 |     result = parse_fixture("backcompat/ignore_mf2_properties_in_mf1_root.html")
 811 |     assert "Correct name" == result["items"][0]["properties"]["name"][0]
 812 |     assert "Correct summary" == result["items"][0]["properties"]["summary"][0]
 813 | 
 814 | 
 815 | def test_backcompat_ignore_mf1_properties_in_mf2_root():
 816 |     """Confirm that mf1 properties are ignored in mf2 root class"""
 817 |     result = parse_fixture("backcompat/ignore_mf1_properties_in_mf2_root.html")
 818 |     assert "Correct name" == result["items"][0]["properties"]["name"][0]
 819 |     assert "Correct summary" == result["items"][0]["properties"]["summary"][0]
 820 | 
 821 | 
 822 | def test_backcompat_nested_mf2_in_mf1():
 823 |     """Confirm that mf2 roots nested inside mf1 root are parsed"""
 824 |     result = parse_fixture("backcompat/nested_mf2_in_mf1.html")
 825 |     assert "h-feed" == result["items"][0]["type"][0]
 826 |     assert "h-entry" == result["items"][0]["children"][0]["type"][0]
 827 |     assert "Correct name" == result["items"][0]["children"][0]["properties"]["name"][0]
 828 |     assert (
 829 |         "Correct summary"
 830 |         == result["items"][0]["children"][0]["properties"]["summary"][0]
 831 |     )
 832 | 
 833 | 
 834 | def test_backcompat_nested_mf1_in_mf2():
 835 |     """Confirm that mf1 roots nested inside mf2 root are parsed"""
 836 |     result = parse_fixture("backcompat/nested_mf1_in_mf2.html")
 837 |     assert "h-feed" == result["items"][0]["type"][0]
 838 |     assert "h-entry" == result["items"][0]["children"][0]["type"][0]
 839 |     assert "Correct name" == result["items"][0]["children"][0]["properties"]["name"][0]
 840 |     assert (
 841 |         "Correct summary"
 842 |         == result["items"][0]["children"][0]["properties"]["summary"][0]
 843 |     )
 844 | 
 845 | 
 846 | def test_backcompat_nested_mf1_in_mf2_e_content():
 847 |     """Confirm that mf1 roots nested inside mf2 root e-content are parsed as authored"""
 848 |     result = parse_fixture("backcompat/nested_mf1_in_mf2_e_content.html")
 849 | 
 850 |     mf2_entry = result["items"][0]
 851 |     mf1_entry = mf2_entry["children"][0]
 852 | 
 853 |     assert (
 854 |         '<div class="hentry">\n<span class="entry-title">Correct name</span>\n\n<span class="entry-summary">Correct summary</span>\n</div>'
 855 |         == mf2_entry["properties"]["content"][0]["html"]
 856 |     )
 857 | 
 858 |     assert (
 859 |         "Correct name Correct summary" == mf2_entry["properties"]["content"][0]["value"]
 860 |     )
 861 | 
 862 |     assert "h-entry" == mf1_entry["type"][0]
 863 |     assert "Correct name" == mf1_entry["properties"]["name"][0]
 864 |     assert "Correct summary" == mf1_entry["properties"]["summary"][0]
 865 | 
 866 | 
 867 | def test_backcompat_hentry_content_html():
 868 |     """Confirm that mf1 entry-content html is parsed as authored without mf2 replacements"""
 869 |     result = parse_fixture("backcompat/hentry_content_html.html")
 870 | 
 871 |     entry = result["items"][0]
 872 | 
 873 |     assert (
 874 |         '<p class="entry-summary">This is a summary</p> \n        <p>This is <a href="/tags/mytag" rel="tag">mytag</a> inside content. </p>'
 875 |         == entry["properties"]["content"][0]["html"]
 876 |     )
 877 | 
 878 | 
 879 | def test_whitespace_with_tags_inside_property():
 880 |     """Whitespace should only be trimmed at the ends of textContent, not inside.
 881 | 
 882 |     https://github.com/microformats/mf2py/issues/112
 883 |     """
 884 |     result = parse_fixture("tag_whitespace_inside_p_value.html")
 885 |     assert result["items"][0]["properties"] == {"name": ["foo bar"]}
 886 | 
 887 | 
 888 | def test_plaintext_p_whitespace():
 889 |     result = parse_fixture("plaintext_p_whitespace.html")
 890 |     assert result["items"][0]["properties"]["content"][0]["value"] == "foo\nbar baz"
 891 |     assert result["items"][1]["properties"]["content"][0]["value"] == "foo\nbar baz"
 892 |     assert result["items"][2]["properties"]["content"][0]["value"] == "foo bar\nbaz"
 893 | 
 894 | 
 895 | def test_plaintext_img_whitespace():
 896 |     result = parse_fixture("plaintext_img_whitespace.html")
 897 |     assert (
 898 |         result["items"][0]["properties"]["content"][0]["value"]
 899 |         == "selfie At some tourist spot"
 900 |     )
 901 |     assert (
 902 |         result["items"][1]["properties"]["content"][0]["value"]
 903 |         == "At another tourist spot"
 904 |     )
 905 |     assert (
 906 |         result["items"][2]["properties"]["content"][0]["value"]
 907 |         == "https://example.com/photo.jpg At yet another tourist spot"
 908 |     )
 909 | 
 910 | 
 911 | def test_photo_with_alt():
 912 |     """Confirm that alt text in img is parsed as a u-* property and implied photo"""
 913 | 
 914 |     path = "img_with_alt.html"
 915 | 
 916 |     result = parse_fixture(path)
 917 | 
 918 |     with open(os.path.join(TEST_DIR, path)) as f:
 919 |         exp_result = Parser(doc=f, html_parser="html5lib").to_dict()
 920 | 
 921 |     # simple img with u-*
 922 |     assert "/photo.jpg" == result["items"][0]["properties"]["photo"][0]
 923 |     assert "/photo.jpg" == exp_result["items"][0]["properties"]["photo"][0]
 924 | 
 925 |     assert {"alt": "alt text", "value": "/photo.jpg"} == result["items"][1][
 926 |         "properties"
 927 |     ]["url"][0]
 928 |     assert "/photo.jpg" == exp_result["items"][1]["properties"]["url"][0]["value"]
 929 |     assert "alt text" == exp_result["items"][1]["properties"]["url"][0]["alt"]
 930 | 
 931 |     assert {"alt": "", "value": "/photo.jpg"} == result["items"][2]["properties"][
 932 |         "in-reply-to"
 933 |     ][0]
 934 |     assert (
 935 |         "/photo.jpg" == exp_result["items"][2]["properties"]["in-reply-to"][0]["value"]
 936 |     )
 937 |     assert "" == exp_result["items"][2]["properties"]["in-reply-to"][0]["alt"]
 938 | 
 939 |     # img with u-* and h-* example
 940 |     assert "h-cite" in result["items"][3]["properties"]["in-reply-to"][0]["type"]
 941 |     assert (
 942 |         "/photo.jpg"
 943 |         == result["items"][3]["properties"]["in-reply-to"][0]["properties"]["photo"][0]
 944 |     )
 945 |     assert "/photo.jpg" == result["items"][3]["properties"]["in-reply-to"][0]["value"]
 946 |     assert "alt" not in result["items"][3]["properties"]["in-reply-to"][0]
 947 | 
 948 |     assert "h-cite" in exp_result["items"][3]["properties"]["in-reply-to"][0]["type"]
 949 |     assert (
 950 |         "/photo.jpg"
 951 |         == exp_result["items"][3]["properties"]["in-reply-to"][0]["properties"][
 952 |             "photo"
 953 |         ][0]
 954 |     )
 955 |     assert (
 956 |         "/photo.jpg" == exp_result["items"][3]["properties"]["in-reply-to"][0]["value"]
 957 |     )
 958 |     assert "alt" not in exp_result["items"][3]["properties"]["in-reply-to"][0]
 959 | 
 960 |     assert "h-cite" in result["items"][4]["properties"]["in-reply-to"][0]["type"]
 961 |     assert {"alt": "alt text", "value": "/photo.jpg"} == result["items"][4][
 962 |         "properties"
 963 |     ]["in-reply-to"][0]["properties"]["photo"][0]
 964 |     assert "/photo.jpg" == result["items"][4]["properties"]["in-reply-to"][0]["value"]
 965 |     assert "alt" in result["items"][4]["properties"]["in-reply-to"][0]
 966 | 
 967 |     assert "h-cite" in exp_result["items"][4]["properties"]["in-reply-to"][0]["type"]
 968 |     assert (
 969 |         "/photo.jpg"
 970 |         == exp_result["items"][4]["properties"]["in-reply-to"][0]["properties"][
 971 |             "photo"
 972 |         ][0]["value"]
 973 |     )
 974 |     assert (
 975 |         "/photo.jpg" == exp_result["items"][4]["properties"]["in-reply-to"][0]["value"]
 976 |     )
 977 |     assert (
 978 |         "alt text"
 979 |         == exp_result["items"][4]["properties"]["in-reply-to"][0]["properties"][
 980 |             "photo"
 981 |         ][0]["alt"]
 982 |     )
 983 |     assert "alt text" == exp_result["items"][4]["properties"]["in-reply-to"][0]["alt"]
 984 | 
 985 |     assert "h-cite" in result["items"][5]["properties"]["in-reply-to"][0]["type"]
 986 |     assert {"alt": "", "value": "/photo.jpg"} == result["items"][5]["properties"][
 987 |         "in-reply-to"
 988 |     ][0]["properties"]["photo"][0]
 989 |     assert "/photo.jpg" == result["items"][5]["properties"]["in-reply-to"][0]["value"]
 990 |     assert "alt" in result["items"][5]["properties"]["in-reply-to"][0]
 991 | 
 992 |     assert "h-cite" in exp_result["items"][5]["properties"]["in-reply-to"][0]["type"]
 993 |     assert (
 994 |         "/photo.jpg"
 995 |         == exp_result["items"][5]["properties"]["in-reply-to"][0]["properties"][
 996 |             "photo"
 997 |         ][0]["value"]
 998 |     )
 999 |     assert (
1000 |         "/photo.jpg" == exp_result["items"][5]["properties"]["in-reply-to"][0]["value"]
1001 |     )
1002 |     assert (
1003 |         ""
1004 |         == exp_result["items"][5]["properties"]["in-reply-to"][0]["properties"][
1005 |             "photo"
1006 |         ][0]["alt"]
1007 |     )
1008 |     assert "" == exp_result["items"][5]["properties"]["in-reply-to"][0]["alt"]
1009 | 
1010 | 
1011 | def test_photo_with_srcset():
1012 |     result = parse_fixture("img_with_srcset.html")
1013 | 
1014 |     assert result["items"][0]["properties"]["photo"][0]["srcset"] == {
1015 |         "480w": "elva-fairy-480w.jpg",
1016 |         "800w": "elva-fairy-800w.jpg",
1017 |     }
1018 |     assert result["items"][1]["properties"]["photo"][0]["srcset"] == {
1019 |         "1x": "elva-fairy-320w.jpg",
1020 |         "1.5x": "elva-fairy-480w.jpg",
1021 |         "2x": "elva-fairy-640w.jpg",
1022 |     }
1023 |     assert (
1024 |         result["items"][1]["properties"]["photo"][0]["srcset"]["2x"]
1025 |         != "elva-fairy-2w.jpg"
1026 |     )
1027 |     for i in range(2, 7):
1028 |         assert result["items"][i]["properties"]["photo"][0]["srcset"] == {
1029 |             "1x": "elva-fairy,320w.jpg",
1030 |             "1.5x": "elva-fairy,480w.jpg",
1031 |         }
1032 |     assert result["items"][7]["properties"]["photo"][0]["srcset"] == {
1033 |         "1x": "elva-fairy,320w.jpg",
1034 |     }
1035 |     assert result["items"][8]["properties"]["photo"][0]["srcset"] == {
1036 |         "1x": "elva-fairy,320w.jpg",
1037 |         "1.5x": "elva-fairy,480w.jpg",
1038 |         "2x": "elva-fairy,640w.jpg",
1039 |     }
1040 | 
1041 |     result = parse_fixture("img_with_srcset_with_base.html")
1042 | 
1043 |     assert result["items"][0]["properties"]["photo"][0]["srcset"] == {
1044 |         "480w": "https://example.com/elva-fairy-480w.jpg",
1045 |         "800w": "https://example.com/elva-fairy-800w.jpg",
1046 |     }
1047 | 
1048 | 
1049 | def test_parse_id():
1050 |     result = parse_fixture("parse_id.html")
1051 |     assert "recentArticles" == result["items"][0]["id"]
1052 |     assert "article" == result["items"][0]["children"][0]["id"]
1053 |     assert "id" not in result["items"][0]["children"][1]
1054 |     assert "theAuthor" == result["items"][0]["properties"]["author"][0]["id"]
1055 | 
1056 | 
1057 | # unicode tests
1058 | 
1059 | 
1060 | def get_all_files():
1061 |     all_files = []
1062 | 
1063 |     for dir_, _, files in os.walk(TEST_DIR):
1064 |         for filename in files:
1065 |             rel_dir = os.path.relpath(dir_, TEST_DIR)
1066 |             all_files.append(os.path.join(rel_dir, filename))
1067 | 
1068 |     return all_files
1069 | 
1070 | 
1071 | def assert_unicode_everywhere(obj):
1072 |     if isinstance(obj, dict):
1073 |         for k, v in obj.items():
1074 |             assert not isinstance(k, bytes), "key=%r; type=%r" % (k, type(k))
1075 |             assert_unicode_everywhere(v)
1076 |     elif isinstance(obj, list):
1077 |         for v in obj:
1078 |             assert_unicode_everywhere(v)
1079 | 
1080 |     assert not isinstance(obj, bytes), "value=%r; type=%r" % (obj, type(obj))
1081 | 
1082 | 
1083 | def check_unicode(filename, jsonblob):
1084 |     assert_unicode_everywhere(jsonblob)
1085 | 
1086 | 
1087 | def test_unicode_everywhere():
1088 |     """make sure everything is unicode"""
1089 | 
1090 |     for h in get_all_files():
1091 |         result = parse_fixture(h)
1092 |         check_unicode(h, result)
1093 | 
1094 | 
1095 | def test_input_tree_integrity():
1096 |     """make sure that if we parse a BS4 soup, our modifications do not leak into the document represented by it"""
1097 | 
1098 |     for path in get_all_files():
1099 |         with open(os.path.join(TEST_DIR, path)) as f:
1100 |             soup = BeautifulSoup(f, features="lxml")
1101 |             html1 = soup.prettify()
1102 |             p = Parser(doc=soup, html_parser="lxml")
1103 |             html2 = soup.prettify()
1104 |         make_labelled_cmp("tree_integrity_" + path)(html1, html2)
1105 | 
1106 | 
1107 | def make_labelled_cmp(label):
1108 |     def f(html1, html2):
1109 |         assert html1 == html2
1110 | 
1111 |     f.description = label
1112 |     return f
1113 | 
1114 | 
1115 | def test_all_u_cases():
1116 |     """test variations of u- parsing and that relative urls are always resolved"""
1117 | 
1118 |     URL_COUNT = 28
1119 |     result = parse_fixture("u_all_cases.html")
1120 | 
1121 |     assert URL_COUNT == len(result["items"][0]["properties"]["url"])
1122 |     for i in range(URL_COUNT):
1123 |         make_labelled_cmp("all_u_cases_" + str(i))(
1124 |             "http://example.com/test", result["items"][0]["properties"]["url"][i]
1125 |         )
1126 | 
1127 | 
1128 | def test_filtered_roots():
1129 |     result = parse_fixture("filter_roots.html")
1130 |     assert len(result["items"]) == 8
1131 | 
1132 |     result = parse_fixture("filter_roots.html", filter_roots=True)
1133 |     assert len(result["items"]) == 1
1134 | 
1135 |     result = parse_fixture(
1136 |         "filter_roots_custom.html", filter_roots={"foo", "bar", "bat", "baz"}
1137 |     )
1138 |     assert len(result["items"]) == 1
1139 | 
1140 | 
1141 | def test_metaformats_flag_false():
1142 |     result = parse_fixture("metaformats_ogp.html")
1143 |     assert result["items"] == []
1144 | 
1145 | 
1146 | def test_metaformats_title_only():
1147 |     result = parse_fixture("base.html", metaformats=True)
1148 |     assert result["items"] == [
1149 |         {
1150 |             "type": ["h-entry"],
1151 |             "properties": {
1152 |                 "name": ["Hello World"],
1153 |             },
1154 |             "source": "metaformats",
1155 |         }
1156 |     ]
1157 | 
1158 | 
1159 | def test_metaformats_ogp():
1160 |     result = parse_fixture("metaformats_ogp.html", metaformats=True)
1161 |     assert result["items"] == [
1162 |         {
1163 |             "type": ["h-entry"],
1164 |             "properties": {
1165 |                 "name": ["Titull foo"],
1166 |                 "summary": ["Descrypshun bar"],
1167 |                 "photo": ["http://example.com/baz.jpg"],
1168 |                 "audio": ["http://example.com/biff.mp3"],
1169 |                 "video": ["http://example.com/boff.mov"],
1170 |                 "author": ["http://tantek.com/me"],
1171 |                 "published": ["2023-01-02T03:04Z"],
1172 |                 "updated": ["2023-01-02T05:06Z"],
1173 |             },
1174 |             "source": "metaformats",
1175 |         }
1176 |     ]
1177 | 
1178 | 
1179 | def test_metaformats_twitter():
1180 |     result = parse_fixture("metaformats_twitter.html", metaformats=True)
1181 |     assert result["items"] == [
1182 |         {
1183 |             "type": ["h-entry"],
1184 |             "properties": {
1185 |                 "name": ["Titull foo"],
1186 |                 "summary": ["Descrypshun bar"],
1187 |                 "photo": ["http://tantek.com/baz.jpg"],
1188 |             },
1189 |             "source": "metaformats",
1190 |         }
1191 |     ]
1192 | 
1193 | 
1194 | def test_metaformats_html_meta():
1195 |     result = parse_fixture("metaformats_html_meta.html", metaformats=True)
1196 |     assert result["items"] == [
1197 |         {
1198 |             "type": ["h-entry"],
1199 |             "properties": {
1200 |                 "name": ["Hello World"],
1201 |                 "summary": ["Descrypshun bar"],
1202 |             },
1203 |             "source": "metaformats",
1204 |         }
1205 |     ]
1206 | 
1207 | 
1208 | def test_language():
1209 |     result = parse_fixture("language.html")
1210 |     assert result["items"][0]["lang"] == "it"
1211 |     assert result["items"][1]["lang"] == "it"
1212 |     assert result["items"][1]["properties"]["content"][0]["lang"] == "en"
1213 |     assert result["items"][1]["properties"]["content"][1]["lang"] == "it"
1214 |     assert result["items"][2]["lang"] == "sv"
1215 |     assert result["items"][2]["properties"]["content"][0]["lang"] == "en"
1216 |     assert result["items"][2]["properties"]["content"][1]["lang"] == "sv"
1217 | 
1218 | 
1219 | def test_parser_object():
1220 |     with open(os.path.join(TEST_DIR, "festivus.html")) as f:
1221 |         p = Parser(doc=f)
1222 |     assert len(p.to_dict(filter_by_type="h-card")) == 3
1223 |     assert len(p.to_dict(filter_by_type="h-entry")) == 4
1224 |     assert (
1225 |         p.to_json(filter_by_type="h-card")
1226 |         == '[{"type": ["h-card"], "properties": {"name": ["Jerry"]}}, {"type": '
1227 |         '["h-card"], "properties": {"name": ["Frank"]}}, {"type": ["h-card"], '
1228 |         '"properties": {"name": ["Cosmo"]}}]'
1229 |     )
1230 | 


--------------------------------------------------------------------------------
/test/test_suite.py:
--------------------------------------------------------------------------------
 1 | import glob
 2 | import json
 3 | import os.path
 4 | import sys
 5 | 
 6 | from test_parser import check_unicode
 7 | 
 8 | import mf2py
 9 | 
10 | 
11 | def test_mf2tests():
12 |     allfiles = glob.glob(os.path.join(".", "testsuite", "tests", "*", "*", "*.json"))
13 |     for jsonfile in allfiles:
14 |         htmlfile = jsonfile[:-4] + "html"
15 |         with open(htmlfile) as f:
16 |             p = mf2py.parse(doc=f, url="http://example.com")
17 |             check_unicode(htmlfile, p)
18 |         with open(jsonfile) as jsonf:
19 |             try:
20 |                 s = json.load(jsonf)
21 |             except:
22 |                 s = "bad file: " + jsonfile + sys.exc_info()[0]
23 |         check_mf2(htmlfile, p, s)
24 | 
25 | 
26 | def check_mf2(htmlfile, p, s):
27 |     # TODO ignore extra keys in p that are not in s
28 |     assert p == s
29 | 


--------------------------------------------------------------------------------