or open/close
149 | length = len(results)
150 | for i in range(0, length):
151 | if results[i] == " " and (
152 | i == 0
153 | or i == length - 1
154 | or results[i - 1] == " "
155 | or results[i - 1] in (P_BREAK_BEFORE, P_BREAK_AFTER)
156 | or results[i + 1] == " "
157 | or results[i + 1] in (P_BREAK_BEFORE, P_BREAK_AFTER)
158 | ):
159 | results[i] = ""
160 |
161 | if results:
162 | # remove leading whitespace and <br> i.e. newlines
163 | while (
164 | isinstance(results[0], str) and (results[0] == "" or results[0].isspace())
165 | ) or results[0] in (P_BREAK_BEFORE, P_BREAK_AFTER):
166 | results.pop(0)
167 | if not results:
168 | break
169 |
170 | if results:
171 | # remove trailing whitespace and <br> i.e. newlines
172 | while (
173 | isinstance(results[-1], str)
174 | and (results[-1] == "" or results[-1].isspace())
175 | ) or results[-1] in (P_BREAK_BEFORE, P_BREAK_AFTER):
176 | results.pop(-1)
177 | if not results:
178 | break
179 |
180 | # trim leading and trailing non-<br> whitespace
181 | if results:
182 | if isinstance(results[0], str):
183 | results[0] = results[0].lstrip()
184 | if isinstance(results[-1], str):
185 | results[-1] = results[-1].rstrip()
186 |
187 | # create final string by concatenating, replacing each consecutive sequence of <br> by the largest value number of \n
188 | text = ""
189 | count = 0
190 | last = None
191 | for t in results:
192 | if t in (P_BREAK_BEFORE, P_BREAK_AFTER):
193 | count = max(t, count)
194 | elif t == PRE_BEFORE:
195 | text = text.rstrip(" ")
196 | elif not isinstance(t, int):
197 | if count or last == "\n":
198 | t = t.lstrip(" ")
199 | text = "".join([text, "\n" * count, t])
200 | count = 0
201 | last = t
202 |
203 | return text
204 |
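The tail of the serializer above joins the intermediate `results` list into the final string: plain string fragments are concatenated, while each consecutive run of integer break markers collapses into a newline run as long as the largest marker value. A minimal standalone sketch of that joining step, with assumed marker values (the real P_BREAK_* constants live elsewhere in this module):

```python
# Standalone sketch of the joining step; the values below are assumptions
# for illustration, not the library's actual constants.
P_BREAK_BEFORE = 2
P_BREAK_AFTER = 2

def join_with_breaks(results):
    text = ""
    count = 0
    for t in results:
        if isinstance(t, int):
            # a run of adjacent markers collapses to the single largest value
            count = max(t, count)
        else:
            if count:
                t = t.lstrip(" ")
            text = "".join([text, "\n" * count, t])
            count = 0
    return text

print(repr(join_with_breaks(["Hello", P_BREAK_AFTER, P_BREAK_BEFORE, "World"])))
# -> 'Hello\n\nWorld', i.e. a paragraph-style double newline
```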
--------------------------------------------------------------------------------
/mf2py/implied_properties.py:
--------------------------------------------------------------------------------
1 | from . import mf2_classes
2 | from .dom_helpers import get_attr, get_children, get_img, get_textContent, try_urljoin
3 |
4 |
5 | def name(el, base_url, filtered_roots):
6 | """Find an implied name property
7 |
8 | Args:
9 | el (bs4.element.Tag): a DOM element
10 |
11 | Returns:
12 | string: the implied name value
13 | """
14 |
15 | def non_empty(val):
16 | """If alt or title is empty, we don't want to use it as the implied
17 | name"""
18 | return val is not None and val != ""
19 |
20 | # if image or area use alt text if not empty
21 | prop_value = get_attr(el, "alt", check_name=("img", "area"))
22 | if non_empty(prop_value):
23 | return prop_value
24 |
25 | # if abbreviation use the title if not empty
26 | prop_value = get_attr(el, "title", check_name="abbr")
27 | if non_empty(prop_value):
28 | return prop_value
29 |
30 | # find candidate child or grandchild
31 | poss_child = None
32 | children = list(get_children(el))
33 | if len(children) == 1:
34 | poss_child = children[0]
35 |
36 | # ignore if mf2 root
37 | if mf2_classes.root(poss_child.get("class", []), filtered_roots):
38 | poss_child = None
39 |
40 | # if it is not img, area, abbr then find grandchild
41 | if poss_child and poss_child.name not in ("img", "area", "abbr"):
42 | grandchildren = list(get_children(poss_child))
43 | # if only one grandchild
44 | if len(grandchildren) == 1:
45 | poss_child = grandchildren[0]
46 | # if it is not img, area, abbr or is mf2 root then no possible child
47 | if poss_child.name not in ("img", "area", "abbr") or mf2_classes.root(
48 | poss_child.get("class", []), filtered_roots
49 | ):
50 | poss_child = None
51 |
52 | # if a possible child was found
53 | if poss_child is not None:
54 | # use alt if possible child is img or area
55 | prop_value = get_attr(poss_child, "alt", check_name=("img", "area"))
56 | if non_empty(prop_value):
57 | return prop_value
58 |
59 | # use title if possible child is abbr
60 | prop_value = get_attr(poss_child, "title", check_name="abbr")
61 | if non_empty(prop_value):
62 | return prop_value
63 |
64 | # use text if all else fails
65 | # replace images with alt but not with src in implied name
66 | # proposal: https://github.com/microformats/microformats2-parsing/issues/35#issuecomment-393615508
67 | return get_textContent(el, replace_img=True, img_to_src=False, base_url=base_url)
68 |
69 |
70 | def photo(el, base_url, filtered_roots):
71 | """Find an implied photo property
72 |
73 | Args:
74 | el (bs4.element.Tag): a DOM element
75 | base_url (string): the base URL to use, to reconcile relative URLs
76 |
77 | Returns:
78 | string or dictionary: the implied photo value or implied photo as a dictionary with alt value
79 | """
80 |
81 | def get_photo_child(children):
82 | "take a list of children and finds a valid child for photo property"
83 |
84 | # if element has one image child use source if exists and img is
85 | # not root class
86 | poss_imgs = [c for c in children if c.name == "img"]
87 | if len(poss_imgs) == 1:
88 | poss_img = poss_imgs[0]
89 | if not mf2_classes.root(poss_img.get("class", []), filtered_roots):
90 | return poss_img
91 |
92 | # if element has one object child use data if exists and object is
93 | # not root class
94 | poss_objs = [c for c in children if c.name == "object"]
95 | if len(poss_objs) == 1:
96 | poss_obj = poss_objs[0]
97 | if not mf2_classes.root(poss_obj.get("class", []), filtered_roots):
98 | return poss_obj
99 |
100 | def resolve_relative_url(prop_value):
101 | if isinstance(prop_value, dict):
102 | prop_value["value"] = try_urljoin(base_url, prop_value["value"])
103 | else:
104 | prop_value = try_urljoin(base_url, prop_value)
105 | return prop_value
106 |
107 | # if element is an img use source if exists
108 | if prop_value := get_img(el, base_url):
109 | return resolve_relative_url(prop_value)
110 |
111 | # if element is an object use data if exists
112 | if prop_value := get_attr(el, "data", check_name="object"):
113 | return resolve_relative_url(prop_value)
114 |
115 | # find candidate child or grandchild
116 | poss_child = None
117 | children = list(get_children(el))
118 |
119 | poss_child = get_photo_child(children)
120 |
121 | # if no possible child found then look for grandchild if only one child which is not mf2 root
122 | if (
123 | poss_child is None
124 | and len(children) == 1
125 | and not mf2_classes.root(children[0].get("class", []), filtered_roots)
126 | ):
127 | grandchildren = list(get_children(children[0]))
128 | poss_child = get_photo_child(grandchildren)
129 |
130 | # if a possible child was found parse
131 | if poss_child is not None:
132 | # img get src
133 | if prop_value := get_img(poss_child, base_url):
134 | return resolve_relative_url(prop_value)
135 |
136 | # object get data
137 | if prop_value := get_attr(poss_child, "data", check_name="object"):
138 | return resolve_relative_url(prop_value)
139 |
140 |
141 | def url(el, base_url, filtered_roots):
142 | """Find an implied url property
143 |
144 | Args:
145 | el (bs4.element.Tag): a DOM element
146 | base_url (string): the base URL to use, to reconcile relative URLs
147 |
148 | Returns:
149 | string: the implied url value
150 | """
151 |
152 | def get_url_child(children):
153 | "take a list of children and finds a valid child for url property"
154 |
155 | # if element has one <a> child use it if not root class
156 | poss_as = [c for c in children if c.name == "a"]
157 | if len(poss_as) == 1:
158 | poss_a = poss_as[0]
159 | if not mf2_classes.root(poss_a.get("class", []), filtered_roots):
160 | return poss_a
161 |
162 | # if element has one area child use if not root class
163 | poss_areas = [c for c in children if c.name == "area"]
164 | if len(poss_areas) == 1:
165 | poss_area = poss_areas[0]
166 | if not mf2_classes.root(poss_area.get("class", []), filtered_roots):
167 | return poss_area
168 |
169 | # if element is a or area use its href if exists
170 | prop_value = get_attr(el, "href", check_name=("a", "area"))
171 | if prop_value is not None: # an empty href is valid
172 | return try_urljoin(base_url, prop_value)
173 |
174 | # find candidate child or grandchild
175 | poss_child = None
176 | children = list(get_children(el))
177 |
178 | poss_child = get_url_child(children)
179 |
180 | # if no possible child found then look for grandchild if only one child which is not mf2 root
181 | if (
182 | poss_child is None
183 | and len(children) == 1
184 | and not mf2_classes.root(children[0].get("class", []), filtered_roots)
185 | ):
186 | grandchildren = list(get_children(children[0]))
187 | poss_child = get_url_child(grandchildren)
188 |
189 | # if a possible child was found parse
190 | if poss_child is not None:
191 | prop_value = get_attr(poss_child, "href", check_name=("a", "area"))
192 | if prop_value is not None: # an empty href is valid
193 | return try_urljoin(base_url, prop_value)
194 |
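As a usage sketch of these implied-property rules (the markup and URLs are invented for illustration): a bare h-card whose only child is an img picks up an implied name from the alt text and an implied photo from the src, and, because the root is an <a>, an implied url from its href:

```python
import mf2py

# hypothetical markup: an h-card with no explicit p-*/u-* properties
html = '<a class="h-card" href="/jane"><img src="/jane.jpg" alt="Jane Doe"></a>'
props = mf2py.parse(doc=html, url="http://example.com/")["items"][0]["properties"]

print(props["name"])   # ['Jane Doe'] -- implied from the img alt
print(props["url"])    # ['http://example.com/jane'] -- implied from the a href
print(props["photo"])  # implied from the img src (kept together with its alt),
                       # resolved against the base URL
```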
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # Change Log
2 | All notable changes to this project will be documented in this file.
3 |
4 | ## 2.0.1 - 2023-12-07
5 | The mf2py library is excited to transition into 2.0. This version increase incorporates months of work from contributors, informed by active discussions among implementers and users.
6 |
7 | This release officially deprecates support for versions of Python lower than 3.8.
8 |
9 | Below are the changes we have made in this release.
10 |
11 | ### New Features
12 | - Enable `img_with_alt` by default (#184)
13 | - Add timezone offset normalisation (#206)
14 | - Add option for exposing DOM for embedded properties (#208)
15 | - Add srcset support (#209)
16 | - Add language support (#210)
17 | - Add option for filtering root class names (#211)
18 | - Add option for metaformats support (#213)
19 |
20 | ### Changes
21 | - Remove `img_with_alt` option entirely (#200)
22 | - Resolve implied photo relative paths (#205)
23 | - Make relative URLs in embedded properties absolute (#201)
24 | - Fix whitespace in plaintext conversion (#207)
25 | - Replace `dict_class` with standard `dict` (#196)
26 |
27 | ### Tests, Library and Documentation Maintenance
28 | - Update tests to include alt texts by default (#190)
29 | - Add Windows and macOS tests (#198)
30 | - Use poetry for dependency management (#189)
31 | - Deprecate Python 2 support (#179)
32 | - Lint code with `black` and `isort`
33 | - Add linting CI actions (#193)
34 | - Move from `nosetests` to `pytest` (#186)
35 | - Add 3.11, 3.12 and drop pypy from test matrix; upgrade poetry action (#204)
36 | - Prepare tests to test options (#214)
37 | - Bring README doctests up-to-date (#215)
38 |
39 | ## 1.1.3 - 2022-06-28
40 | - reduce instances where photo is implied (#135)
41 | - always do relative URL resolution (#138)
42 | - VCP now handles tz offsets without leading zeros (#142)
43 | - implement id parsing (#143)
44 | - fix outdated syntax causing SyntaxWarning (#157)
45 |
46 | ## 1.1.2 - 2018-08-08
47 | - add parsing for iframe.u-*[src] (#116)
48 | - bug fix: reduced implied urls (#117)
49 | - bug fix: don't collapse whitespace between tags
50 | - specify explicit versions for dependencies
51 | - revert BeautifulSoup copying added in 1.1.1 due to bugs (eg #108)
52 | - misc performance improvements
53 |
54 | ## 1.1.1 - 2018-06-15
55 | - streamline backcompat to use JSON only.
56 | - fix multiple mf1 root rel-tag parsing
57 | - correct url and photo for hreview.
58 | - add rules for nested hreview. update backcompat to use multiple matches in old properties.
59 | - fix `rel-tag` to `p-category` conversion so that other classes are not lost.
60 | - use original authored html for `e-*` parsing in backcompat
61 | - make classes and rels into unordered (alphabetically ordered) deduped arrays.
62 | - only use class names for mf2 which follow the naming rules
63 | - fix `parse` method to use default html parser.
64 | - always use the first value for attributes for rels.
65 | - correct AM/PM conversion in datetime value class pattern.
66 | - add ordinal date parsing to datetimes value class pattern. ordinal date is normalised to YYYY-MM-DD
67 | - remove hack for html tag classes since that is fixed in new BS
68 | - better whitespace algorithm for `name` and `html.value` parsing
69 | - experimental flag for including `alt` in `u-photo` parsing
70 | - make a copy of the BeautifulSoup given by user to work on for parsing to prevent changes to original doc
71 | - bump version to 1.1.1
72 |
73 | ## 1.1.0 - 2018-03-16
74 | - bump version to 1.1.0 since it is a "major" change
75 | - added tests for new implied name rules
76 | - modified earlier tests to accommodate new rules
77 | - use space separator instead of "T"
78 | - Don't add "00" seconds unless authored
79 | - use TZ authored in separate `value` element
80 | - only use first found `value` of a particular type `date`, `time`, or `timezone`.
81 | - move backcompat rules into JSON files
82 | - reorganise value class pattern parsing into new files
83 | - add datetime_helpers to organise datetime parsing rules
84 | - reorganise tests
85 | - remove Heroku frontend, point to mf2py-web and python.microformats.io instead in README.
86 | - remove Flask and gunicorn requirements
87 | - add debug info with description, version, url and the html parser used
88 |
89 | ## 1.0.6 - 2018-03-04
90 | - strip leading/trailing white space for `e-*[html]`. update the corresponding tests
91 | - blank values explicitly authored are allowed as property values
92 | - include `alt` or `src` from `<img>` in parsing for `p-*` and `e-*[value]`
93 | - parse `title` from `<abbr>` for `p-*` resolves #84
94 | - and `poster` from `<video>` for `u-*` resolves #76
95 | - use `html5lib` as default parser
96 | - use the final redirect URL resolves #62
97 | - update requirements to use BS4 v4.6.0 and html5lib v1.0.1
98 | - drop support for Python 2.6 as html5lib dropped support
99 |
100 | ## 1.0.5 - 2016-05-09
101 | - Implied property checks now ignore alt="", treating it the same as
102 | if no alt value is defined.
103 | - Support for using a custom dict implementation by setting
104 | mf2py.Parser.dict_class. collections.OrderedDict yields much nicer
105 | output for hosted parsers.
106 |
107 | ## 1.0.4 - 2016-03-21
108 | - Performance improvement changing simple calls to soup.find_all to
109 | a manual iteration over .contents.
110 |
111 | ## 1.0.3 - 2016-02-05
112 | - Performance improvement by limiting number of calls to soup.find_all
113 | in backcompat module. Should not be any functional changes.
114 |
115 | ## 1.0.2 - 2016-01-26
116 | - Backward compatibility parsing for rel=tag properties. These are now converted
117 | to p-category based on the last path segment of the tag URI as spec'd in
118 | http://microformats.org/wiki/h-entry#Parser_Compatibility
119 | - Optional property html_parser to specify the html parser that BeautifulSoup
120 | should use (e.g., "lxml" or "html5lib")
121 |
122 | ## 1.0.1 - 2015-12-11
123 | - `u-*` properties are now parsed from `<link>` elements per the updated spec
124 | http://microformats.org/wiki/microformats2-parsing-issues#link_elements_and_u-_parsing
125 |
126 | ## 1.0.0 - 2015-10-05
127 | - Version number bumped to 1.0.0 following community discussion.
128 |
129 | ## 0.2.8 - 2015-09-21
130 | - Stricter checks that Parser.__init__ params are actually None before
131 | ignoring them.
132 |
133 | ## 0.2.7 - 2015-08-03
134 | - Now produces unicode strings for every key and value, no more byte
135 | strings anywhere.
136 | - Do not add 'T' between date and time when normalizing dates
137 | - Unit tests for running the microformats test suite
138 |
139 | ## 0.2.6 - 2015-05-06
140 | - New top-level "rel-urls" entry, contains rich data parsed from rel
141 | links, organized by URL.
142 |
143 | ## 0.2.5 - 2015-03-01
144 | - convenience method `mf2py.parse` that takes the same arguments as Parser
145 | and returns a dict.
146 | - nested h-* classes now parse their "value" based on the property
147 | they represent (p-*, u-*, dt-*), so for example "p-in-reply-to
148 | h-cite" would have a name as its value and "u-in-reply-to h-cite"
149 | will have a URL.
150 |
151 | ## 0.2.4 - 2015-02-13
152 | - Add rel=bookmark to backward compat parsing rules (translated
153 | to u-url in mf2)
154 | - Parser constructor now takes explicit named arguments instead of
155 | **kwargs, for saner behavior when called with unnamed arguments.
156 | - Bugfix: Empty href="" attributes are now properly interpreted as
157 | the current document's URL.
158 |
159 | ## 0.2.3 - 2015-02-07
160 | - Minor Py3 compatibility fix
161 | - Correct typo `test_requires` -> `tests_require` in setup.py
162 |
163 | ## 0.2.2 - 2015-02-05
164 | - Started keeping a changelog!
165 | - Use a better method for extracting HTML for an e-* property
166 | - Correct BeautifulSoup4 dependency in setup.py to fix error with
167 | installation from PyPI.
168 | - Buffed up docstrings for public methods.
169 |
--------------------------------------------------------------------------------
/mf2py/parser.py:
--------------------------------------------------------------------------------
1 | import copy
2 | import json
3 | from urllib.parse import urlparse
4 |
5 | import requests
6 | from bs4 import BeautifulSoup, FeatureNotFound
7 | from bs4.element import Tag
8 |
9 | from . import (
10 | backcompat,
11 | implied_properties,
12 | metaformats,
13 | mf2_classes,
14 | parse_property,
15 | temp_fixes,
16 | )
17 | from .dom_helpers import get_attr, get_children, get_descendents, try_urljoin
18 | from .mf_helpers import unordered_list
19 | from .version import __version__
20 |
21 |
22 | def parse(
23 | doc=None,
24 | url=None,
25 | html_parser=None,
26 | expose_dom=False,
27 | metaformats=False,
28 | filter_roots=False,
29 | ):
30 | """
31 | Parse a document or URL for microformats and return a dictionary in mf2json format.
32 |
33 | Args:
34 | doc (file, string or BeautifulSoup doc): file handle, text of content
35 | to parse, or BeautifulSoup document. If None it will be fetched from
36 | given URL.
37 | url (string): URL of the file to be processed. If None it will be
38 | extracted from the `<base>` element of given doc.
39 | html_parser (string): optional, select a specific HTML parser. Valid options
40 | from the BeautifulSoup documentation are: "html", "xml", "html5", "lxml",
41 | "html5lib", and "html.parser".
42 | expose_dom (boolean): optional, expose the DOM of embedded properties.
43 | metaformats (boolean): optional, include metaformats extracted from OGP
44 | and Twitter card data: https://microformats.org/wiki/metaformats
45 | filter_roots (boolean or list): optional, filter root class names. Use
46 | True to filter known conflicting classes, otherwise filter given list.
47 |
48 | Return: a mf2json dict representing the structured data in the document
49 |
50 | """
51 | return Parser(
52 | doc,
53 | url,
54 | html_parser,
55 | expose_dom=expose_dom,
56 | metaformats=metaformats,
57 | filter_roots=filter_roots,
58 | ).to_dict()
59 |
60 |
61 | class Parser(object):
62 | """
63 | Parser to parse a document or URL for microformats and output in various formats.
64 |
65 | Args:
66 | doc (file, string or BeautifulSoup doc): file handle, text of content
67 | to parse, or BeautifulSoup document. If None it will be fetched from
68 | given URL.
69 | url (string): URL of the file to be processed. If None it will be
70 | extracted from the `<base>` element of given doc.
71 | html_parser (string): optional, select a specific HTML parser. Valid options
72 | from the BeautifulSoup documentation are: "html", "xml", "html5", "lxml",
73 | "html5lib", and "html.parser".
74 | expose_dom (boolean): optional, expose the DOM of embedded properties.
75 | metaformats (boolean): optional, include metaformats extracted from OGP
76 | and Twitter card data: https://microformats.org/wiki/metaformats
77 | filter_roots (boolean or list): optional, filter root class names. Use
78 | True to filter known conflicting classes, otherwise filter given list.
79 |
80 | Attributes:
81 | useragent (string): the User-Agent string for the Parser
82 |
83 | """
84 |
85 | ua_desc = "mf2py - microformats2 parser for python"
86 | ua_url = "https://github.com/microformats/mf2py"
87 | useragent = "{0} - version {1} - {2}".format(ua_desc, __version__, ua_url)
88 |
89 | def __init__(
90 | self,
91 | doc=None,
92 | url=None,
93 | html_parser=None,
94 | expose_dom=False,
95 | metaformats=False,
96 | filter_roots=False,
97 | ):
98 | self.__url__ = None
99 | self.__doc__ = None
100 | self._preserve_doc = False
101 | self.__parsed__ = {
102 | "items": [],
103 | "rels": {},
104 | "rel-urls": {},
105 | "debug": {
106 | "description": self.ua_desc,
107 | "source": self.ua_url,
108 | "version": __version__,
109 | },
110 | }
111 | self.lang = None
112 | self.expose_dom = expose_dom
113 | self.__metaformats = metaformats
114 | try:
115 | self.filtered_roots = set(filter_roots)
116 | except TypeError:
117 | if filter_roots:
118 | self.filtered_roots = mf2_classes.CONFLICTING_ROOTS_TAILWIND
119 | else:
120 | self.filtered_roots = []
121 |
122 | # use default parser if none specified
123 | self.__html_parser__ = html_parser or "html5lib"
124 |
125 | if url is not None:
126 | self.__url__ = url
127 |
128 | if doc is None:
129 | data = requests.get(
130 | self.__url__,
131 | headers={
132 | "User-Agent": self.useragent,
133 | },
134 | )
135 |
136 | # update to final URL after redirects
137 | self.__url__ = data.url
138 |
139 | # HACK: check for character encodings and use 'correct' data
140 | if "charset" in data.headers.get("content-type", ""):
141 | doc = data.text
142 | else:
143 | doc = data.content
144 |
145 | if doc is not None:
146 | if isinstance(doc, BeautifulSoup) or isinstance(doc, Tag):
147 | self.__doc__ = doc
148 | self._preserve_doc = True
149 | else:
150 | try:
151 | # try the user-given html parser or default html5lib
152 | self.__doc__ = BeautifulSoup(doc, features=self.__html_parser__)
153 | except FeatureNotFound:
154 | # requested parser not available; fall back to
155 | # BeautifulSoup's default (maybe raise a warning?)
156 | self.__doc__ = BeautifulSoup(doc)
157 |
158 | # update actual parser used
159 | # uses builder.NAME from BeautifulSoup
160 | if isinstance(self.__doc__, BeautifulSoup) and self.__doc__.builder is not None:
161 | self.__html_parser__ = self.__doc__.builder.NAME
162 | else:
163 | self.__html_parser__ = None
164 |
165 | # check for <base> tag
166 | if self.__doc__:
167 | poss_base = next(
168 | (el for el in get_descendents(self.__doc__) if el.name == "base"), None
169 | )
170 | if poss_base:
171 | poss_base_url = poss_base.get("href") # try to get href
172 | if poss_base_url:
173 | if urlparse(poss_base_url).netloc:
174 | # base specifies an absolute path
175 | self.__url__ = poss_base_url
176 | elif self.__url__:
177 | # base specifies a relative path
178 | self.__url__ = try_urljoin(self.__url__, poss_base_url)
179 |
180 | if self.__doc__ is not None:
181 | if document := self.__doc__.find("html"):
182 | self.lang = document.attrs.get("lang")
183 | # parse!
184 | self._parse()
185 |
186 | def _parse(self):
187 | """Does the work of actually parsing the document. Done automatically
188 | on initialization.
189 | """
190 | self._default_date = None
191 | # _default_date exists to provide implementation for rules described
192 | # in legacy value-class-pattern. basically, if you have two dt-
193 | # properties and one does not have the full date, it can use the
194 | # existing date as a template.
195 | # see value-class-pattern#microformats2_parsers on wiki.
196 | # see also the implied_relative_datetimes testcase.
197 |
198 | def handle_microformat(
199 | root_class_names,
200 | el,
201 | value_property=None,
202 | simple_value=None,
203 | backcompat_mode=False,
204 | ):
205 | """Handles a (possibly nested) microformat, i.e. h-*"""
206 | properties = {}
207 | children = []
208 | self._default_date = None
209 | # for processing implied properties: records which property types (p, e, u, d(t)) or children (h) have been parsed
210 | parsed_types_aggregation = set()
211 |
212 | if backcompat_mode:
213 | el = backcompat.apply_rules(
214 | el, self.__html_parser__, self.filtered_roots
215 | )
216 | root_class_names = mf2_classes.root(
217 | el.get("class", []), self.filtered_roots
218 | )
219 |
220 | root_lang = el.attrs.get("lang")
221 |
222 | # parse for properties and children
223 | for child in get_children(el):
224 | (
225 | child_props,
226 | child_children,
227 | child_parsed_types_aggregation,
228 | ) = parse_props(child, root_lang)
229 | for key, new_value in child_props.items():
230 | prop_value = properties.get(key, [])
231 | prop_value.extend(new_value)
232 | properties[key] = prop_value
233 | children.extend(child_children)
234 | parsed_types_aggregation.update(child_parsed_types_aggregation)
235 |
236 | # complex h-* objects can take their "value" from the
237 | # first explicit property ("name" for p-* or "url" for u-*)
238 | if value_property and value_property in properties:
239 | simple_value = properties[value_property][0]
240 |
241 | # imply any properties not found explicitly, unless in backcompat mode
242 | if not backcompat_mode:
243 | # stop implied name if any p-*, e-*, h-* is already found
244 | if "name" not in properties and parsed_types_aggregation.isdisjoint(
245 | "peh"
246 | ):
247 | properties["name"] = [
248 | implied_properties.name(el, self.__url__, self.filtered_roots)
249 | ]
250 |
251 | if "photo" not in properties and parsed_types_aggregation.isdisjoint(
252 | "uh"
253 | ):
254 | x = implied_properties.photo(el, self.__url__, self.filtered_roots)
255 | if x is not None:
256 | properties["photo"] = [x]
257 |
258 | # stop implied url if any u-* or h-* is already found
259 | if "url" not in properties and parsed_types_aggregation.isdisjoint(
260 | "uh"
261 | ):
262 | x = implied_properties.url(el, self.__url__, self.filtered_roots)
263 | if x is not None:
264 | properties["url"] = [x]
265 |
266 | # build microformat with type and properties
267 | microformat = {
268 | "type": [class_name for class_name in sorted(root_class_names)],
269 | "properties": properties,
270 | }
271 | if el.name == "area":
272 | shape = get_attr(el, "shape")
273 | if shape is not None:
274 | microformat["shape"] = shape
275 |
276 | coords = get_attr(el, "coords")
277 | if coords is not None:
278 | microformat["coords"] = coords
279 |
280 | # insert children if any
281 | if children:
282 | microformat["children"] = children
283 |
284 | Id = get_attr(el, "id")
285 | if Id:
286 | microformat["id"] = Id
287 |
288 | # simple value is the parsed property value if it were not
289 | # an h-* class
290 | if simple_value is not None:
291 | if isinstance(simple_value, dict):
292 | # for e-* properties, the simple value will be
293 | # {"html":..., "value":...} which we should fold
294 | # into the microformat object
295 | # details: https://github.com/microformats/mf2py/issues/35
296 | microformat.update(simple_value)
297 | else:
298 | microformat["value"] = simple_value
299 |
300 | if root_lang:
301 | microformat["lang"] = root_lang
302 | elif self.lang:
303 | microformat["lang"] = self.lang
304 | return microformat
305 |
306 | def parse_props(el, root_lang):
307 | """Parse the properties from a single element"""
308 | props = {}
309 | children = []
310 | # for processing implied properties: records which property types (p, e, u, d(t)) or children (h) have been parsed
311 | parsed_types_aggregation = set()
312 |
313 | classes = el.get("class", [])
314 | filtered_classes = mf2_classes.filter_classes(classes)
315 | # Is this element a microformat2 root?
316 | root_class_names = filtered_classes["h"]
317 | backcompat_mode = False
318 |
319 | # Is this element a microformat1 root?
320 | if not root_class_names:
321 | root_class_names = backcompat.root(classes)
322 | backcompat_mode = True
323 |
324 | if root_class_names:
325 | parsed_types_aggregation.add("h")
326 |
327 | # flag marking whether this element is a property
328 | # element (p-*, u-*, etc.); defaults to False
329 | is_property_el = False
330 |
331 | # Parse plaintext p-* properties.
332 | p_value = None
333 | for prop_name in filtered_classes["p"]:
334 | is_property_el = True
335 | parsed_types_aggregation.add("p")
336 | prop_value = props.setdefault(prop_name, [])
337 |
338 | # if value has not been parsed then parse it
339 | if p_value is None:
340 | p_value = parse_property.text(el, base_url=self.__url__)
341 |
342 | if root_class_names:
343 | prop_value.append(
344 | handle_microformat(
345 | root_class_names,
346 | el,
347 | value_property="name",
348 | simple_value=p_value,
349 | backcompat_mode=backcompat_mode,
350 | )
351 | )
352 | else:
353 | prop_value.append(p_value)
354 |
355 | # Parse URL u-* properties.
356 | u_value = None
357 | for prop_name in filtered_classes["u"]:
358 | is_property_el = True
359 | parsed_types_aggregation.add("u")
360 | prop_value = props.setdefault(prop_name, [])
361 |
362 | # if value has not been parsed then parse it
363 | if u_value is None:
364 | u_value = parse_property.url(el, base_url=self.__url__)
365 |
366 | if root_class_names:
367 | prop_value.append(
368 | handle_microformat(
369 | root_class_names,
370 | el,
371 | value_property="url",
372 | simple_value=u_value,
373 | backcompat_mode=backcompat_mode,
374 | )
375 | )
376 | else:
377 | # u_value may be a dict (e.g. an img with alt)
378 | # or a plain string; either form is appended
379 | # as-is
380 | prop_value.append(u_value)
381 |
382 | # Parse datetime dt-* properties.
383 | dt_value = None
384 | for prop_name in filtered_classes["dt"]:
385 | is_property_el = True
386 | parsed_types_aggregation.add("d")
387 | prop_value = props.setdefault(prop_name, [])
388 |
389 | # if value has not been parsed then parse it
390 | if dt_value is None:
391 | dt_value, new_date = parse_property.datetime(el, self._default_date)
392 | # update the default date
393 | if new_date:
394 | self._default_date = new_date
395 |
396 | if root_class_names:
397 | # a nested h-* root here already stops implied name via the "h" aggregation
398 | prop_value.append(
399 | handle_microformat(
400 | root_class_names,
401 | el,
402 | simple_value=dt_value,
403 | backcompat_mode=backcompat_mode,
404 | )
405 | )
406 | else:
407 | if dt_value is not None:
408 | prop_value.append(dt_value)
409 |
410 | # Parse embedded markup e-* properties.
411 | e_value = None
412 | for prop_name in filtered_classes["e"]:
413 | is_property_el = True
414 | parsed_types_aggregation.add("e")
415 | prop_value = props.setdefault(prop_name, [])
416 |
417 | # if value has not been parsed then parse it
418 | if e_value is None:
419 | # send original element for parsing backcompat
420 | if el.original is None:
421 | embedded_el = el
422 | else:
423 | embedded_el = el.original
424 | if self._preserve_doc:
425 | embedded_el = copy.copy(embedded_el)
426 | temp_fixes.rm_templates(embedded_el)
427 | e_value = parse_property.embedded(
428 | embedded_el, self.__url__, root_lang, self.lang, self.expose_dom
429 | )
430 |
431 | if root_class_names:
432 | # a nested h-* root here already stops implied name via the "h" aggregation
433 | prop_value.append(
434 | handle_microformat(
435 | root_class_names,
436 | el,
437 | simple_value=e_value,
438 | backcompat_mode=backcompat_mode,
439 | )
440 | )
441 | else:
442 | prop_value.append(e_value)
443 |
444 | # if this is not a property element, but it is a h-* microformat,
445 | # add it to our list of children
446 | if not is_property_el and root_class_names:
447 | children.append(
448 | handle_microformat(
449 | root_class_names, el, backcompat_mode=backcompat_mode
450 | )
451 | )
452 | # parse child tags, provided this isn't a microformat root-class
453 | if not root_class_names:
454 | for child in get_children(el):
455 | (
456 | child_properties,
457 | child_microformats,
458 | child_parsed_types_aggregation,
459 | ) = parse_props(child, root_lang)
460 | for prop_name in child_properties:
461 | v = props.get(prop_name, [])
462 | v.extend(child_properties[prop_name])
463 | props[prop_name] = v
464 | children.extend(child_microformats)
465 | parsed_types_aggregation.update(child_parsed_types_aggregation)
466 | return props, children, parsed_types_aggregation
467 |
468 | def parse_rels(el):
469 | """Parse an element for rel microformats"""
470 | rel_attrs = get_attr(el, "rel")
471 | # if rel attributes exist
472 | if rel_attrs is not None:
473 | # find the url and normalise it
474 | url = try_urljoin(self.__url__, el.get("href", ""))
475 | value_dict = self.__parsed__["rel-urls"].get(url, {})
476 |
477 | # 1st one wins
478 | if "text" not in value_dict:
479 | value_dict["text"] = el.get_text().strip()
480 |
481 | url_rels = value_dict.get("rels", [])
482 | value_dict["rels"] = url_rels
483 |
484 | for knownattr in ("media", "hreflang", "type", "title"):
485 | x = get_attr(el, knownattr)
486 | # 1st one wins
487 | if x is not None and knownattr not in value_dict:
488 | value_dict[knownattr] = x
489 |
490 | self.__parsed__["rel-urls"][url] = value_dict
491 |
492 | for rel_value in rel_attrs:
493 | value_list = self.__parsed__["rels"].get(rel_value, [])
494 | if url not in value_list:
495 | value_list.append(url)
496 | if rel_value not in url_rels:
497 | url_rels.append(rel_value)
498 |
499 | self.__parsed__["rels"][rel_value] = value_list
500 | if "alternate" in rel_attrs:
501 | alternate_list = self.__parsed__.get("alternates", [])
502 | alternate_dict = {}
503 | alternate_dict["url"] = url
504 | x = " ".join([r for r in rel_attrs if not r == "alternate"])
505 | if x != "":
506 | alternate_dict["rel"] = x
507 | alternate_dict["text"] = el.get_text().strip()
508 | for knownattr in ("media", "hreflang", "type", "title"):
509 | x = get_attr(el, knownattr)
510 | if x is not None:
511 | alternate_dict[knownattr] = x
512 | alternate_list.append(alternate_dict)
513 | self.__parsed__["alternates"] = alternate_list
514 |
515 | def parse_el(el, ctx):
516 | """Parse an element for microformats"""
517 | classes = el.get("class", [])
518 |
519 | # find potential microformats in root classnames h-*
520 | potential_microformats = mf2_classes.root(classes, self.filtered_roots)
521 |
522 | # if potential microformats found parse them
523 | if potential_microformats:
524 | result = handle_microformat(potential_microformats, el)
525 | ctx.append(result)
526 | else:
527 | # find backcompat root classnames
528 | potential_microformats = backcompat.root(classes)
529 | if potential_microformats:
530 | result = handle_microformat(
531 | potential_microformats, el, backcompat_mode=True
532 | )
533 | ctx.append(result)
534 | else:
535 | # parse child tags
536 | for child in get_children(el):
537 | parse_el(child, ctx)
538 |
539 | ctx = []
540 |
541 | if self.__metaformats:
542 | # extract out a metaformats item, if available
543 | self.__metaformats_item = metaformats.parse(self.__doc__, url=self.__url__)
544 |
545 | # start parsing at root element of the document
546 | parse_el(self.__doc__, ctx)
547 | self.__parsed__["items"] = ctx
548 | if self.__metaformats and self.__metaformats_item:
549 | self.__parsed__["items"].append(self.__metaformats_item)
550 |
551 | # parse for rel values
552 | for el in get_descendents(self.__doc__):
553 | if el.name in ("a", "area", "link") and el.has_attr("rel"):
554 | parse_rels(el)
555 |
556 | # sort the rels array in rel-urls since it represents an unordered set
557 | for url in self.__parsed__["rel-urls"]:
558 | if "rels" in self.__parsed__["rel-urls"][url]:
559 | rels = self.__parsed__["rel-urls"][url]["rels"]
560 | self.__parsed__["rel-urls"][url]["rels"] = unordered_list(rels)
561 |
562 | # add actual parser used to debug
563 | # uses builder.NAME from BeautifulSoup
564 | if self.__html_parser__:
565 | self.__parsed__["debug"]["markup parser"] = self.__html_parser__
566 | else:
567 | self.__parsed__["debug"]["markup parser"] = "unknown"
568 |
569 | def to_dict(self, filter_by_type=None):
570 | """Get a dictionary version of the parsed microformat document.
571 |
572 | Args:
573 | filter_by_type (string, optional): only include top-level items of
574 | the given h-* type. Defaults to None.
575 |
576 | Returns:
577 | dict: representation of the parsed microformats document
578 | """
579 | if filter_by_type is None:
580 | return self.__parsed__
581 | else:
582 | return [x for x in self.__parsed__["items"] if filter_by_type in x["type"]]
583 |
584 | def to_json(self, pretty_print=False, filter_by_type=None):
585 | """Get a json-encoding string version of the parsed microformats document
586 |
587 | Args:
588 | pretty_print (bool, optional): Encode the json document with
589 | linebreaks and indents to improve readability. Defaults to False.
590 | filter_by_type (string, optional): only include top-level items of
591 | the given h-* type
592 |
593 | Returns:
594 | string: a json-encoded string
595 | """
596 |
597 | if pretty_print:
598 | return json.dumps(
599 | self.to_dict(filter_by_type), indent=4, separators=(", ", ": ")
600 | )
601 | else:
602 | return json.dumps(self.to_dict(filter_by_type))
603 |
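A usage sketch of the API above (the markup is invented for illustration): to_dict returns the full mf2json structure, filter_by_type narrows it to top-level items of a single h-* type, and to_json serialises the same data:

```python
import mf2py

p = mf2py.Parser(
    doc='<article class="h-entry"><h1 class="p-name">Hello</h1></article>',
    url="http://example.com/",
)

result = p.to_dict()  # full mf2json dict: items, rels, rel-urls, debug
entries = p.to_dict(filter_by_type="h-entry")  # only top-level h-entry items
print(p.to_json(pretty_print=True))  # the same data as an indented JSON string
```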
--------------------------------------------------------------------------------
/test/test_parser.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | import sys
4 | from unittest import TestCase, mock
5 |
6 | import bs4
7 | from bs4 import BeautifulSoup
8 |
9 | from mf2py import Parser
10 |
11 | TestCase.maxDiff = None
12 |
13 |
14 | TEST_DIR = "test/examples/"
15 |
16 |
17 | def parse_fixture(path, **kwargs):
18 | with open(os.path.join(TEST_DIR, path)) as f:
19 | p = Parser(doc=f, html_parser="html5lib", **kwargs)
20 | return p.to_dict()
21 |
22 |
23 | def test_empty():
24 | p = Parser()
25 | assert p is not None
26 | assert type(p.to_dict()) is dict
27 |
28 |
29 | def test_open_file():
30 | with open(os.path.join(TEST_DIR, "empty.html")) as f:
31 | p = Parser(doc=f)
32 |
33 | assert p.__doc__ is not None
34 | assert p is not None
35 | assert type(p.to_dict()) is dict
36 |
37 |
38 | def test_doc_tag():
39 | # test that strings, BS doc and BS tags are all parsed
40 | doc = """ """
41 | soup = BeautifulSoup(doc, "html5lib")
42 | parse_string = Parser(doc).to_dict()
43 | assert "h-entry" in parse_string["items"][0]["type"]
44 | parse_doc = Parser(soup).to_dict()
45 | assert "h-entry" in parse_doc["items"][0]["type"]
46 | parse_tag = Parser(soup.article).to_dict()
47 | assert "h-entry" in parse_tag["items"][0]["type"]
48 |
49 |
50 | @mock.patch("requests.get")
51 | def test_user_agent(getter):
52 | ua_expect = "mf2py - microformats2 parser for python"
53 | assert Parser.useragent.startswith(ua_expect)
54 |
55 | resp = mock.MagicMock()
56 | resp.content = b""
57 | resp.text = ""
58 | resp.headers = {}
59 | getter.return_value = resp
60 |
61 | Parser(url="http://example.com")
62 | getter.assert_called_with(
63 | "http://example.com", headers={"User-Agent": Parser.useragent}
64 | )
65 |
66 | Parser.useragent = "something else"
67 | assert Parser.useragent == "something else"
68 | # set back to default. damn stateful classes
69 | Parser.useragent = "mf2py - microformats2 parser for python"
70 |
71 |
72 | def test_base():
73 | with open(os.path.join(TEST_DIR, "base.html")) as f:
74 | p = Parser(doc=f)
75 |
76 | assert p.__url__ == "http://tantek.com/"
77 |
78 |
79 | def test_simple_parse():
80 | result = parse_fixture("simple_person_reference.html")
81 | assert result["items"][0]["properties"] == {"name": ["Frances Berriman"]}
82 |
83 |
84 | def test_simple_person_reference_same_element():
85 | result = parse_fixture("simple_person_reference_same_element.html")
86 | assert result["items"][0]["properties"] == {"name": ["Frances Berriman"]}
87 |
88 |
89 | def test_person_with_url():
90 | result = parse_fixture("person_with_url.html")
91 | assert result["items"][0]["properties"]["name"] == ["Tom Morris"]
92 | assert result["items"][0]["properties"]["url"] == ["http://tommorris.org/"]
93 |
94 |
95 | def test_vcp():
96 | result = parse_fixture("value_class_person.html")
97 | assert result["items"][0]["properties"]["tel"] == ["+44 1234 567890"]
98 |
99 |
100 | def test_multiple_root_classnames():
101 | result = parse_fixture("nested_multiple_classnames.html")
102 | # order does not matter
103 | assert len(result["items"]) == 1
104 | assert set(result["items"][0]["type"]) == set(["h-entry", "h-as-note"])
105 |
106 |
107 | def test_property_nested_microformat():
108 | result = parse_fixture("nested_multiple_classnames.html")
109 |
110 | assert len(result["items"]) == 1
111 | assert "author" in result["items"][0]["properties"]
112 | assert (
113 | result["items"][0]["properties"]["author"][0]["properties"]["name"][0]
114 | == "Tom Morris"
115 | )
116 | assert (
117 | result["items"][0]["properties"]["reviewer"][0]["properties"]["name"][0]
118 | == "Tom Morris"
119 | )
120 | assert (
121 | result["items"][0]["properties"]["author"][0]["properties"]["adr"][0][
122 | "properties"
123 | ]["city"][0]
124 | == "London"
125 | )
126 |
127 |
128 | def test_plain_child_microformat():
129 | result = parse_fixture("nested_multiple_classnames.html")
130 |
131 | assert len(result["items"]) == 1
132 | assert "children" in result["items"][0]
133 | assert len(result["items"][0]["children"]) == 1
134 | assert result["items"][0]["children"][0]["properties"]["name"][0] == "Some Citation"
135 |
136 |
137 | def test_datetime_parsing():
138 | result = parse_fixture("datetimes.html")
139 | assert result["items"][0]["properties"]["start"][0] == "2014-01-01T12:00:00+0000"
140 | assert result["items"][0]["properties"]["end"][0] == "3014-01-01T18:00:00+0000"
141 | assert result["items"][0]["properties"]["duration"][0] == "P1000Y"
142 | assert result["items"][0]["properties"]["updated"][0] == "2011-08-26T00:01:21+0000"
143 | assert result["items"][0]["properties"]["updated"][1] == "2011-08-26T00:01:21+0000"
144 |
145 |
146 | def test_datetime_vcp_parsing():
147 | result = parse_fixture("datetimes.html")
148 | assert len(result["items"]) == 16
149 | assert result["items"][1]["properties"]["published"][0] == "3014-01-01 01:21Z"
150 | assert result["items"][2]["properties"]["updated"][0] == "2014-03-11 09:55"
151 | assert result["items"][3]["properties"]["published"][0] == "2014-01-30 15:28"
152 | assert result["items"][4]["properties"]["published"][0] == "9999-01-14T11:52+0800"
153 | assert result["items"][5]["properties"]["published"][0] == "2014-06-01 12:30-0600"
154 | assert result["items"][8]["properties"]["start"][0] == "2014-06-01 12:30-0600"
155 | assert result["items"][9]["properties"]["start"][0] == "2014-06-01 12:30-0600"
156 | assert result["items"][10]["properties"]["start"][0] == "2014-06-01 00:30-0600"
157 | assert result["items"][10]["properties"]["end"][0] == "2014-06-01 12:15"
158 | assert result["items"][10]["properties"]["start"][1] == "2014-06-01 00:30-0600"
159 | assert result["items"][10]["properties"]["end"][1] == "2014-06-01 12:15"
160 | assert result["items"][11]["properties"]["start"][0] == "2016-03-02 00:30-0600"
161 | assert result["items"][12]["properties"]["start"][0] == "2014-06-01 12:30-600"
162 | assert result["items"][13]["properties"]["start"][0] == "2014-06-01 12:30+600"
163 | assert result["items"][14]["properties"]["start"][0] == "2014-06-01 12:30Z"
164 | assert result["items"][15]["properties"]["start"][0] == "2014-06-01 12:30-600"
165 |
166 |
167 | def test_dt_end_implied_date():
168 | """Test that events with dt-start and dt-end use the implied date rule
169 | http://microformats.org/wiki/value-class-pattern#microformats2_parsers
170 | for times without dates"""
171 | result = parse_fixture("datetimes.html")
172 |
173 | event_wo_tz = result["items"][6]
174 | assert event_wo_tz["properties"]["start"][0] == "2014-05-21 18:30"
175 | assert event_wo_tz["properties"]["end"][0] == "2014-05-21 19:30"
176 |
177 | event_w_tz = result["items"][7]
178 | assert event_w_tz["properties"]["start"][0] == "2014-06-01 12:30-0600"
179 | assert event_w_tz["properties"]["end"][0] == "2014-06-01 19:30-0600"
180 |
181 |
182 | def test_embedded_parsing():
183 | result = parse_fixture("embedded.html")
184 | assert (
185 | result["items"][0]["properties"]["content"][0]["html"]
186 | == "Blah blah blah blah blah.
\n Blah.
\n Blah blah blah.
"
187 | )
188 | assert (
189 | result["items"][0]["properties"]["content"][0]["value"]
190 | == "Blah blah blah blah blah.\n\nBlah.\n\nBlah blah blah."
191 | )
192 |
193 |
194 | def test_embedded_exposed_dom():
195 | result = parse_fixture("embedded.html", expose_dom=True)
196 | content = result["items"][0]["properties"]["content"][0]
197 | assert "html" not in content
198 | assert isinstance(content["dom"], bs4.element.Tag)
199 |
200 |
201 | def test_hoisting_nested_hcard():
202 | result = parse_fixture("nested_hcards.html")
203 | expected = [
204 | {
205 | "properties": {
206 | "author": [
207 | {
208 | "properties": {"name": ["KP1"]},
209 | "type": ["h-card"],
210 | "value": "KP1",
211 | }
212 | ],
213 | "in-reply-to": [
214 | {"properties": {"name": ["KP"]}, "type": ["h-cite"], "value": "KP"}
215 | ],
216 | },
217 | "type": ["h-entry"],
218 | }
219 | ]
220 | assert expected == result["items"]
221 |
222 |
223 | def test_html_tag_class():
224 | result = parse_fixture("hfeed_on_html_tag.html")
225 | assert ["h-feed"] == result["items"][0]["type"]
226 |
227 | assert ["entry1"] == result["items"][0]["children"][0]["properties"]["name"]
228 | assert ["entry2"] == result["items"][0]["children"][1]["properties"]["name"]
229 |
230 |
231 | def test_string_strip():
232 | result = parse_fixture("string_stripping.html")
233 | assert "Tom Morris" == result["items"][0]["properties"]["name"][0]
234 |
235 |
236 | def test_template_parse():
237 | result = parse_fixture("template_tag.html")
238 | assert 0 == len(result["items"])
239 |
240 |
241 | def test_template_tag_inside_e_value():
242 | result = parse_fixture("template_tag_inside_e_value.html")
243 | assert (
244 | "This is a Test with a template tag after this:"
245 | == result["items"][0]["properties"]["content"][0]["html"]
246 | )
247 | assert (
248 | "This is a Test with a template tag after this:"
249 | == result["items"][0]["properties"]["content"][0]["value"]
250 | )
251 |
252 |
253 | def test_ordering_dedup():
254 | """test that classes are dedeuped and alphabetically ordered"""
255 |
256 | result = parse_fixture("ordering_dedup.html")
257 | item = result["items"][0]
258 | assert ["h-entry", "h-feed", "h-product", "h-x-test"] == item["type"]
259 | assert ["example.com", "example.com/2"] == item["properties"]["url"]
260 | assert ["name", "URL name"] == item["properties"]["name"]
261 | assert ["author", "bookmark", "me"] == result["rel-urls"]["example.com/rel"]["rels"]
262 | assert "de" == result["rel-urls"]["example.com/lang"]["hreflang"]
263 |
264 |
265 | def test_class_names_format():
266 | """test that only classes with letters and possibly numbers in the vendor prefix part are used"""
267 |
268 | result = parse_fixture("class_names_format.html")
269 | item = result["items"][0]
270 | assert ["h-feed", "h-p3k-entry", "h-x-test"] == item["type"]
271 | assert "url" in item["properties"]
272 | assert "p3k-url" in item["properties"]
273 | assert "Url" not in item["properties"]
274 | assert "-url" not in item["properties"]
275 | assert "url-" not in item["properties"]
276 |
277 | assert "name" in item["properties"]
278 | assert "p3k-name" in item["properties"]
279 | assert "nAme" not in item["properties"]
280 | assert "-name" not in item["properties"]
281 | assert "name-" not in item["properties"]
282 |
283 |
284 | def test_area_uparsing():
285 | result = parse_fixture("area.html")
286 | assert {"url": ["http://suda.co.uk"], "name": ["Brian Suda"]} == result["items"][0][
287 | "properties"
288 | ]
289 | assert "shape" in result["items"][0]
290 | assert "coords" in result["items"][0]
291 |
292 |
293 | def test_src_equiv():
294 | result = parse_fixture("test_src_equiv.html")
295 | for item in result["items"]:
296 | assert "x-example" in item["properties"]
297 | assert "http://example.org/" == item["properties"]["x-example"][0]
298 |
299 |
300 | def test_rels():
301 | result = parse_fixture("rel.html")
302 | assert {
303 | "in-reply-to": ["http://example.com/1", "http://example.com/2"],
304 | "author": ["http://example.com/a", "http://example.com/b"],
305 | "alternate": ["http://example.com/fr"],
306 | "home": ["http://example.com/fr"],
307 | } == result["rels"]
308 | assert {
309 | "http://example.com/1": {"text": "post 1", "rels": ["in-reply-to"]},
310 | "http://example.com/2": {"text": "post 2", "rels": ["in-reply-to"]},
311 | "http://example.com/a": {"text": "author a", "rels": ["author"]},
312 | "http://example.com/b": {"text": "author b", "rels": ["author"]},
313 | "http://example.com/fr": {
314 | "text": "French mobile homepage",
315 | "media": "handheld",
316 | "rels": ["alternate", "home"],
317 | "hreflang": "fr",
318 | },
319 | } == result["rel-urls"]
320 |
321 |
322 | def test_alternates():
323 | result = parse_fixture("rel.html")
324 | assert [
325 | {
326 | "url": "http://example.com/fr",
327 | "media": "handheld",
328 | "text": "French mobile homepage",
329 | "rel": "home",
330 | "hreflang": "fr",
331 | }
332 | ] == result["alternates"]
333 |
334 |
335 | def test_enclosures():
336 | result = parse_fixture("rel_enclosure.html")
337 | assert {"enclosure": ["http://example.com/movie.mp4"]} == result["rels"]
338 | assert {
339 | "http://example.com/movie.mp4": {
340 | "rels": ["enclosure"],
341 | "text": "my movie",
342 | "type": "video/mpeg",
343 | }
344 | } == result["rel-urls"]
345 |
346 |
347 | def test_empty_href():
348 | result = parse_fixture("hcard_with_empty_url.html", url="http://foo.com")
349 |
350 | for hcard in result["items"]:
351 | assert ["http://foo.com"] == hcard["properties"]["url"]
352 |
353 |
354 | def test_link_with_u_url():
355 | result = parse_fixture("link_with_u-url.html", url="http://foo.com")
356 | assert {
357 | "type": ["h-card"],
358 | "properties": {
359 | "name": [""],
360 | "url": ["http://foo.com/"],
361 | },
362 | } == result["items"][0]
363 |
364 |
365 | def test_broken_url():
366 | result = parse_fixture("broken_url.html", url="http://example.com")
367 | assert (
368 | result["items"][0]["properties"]["relative"][0] == "http://example.com/foo.html"
369 | )
370 | assert result["items"][0]["properties"]["url"][0] == "http://www.[w3.org/"
371 | assert (
372 | result["items"][0]["properties"]["photo"][0]
373 | == "http://www.w3].org/20[08/site/images/logo-w3c-mobile-lg"
374 | )
375 |
376 |
377 | def test_complex_e_content():
378 | """When parsing h-* e-* properties, we should fold {"value":..., "html":...}
379 | into the parsed microformat object, instead of nesting it under an
380 | unnecessary second layer of "value":
381 | """
382 | result = parse_fixture("complex_e_content.html")
383 |
384 | assert {
385 | "type": ["h-entry"],
386 | "properties": {
387 | "content": [
388 | {
389 | "type": ["h-card"],
390 | "properties": {"name": ["Hello"]},
391 | "html": "Hello
",
392 | "value": "Hello",
393 | }
394 | ],
395 | },
396 | } == result["items"][0]
397 |
398 |
399 | def test_relative_url_in_e():
400 | """When parsing e-* properties, make relative URLs absolute."""
401 | result = parse_fixture("relative_url_in_e.html")
402 |
403 | assert (
404 | '<img alt="Cat" src="http://example.com/cat.jpg"/> '
405 | '<p><img src="http://example.com/dog.jpg"/></p>'
406 | ) == result["items"][0]["properties"]["content"][0]["html"]
407 |
408 |
409 | def test_nested_values():
410 | """When parsing nested microformats, check that value is the value of
411 | the simple property element"""
412 | result = parse_fixture("nested_values.html")
413 | entry = result["items"][0]
414 |
415 | assert {
416 | "properties": {
417 | "name": ["Kyle"],
418 | "url": ["http://about.me/kyle"],
419 | },
420 | "value": "Kyle",
421 | "type": ["h-card"],
422 | } == entry["properties"]["author"][0]
423 |
424 | assert {
425 | "properties": {
426 | "name": ["foobar"],
427 | "url": ["http://example.com/foobar"],
428 | },
429 | "value": "http://example.com/foobar",
430 | "type": ["h-cite"],
431 | } == entry["properties"]["like-of"][0]
432 |
433 | assert {
434 | "properties": {
435 | "name": ["George"],
436 | "url": ["http://people.com/george"],
437 | },
438 | "type": ["h-card"],
439 | } == entry["children"][0]
440 |
441 |
442 | # implied properties tests
443 |
444 |
445 | def test_implied_name():
446 | result = parse_fixture("implied_properties/implied_properties.html")
447 |
448 | for i in range(7):
449 | assert result["items"][i]["properties"]["name"][0] == "Tom Morris"
450 |
451 |
452 | def test_implied_url_resolution():
453 | result = parse_fixture(
454 | "implied_properties/implied_properties.html", url="http://foo.com/"
455 | )
456 | assert result["items"][1]["properties"]["url"][0] == "http://tommorris.org/"
457 | # img should not have a "url" property
458 | assert "url" not in result["items"][4]["properties"]
459 | # href="" is relative to the base url
460 | assert result["items"][5]["properties"]["url"][0] == "http://foo.com/"
461 |
462 |
463 | def test_implied_photo():
464 | result = parse_fixture("implied_properties/implied_photo.html")
465 |
466 | for i in range(12):
467 | photos = result["items"][i]["properties"]["photo"]
468 | assert len(photos) == 1
469 | assert photos[0] == "http://example.com/photo.jpg"
470 |
471 | # tests for no photo
472 | for i in range(12, 23):
473 | assert "photo" not in result["items"][i]["properties"]
474 |
475 | result = parse_fixture("implied_properties/implied_photo_relative_url.html")
476 |
477 | assert (
478 | result["items"][0]["properties"]["photo"][0]["value"]
479 | == "http://example.com/jane-img.jpeg"
480 | )
481 | assert (
482 | result["items"][1]["properties"]["photo"][0]
483 | == "http://example.com/jane-object.jpeg"
484 | )
485 |
486 |
487 | def test_implied_url():
488 | result = parse_fixture("implied_properties/implied_url.html")
489 |
490 | for i in range(12):
491 | urls = result["items"][i]["properties"]["url"]
492 | assert len(urls) == 1
493 | assert urls[0] == "http://example.com"
494 |
495 | # tests for no url
496 | for i in range(12, 23):
497 | assert "url" not in result["items"][i]["properties"]
498 |
499 |
500 | def test_stop_implied_url():
501 | """testing that explicit properties case implied url-parsing to be aborted"""
502 |
503 | result = parse_fixture("implied_properties/stop_implied_url.html")
504 |
505 | assert "url" not in result["items"][0]["properties"]
506 | assert "url" not in result["items"][1]["properties"]
507 | assert "url" not in result["items"][2]["properties"]
508 | assert "url" not in result["items"][3]["properties"]
509 | assert "url" not in result["items"][4]["properties"]
510 | assert "url" not in result["items"][5]["properties"]
511 |
512 | assert result["items"][6]["properties"]["url"] == ["http://example.com/"]
513 | assert result["items"][7]["properties"]["url"] == ["http://example.com/"]
514 | assert result["items"][8]["properties"]["url"] == ["http://example.com/"]
515 | assert result["items"][9]["properties"]["url"] == ["http://example.com/"]
516 |
517 |
518 | def test_implied_nested_photo():
519 | result = parse_fixture(
520 | "implied_properties/implied_properties.html", url="http://bar.org"
521 | )
522 | assert result["items"][2]["properties"]["photo"][0] == {
523 | "alt": "",
524 | "value": "http://tommorris.org/photo.png",
525 | }
526 | assert (
527 | result["items"][3]["properties"]["photo"][0] == "http://tommorris.org/photo.png"
528 | )
529 | assert result["items"][4]["properties"]["photo"][0] == {
530 | "alt": "Tom Morris",
531 | "value": "http://tommorris.org/photo.png",
532 | }
533 | # src="" is relative to the base url
534 | assert result["items"][6]["properties"]["photo"][0] == "http://bar.org"
535 |
536 |
537 | def test_implied_nested_photo_alt_name():
538 | result = parse_fixture("implied_properties/implied_properties.html")
539 | assert result["items"][3]["properties"]["name"][0] == "Tom Morris"
540 |
541 |
542 | def test_implied_image():
543 | result = parse_fixture("implied_properties/implied_properties.html")
544 | assert result["items"][4]["properties"]["photo"][0] == {
545 | "alt": "Tom Morris",
546 | "value": "http://tommorris.org/photo.png",
547 | }
548 | assert result["items"][4]["properties"]["name"][0] == "Tom Morris"
549 |
550 |
551 | def test_implied_name_empty_alt():
552 | """An empty alt text should not prevent us from including other
553 | children in the implied name.
554 | """
555 |
556 | result = parse_fixture("implied_properties/implied_name_empty_alt.html")
557 | hcard = result["items"][0]
558 |
559 | assert {
560 | "type": ["h-card"],
561 | "properties": {
562 | "name": ["@kylewmahan"],
563 | "url": ["https://twitter.com/kylewmahan"],
564 | "photo": [{"alt": "", "value": "https://example.org/test.jpg"}],
565 | },
566 | } == hcard
567 |
568 |
569 | def test_relative_datetime():
570 | result = parse_fixture("implied_properties/implied_relative_datetimes.html")
571 | assert result["items"][0]["properties"]["updated"][0] == "2015-01-02 05:06"
572 |
573 |
574 | def test_stop_implied_name_nested_h():
575 | result = parse_fixture("implied_properties/stop_implied_name_nested_h.html")
576 | assert "name" not in result["items"][0]["properties"]
577 |
578 |
579 | def test_stop_implied_name_e_content():
580 | result = parse_fixture("implied_properties/stop_implied_name_e_content.html")
581 | assert "name" not in result["items"][0]["properties"]
582 |
583 |
584 | def test_stop_implied_name_p_content():
585 | result = parse_fixture("implied_properties/stop_implied_name_p_content.html")
586 | assert "name" not in result["items"][0]["properties"]
587 |
588 |
589 | def test_implied_properties_silo_pub():
590 | result = parse_fixture("implied_properties/implied_properties_silo_pub.html")
591 | item = result["items"][0]
592 |
593 | # implied_name = item['properties']['name'][0]
594 | # implied_name = re.sub('\s+', ' ', implied_name).strip()
595 | # assert '@kylewmahan on Twitter' == implied_name
596 |
597 | # no implied name expected under new rules
598 |
599 | assert "name" not in item["properties"]
600 |
601 |
602 | def test_simple_person_reference_implied():
603 | result = parse_fixture("implied_properties/simple_person_reference_implied.html")
604 | assert result["items"][0]["properties"] == {"name": ["Frances Berriman"]}
605 |
606 |
607 | def test_implied_name_alt():
608 | result = parse_fixture("implied_properties/implied_name_alt.html")
609 | assert result["items"][0]["children"][0] == {
610 | "type": ["h-card"],
611 | "properties": {
612 | "name": ["Avatar of Stephen"],
613 | "photo": [{"alt": "Avatar of", "value": "avatar.jpg"}],
614 | },
615 | }
616 |
617 |
618 | def test_value_name_whitespace():
619 | result = parse_fixture("value_name_whitespace.html")
620 |
621 | for i in range(3):
622 | assert result["items"][i]["properties"]["content"][0]["value"] == "Hello World"
623 | assert result["items"][i]["properties"]["name"][0] == "Hello World"
624 |
625 | for i in range(3, 7):
626 | assert result["items"][i]["properties"]["content"][0]["value"] == "Hello\nWorld"
627 | assert result["items"][i]["properties"]["name"][0] == "Hello\nWorld"
628 |
629 | assert result["items"][7]["properties"]["content"][0]["value"] == "Hello\n\nWorld"
630 | assert result["items"][7]["properties"]["name"][0] == "Hello\n\nWorld"
631 |
632 | assert result["items"][8]["properties"]["content"][0]["value"] == "One\nTwo\nThree"
633 | assert result["items"][8]["properties"]["name"][0] == "One\nTwo\nThree"
634 |
635 | assert (
636 | result["items"][9]["properties"]["content"][0]["value"] == "One\n\nTwo\n\nThree"
637 | )
638 | assert result["items"][9]["properties"]["name"][0] == "One\n\nTwo\n\nThree"
639 |
640 | assert (
641 | result["items"][10]["properties"]["content"][0]["value"]
642 | == "Hello World one\n two\n three\n "
643 | )
644 | assert (
645 | result["items"][10]["properties"]["name"][0]
646 | == "Hello World one\n two\n three\n "
647 | )
648 |
649 | assert (
650 | result["items"][11]["properties"]["content"][0]["value"]
651 | == "Correct name Correct summary"
652 | )
653 | assert result["items"][11]["properties"]["name"][0] == "Correct name"
654 |
655 |
656 | # backcompat tests
657 |
658 |
659 | def test_backcompat_hentry():
660 | result = parse_fixture("backcompat/hentry.html")
661 | assert "h-entry" in result["items"][0]["type"]
662 | assert (
663 | "Tom Morris"
664 | == result["items"][0]["properties"]["author"][0]["properties"]["name"][0]
665 | )
666 | assert "A Title" == result["items"][0]["properties"]["name"][0]
667 | assert "Some Content" == result["items"][0]["properties"]["content"][0]["value"]
668 |
669 |
670 | def test_backcompat_hproduct():
671 | result = parse_fixture("backcompat/hproduct.html")
672 | assert 1 == len(result["items"])
673 | assert ["h-product"] == result["items"][0]["type"]
674 | assert ["bullshit"] == result["items"][0]["properties"]["category"]
675 | assert ["Quacktastic Products"] == result["items"][0]["properties"]["brand"]
676 | assert ["#BULLSHIT-001"] == result["items"][0]["properties"]["identifier"]
677 | assert (
678 | "Magical tasty sugar pills that don't do anything."
679 | == result["items"][0]["properties"]["description"][0]
680 | )
681 | assert ["Tom's Magical Quack Tincture"] == result["items"][0]["properties"]["name"]
682 |
683 |
684 | def test_backcompat_hproduct_nested_hreview():
685 | result = parse_fixture("backcompat/hproduct_hreview_nested.html")
686 | assert ["h-review"] == result["items"][0]["children"][0]["type"]
687 |
688 |
689 | def test_backcompat_hreview_nested_card_event_product():
690 | result = parse_fixture("backcompat/hreview_nested_card_event_product.html")
691 | assert ["h-review"] == result["items"][0]["type"]
692 | items = result["items"][0]["properties"]["item"]
693 | assert 3 == len(items)
694 |
695 | event = items[0]
696 | assert ["h-event"] == event["type"]
697 | assert ["http://example.com/event-url"] == event["properties"]["url"]
698 | assert ["event name"] == event["properties"]["name"]
699 |
700 | card = items[1]
701 | assert ["h-card"] == card["type"]
702 | assert ["http://example.com/card-url"] == card["properties"]["url"]
703 | assert ["card name"] == card["properties"]["name"]
704 |
705 | product = items[2]
706 | assert ["h-product"] == product["type"]
707 | assert ["http://example.com/product-url"] == product["properties"]["url"]
708 | assert ["product name"] == product["properties"]["name"]
709 |
710 |
711 | def test_backcompat_rel_bookmark():
712 | """Confirm that rel=bookmark inside of an h-entry is converted
713 | to u-url.
714 | """
715 | result = parse_fixture("backcompat/feed_with_rel_bookmark.html")
716 | for ii, url in enumerate(
717 | (
718 | "/2014/11/24/jump-rope",
719 | "/2014/11/23/graffiti",
720 | "/2014/11/21/earth",
721 | "/2014/11/19/labor",
722 | )
723 | ):
724 | assert ["h-entry"] == result["items"][ii]["type"]
725 | assert [url] == result["items"][ii]["properties"]["url"]
726 |
727 |
728 | def test_backcompat_rel_bookmark_overrides_u_url():
729 | """Confirm that rel=bookmark inside of an hentry and hreview is converted
730 | to a u-url and original u-url is ignored
731 | """
732 |
733 | tests = [
734 | "backcompat/hentry_with_rel_bookmark.html",
735 | "backcompat/hreview_with_rel_tag_bookmark.html",
736 | ]
737 |
738 | results = [parse_fixture(x) for x in tests]
739 |
740 | for result in results:
741 | assert [
742 | "https://example.com/bookmark",
743 | "https://example.com/bookmark-url",
744 | ] == result["items"][0]["properties"]["url"]
745 |
746 |
747 | def test_backcompat_rel_tag():
748 | """Confirm that rel=tag inside of an hentry is converted
749 | to a p-category and the last path segment of the href is used.
750 | """
751 |
752 | tests = [
753 | "backcompat/hentry_with_rel_tag.html",
754 | "backcompat/hfeed_with_rel_tag.html",
755 | "backcompat/hrecipe_with_rel_tag.html",
756 | "backcompat/hreview_with_rel_tag_bookmark.html",
757 | ]
758 |
759 | results = [parse_fixture(x) for x in tests]
760 | for result in results:
761 | assert ["cat", "dog", "mountain lion", "mouse", "meerkat"] == result["items"][
762 | 0
763 | ]["properties"]["category"]
764 |
765 |
766 | def test_backcompat_rel_tag_entry_title():
767 | """Confirm that other backcompat properties on a rel=tag are parsed"""
768 |
769 | result = parse_fixture("backcompat/hentry_with_rel_tag_entry_title.html")
770 | assert ["cat"] == result["items"][0]["properties"]["category"]
771 | assert ["rhinoceros"] == result["items"][0]["properties"]["name"]
772 |
773 |
774 | def test_backcompat_rel_multiple_root():
775 | """Confirm that rel=tag and rel=bookmark inside of an hentry+hreview is parsed correctly"""
776 |
777 | result = parse_fixture("backcompat/hreview_hentry_with_rel_tag_bookmark.html")
778 |
779 | assert len(result["items"]) == 1
780 | assert "h-entry" in result["items"][0]["type"]
781 | assert "h-review" in result["items"][0]["type"]
782 |
783 | assert ["cat", "dog", "mountain lion", "mouse", "meerkat"] == result["items"][0][
784 | "properties"
785 | ]["category"]
786 | assert [
787 | "https://example.com/bookmark",
788 | "https://example.com/bookmark-url",
789 | ] == result["items"][0]["properties"]["url"]
790 |
791 |
792 | def test_backcompat_ignore_mf1_root_if_mf2_present():
793 | """Confirm that mf1 root class is ignored if another mf2 root class is present."""
794 | result = parse_fixture("backcompat/ignore_mf1_root_if_mf2_present.html")
795 | assert "h-entry" not in result["items"][0]["type"]
796 | assert "h-event" in result["items"][0]["type"]
797 |
798 |
799 | def test_backcompat_no_implied_properties_mf1_root():
800 | """Confirm that mf1 root class does not have implied properties"""
801 | result = parse_fixture("backcompat/ignore_mf1_root_if_mf2_present.html")
802 | assert "h-entry" not in result["items"][0]["properties"]
803 | assert "name" not in result["items"][0]["type"]
804 | assert "url" not in result["items"][0]["properties"]
805 | assert "photo" not in result["items"][0]["properties"]
806 |
807 |
808 | def test_backcompat_ignore_mf2_properties_in_mf1_root():
809 | """Confirm that mf2 properties are ignored in mf1 root class"""
810 | result = parse_fixture("backcompat/ignore_mf2_properties_in_mf1_root.html")
811 | assert "Correct name" == result["items"][0]["properties"]["name"][0]
812 | assert "Correct summary" == result["items"][0]["properties"]["summary"][0]
813 |
814 |
815 | def test_backcompat_ignore_mf1_properties_in_mf2_root():
816 | """Confirm that mf1 properties are ignored in mf2 root class"""
817 | result = parse_fixture("backcompat/ignore_mf1_properties_in_mf2_root.html")
818 | assert "Correct name" == result["items"][0]["properties"]["name"][0]
819 | assert "Correct summary" == result["items"][0]["properties"]["summary"][0]
820 |
821 |
822 | def test_backcompat_nested_mf2_in_mf1():
823 | """Confirm that mf2 roots nested inside mf1 root are parsed"""
824 | result = parse_fixture("backcompat/nested_mf2_in_mf1.html")
825 | assert "h-feed" == result["items"][0]["type"][0]
826 | assert "h-entry" == result["items"][0]["children"][0]["type"][0]
827 | assert "Correct name" == result["items"][0]["children"][0]["properties"]["name"][0]
828 | assert (
829 | "Correct summary"
830 | == result["items"][0]["children"][0]["properties"]["summary"][0]
831 | )
832 |
833 |
834 | def test_backcompat_nested_mf1_in_mf2():
835 | """Confirm that mf1 roots nested inside mf2 root are parsed"""
836 | result = parse_fixture("backcompat/nested_mf1_in_mf2.html")
837 | assert "h-feed" == result["items"][0]["type"][0]
838 | assert "h-entry" == result["items"][0]["children"][0]["type"][0]
839 | assert "Correct name" == result["items"][0]["children"][0]["properties"]["name"][0]
840 | assert (
841 | "Correct summary"
842 | == result["items"][0]["children"][0]["properties"]["summary"][0]
843 | )
844 |
845 |
846 | def test_backcompat_nested_mf1_in_mf2_e_content():
847 | """Confirm that mf1 roots nested inside mf2 root e-content are parsed as authored"""
848 | result = parse_fixture("backcompat/nested_mf1_in_mf2_e_content.html")
849 |
850 | mf2_entry = result["items"][0]
851 | mf1_entry = mf2_entry["children"][0]
852 |
853 | assert (
854 | '<div class="hentry">\n<span class="entry-title">Correct name</span> \n\n<span class="entry-summary">Correct summary</span> \n</div>'
855 | == mf2_entry["properties"]["content"][0]["html"]
856 | )
857 |
858 | assert (
859 | "Correct name Correct summary" == mf2_entry["properties"]["content"][0]["value"]
860 | )
861 |
862 | assert "h-entry" == mf1_entry["type"][0]
863 | assert "Correct name" == mf1_entry["properties"]["name"][0]
864 | assert "Correct summary" == mf1_entry["properties"]["summary"][0]
865 |
866 |
867 | def test_backcompat_hentry_content_html():
868 | """Confirm that mf1 entry-content html is parsed as authored without mf2 replacements"""
869 | result = parse_fixture("backcompat/hentry_content_html.html")
870 |
871 | entry = result["items"][0]
872 |
873 | assert (
874 | '<p class="entry-summary">This is a summary</p>\n <p>This is <a rel="tag" href="/tags/mytag">mytag</a> inside content.</p>'
875 | == entry["properties"]["content"][0]["html"]
876 | )
877 |
878 |
879 | def test_whitespace_with_tags_inside_property():
880 | """Whitespace should only be trimmed at the ends of textContent, not inside.
881 |
882 | https://github.com/microformats/mf2py/issues/112
883 | """
884 | result = parse_fixture("tag_whitespace_inside_p_value.html")
885 | assert result["items"][0]["properties"] == {"name": ["foo bar"]}
886 |
887 |
888 | def test_plaintext_p_whitespace():
889 | result = parse_fixture("plaintext_p_whitespace.html")
890 | assert result["items"][0]["properties"]["content"][0]["value"] == "foo\nbar baz"
891 | assert result["items"][1]["properties"]["content"][0]["value"] == "foo\nbar baz"
892 | assert result["items"][2]["properties"]["content"][0]["value"] == "foo bar\nbaz"
893 |
894 |
895 | def test_plaintext_img_whitespace():
896 | result = parse_fixture("plaintext_img_whitespace.html")
897 | assert (
898 | result["items"][0]["properties"]["content"][0]["value"]
899 | == "selfie At some tourist spot"
900 | )
901 | assert (
902 | result["items"][1]["properties"]["content"][0]["value"]
903 | == "At another tourist spot"
904 | )
905 | assert (
906 | result["items"][2]["properties"]["content"][0]["value"]
907 | == "https://example.com/photo.jpg At yet another tourist spot"
908 | )
909 |
910 |
911 | def test_photo_with_alt():
912 | """Confirm that alt text in img is parsed as a u-* property and implied photo"""
913 |
914 | path = "img_with_alt.html"
915 |
916 | result = parse_fixture(path)
917 |
918 | with open(os.path.join(TEST_DIR, path)) as f:
919 | exp_result = Parser(doc=f, html_parser="html5lib").to_dict()
920 |
921 | # simple img with u-*
922 | assert "/photo.jpg" == result["items"][0]["properties"]["photo"][0]
923 | assert "/photo.jpg" == exp_result["items"][0]["properties"]["photo"][0]
924 |
925 | assert {"alt": "alt text", "value": "/photo.jpg"} == result["items"][1][
926 | "properties"
927 | ]["url"][0]
928 | assert "/photo.jpg" == exp_result["items"][1]["properties"]["url"][0]["value"]
929 | assert "alt text" == exp_result["items"][1]["properties"]["url"][0]["alt"]
930 |
931 | assert {"alt": "", "value": "/photo.jpg"} == result["items"][2]["properties"][
932 | "in-reply-to"
933 | ][0]
934 | assert (
935 | "/photo.jpg" == exp_result["items"][2]["properties"]["in-reply-to"][0]["value"]
936 | )
937 | assert "" == exp_result["items"][2]["properties"]["in-reply-to"][0]["alt"]
938 |
939 | # img with u-* and h-* example
940 | assert "h-cite" in result["items"][3]["properties"]["in-reply-to"][0]["type"]
941 | assert (
942 | "/photo.jpg"
943 | == result["items"][3]["properties"]["in-reply-to"][0]["properties"]["photo"][0]
944 | )
945 | assert "/photo.jpg" == result["items"][3]["properties"]["in-reply-to"][0]["value"]
946 | assert "alt" not in result["items"][3]["properties"]["in-reply-to"][0]
947 |
948 | assert "h-cite" in exp_result["items"][3]["properties"]["in-reply-to"][0]["type"]
949 | assert (
950 | "/photo.jpg"
951 | == exp_result["items"][3]["properties"]["in-reply-to"][0]["properties"][
952 | "photo"
953 | ][0]
954 | )
955 | assert (
956 | "/photo.jpg" == exp_result["items"][3]["properties"]["in-reply-to"][0]["value"]
957 | )
958 | assert "alt" not in exp_result["items"][3]["properties"]["in-reply-to"][0]
959 |
960 | assert "h-cite" in result["items"][4]["properties"]["in-reply-to"][0]["type"]
961 | assert {"alt": "alt text", "value": "/photo.jpg"} == result["items"][4][
962 | "properties"
963 | ]["in-reply-to"][0]["properties"]["photo"][0]
964 | assert "/photo.jpg" == result["items"][4]["properties"]["in-reply-to"][0]["value"]
965 | assert "alt" in result["items"][4]["properties"]["in-reply-to"][0]
966 |
967 | assert "h-cite" in exp_result["items"][4]["properties"]["in-reply-to"][0]["type"]
968 | assert (
969 | "/photo.jpg"
970 | == exp_result["items"][4]["properties"]["in-reply-to"][0]["properties"][
971 | "photo"
972 | ][0]["value"]
973 | )
974 | assert (
975 | "/photo.jpg" == exp_result["items"][4]["properties"]["in-reply-to"][0]["value"]
976 | )
977 | assert (
978 | "alt text"
979 | == exp_result["items"][4]["properties"]["in-reply-to"][0]["properties"][
980 | "photo"
981 | ][0]["alt"]
982 | )
983 | assert "alt text" == exp_result["items"][4]["properties"]["in-reply-to"][0]["alt"]
984 |
985 | assert "h-cite" in result["items"][5]["properties"]["in-reply-to"][0]["type"]
986 | assert {"alt": "", "value": "/photo.jpg"} == result["items"][5]["properties"][
987 | "in-reply-to"
988 | ][0]["properties"]["photo"][0]
989 | assert "/photo.jpg" == result["items"][5]["properties"]["in-reply-to"][0]["value"]
990 | assert "alt" in result["items"][5]["properties"]["in-reply-to"][0]
991 |
992 | assert "h-cite" in exp_result["items"][5]["properties"]["in-reply-to"][0]["type"]
993 | assert (
994 | "/photo.jpg"
995 | == exp_result["items"][5]["properties"]["in-reply-to"][0]["properties"][
996 | "photo"
997 | ][0]["value"]
998 | )
999 | assert (
1000 | "/photo.jpg" == exp_result["items"][5]["properties"]["in-reply-to"][0]["value"]
1001 | )
1002 | assert (
1003 | ""
1004 | == exp_result["items"][5]["properties"]["in-reply-to"][0]["properties"][
1005 | "photo"
1006 | ][0]["alt"]
1007 | )
1008 | assert "" == exp_result["items"][5]["properties"]["in-reply-to"][0]["alt"]
1009 |
1010 |
1011 | def test_photo_with_srcset():
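# srcset is parsed into a dict keyed by each candidate's width or density
# descriptor (e.g. "480w", "1.5x"); candidate URLs may themselves contain
# commas, and relative URLs resolve against <base> when one is present.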
1012 | result = parse_fixture("img_with_srcset.html")
1013 |
1014 | assert result["items"][0]["properties"]["photo"][0]["srcset"] == {
1015 | "480w": "elva-fairy-480w.jpg",
1016 | "800w": "elva-fairy-800w.jpg",
1017 | }
1018 | assert result["items"][1]["properties"]["photo"][0]["srcset"] == {
1019 | "1x": "elva-fairy-320w.jpg",
1020 | "1.5x": "elva-fairy-480w.jpg",
1021 | "2x": "elva-fairy-640w.jpg",
1022 | }
1023 | assert (
1024 | result["items"][1]["properties"]["photo"][0]["srcset"]["2x"]
1025 | != "elva-fairy-2w.jpg"
1026 | )
1027 | for i in range(2, 7):
1028 | assert result["items"][i]["properties"]["photo"][0]["srcset"] == {
1029 | "1x": "elva-fairy,320w.jpg",
1030 | "1.5x": "elva-fairy,480w.jpg",
1031 | }
1032 | assert result["items"][7]["properties"]["photo"][0]["srcset"] == {
1033 | "1x": "elva-fairy,320w.jpg",
1034 | }
1035 | assert result["items"][8]["properties"]["photo"][0]["srcset"] == {
1036 | "1x": "elva-fairy,320w.jpg",
1037 | "1.5x": "elva-fairy,480w.jpg",
1038 | "2x": "elva-fairy,640w.jpg",
1039 | }
1040 |
1041 | result = parse_fixture("img_with_srcset_with_base.html")
1042 |
1043 | assert result["items"][0]["properties"]["photo"][0]["srcset"] == {
1044 | "480w": "https://example.com/elva-fairy-480w.jpg",
1045 | "800w": "https://example.com/elva-fairy-800w.jpg",
1046 | }
1047 |
1048 |
1049 | def test_parse_id():
1050 | result = parse_fixture("parse_id.html")
1051 | assert "recentArticles" == result["items"][0]["id"]
1052 | assert "article" == result["items"][0]["children"][0]["id"]
1053 | assert "id" not in result["items"][0]["children"][1]
1054 | assert "theAuthor" == result["items"][0]["properties"]["author"][0]["id"]
1055 |
1056 |
1057 | # unicode tests
1058 |
1059 |
1060 | def get_all_files():
1061 | all_files = []
1062 |
1063 | for dir_, _, files in os.walk(TEST_DIR):
1064 | for filename in files:
1065 | rel_dir = os.path.relpath(dir_, TEST_DIR)
1066 | all_files.append(os.path.join(rel_dir, filename))
1067 |
1068 | return all_files
1069 |
1070 |
1071 | def assert_unicode_everywhere(obj):
1072 | if isinstance(obj, dict):
1073 | for k, v in obj.items():
1074 | assert not isinstance(k, bytes), "key=%r; type=%r" % (k, type(k))
1075 | assert_unicode_everywhere(v)
1076 | elif isinstance(obj, list):
1077 | for v in obj:
1078 | assert_unicode_everywhere(v)
1079 |
1080 | assert not isinstance(obj, bytes), "value=%r; type=%r" % (obj, type(obj))
1081 |
1082 |
1083 | def check_unicode(filename, jsonblob):
1084 | assert_unicode_everywhere(jsonblob)
1085 |
1086 |
1087 | def test_unicode_everywhere():
1088 | """make sure everything is unicode"""
1089 |
1090 | for h in get_all_files():
1091 | result = parse_fixture(h)
1092 | check_unicode(h, result)
1093 |
1094 |
1095 | def test_input_tree_integrity():
1096 | """make sure that if we parse a BS4 soup, our modifications do not leak into the document represented by it"""
1097 |
1098 | for path in get_all_files():
1099 | with open(os.path.join(TEST_DIR, path)) as f:
1100 | soup = BeautifulSoup(f, features="lxml")
1101 | html1 = soup.prettify()
1102 | p = Parser(doc=soup, html_parser="lxml")
1103 | html2 = soup.prettify()
1104 | make_labelled_cmp("tree_integrity_" + path)(html1, html2)
1105 |
1106 |
1107 | def make_labelled_cmp(label):
1108 | def f(html1, html2):
1109 | assert html1 == html2
1110 |
1111 | f.description = label
1112 | return f
1113 |
1114 |
1115 | def test_all_u_cases():
1116 | """test variations of u- parsing and that relative urls are always resolved"""
1117 |
1118 | URL_COUNT = 28
1119 | result = parse_fixture("u_all_cases.html")
1120 |
1121 | assert URL_COUNT == len(result["items"][0]["properties"]["url"])
1122 | for i in range(URL_COUNT):
1123 | make_labelled_cmp("all_u_cases_" + str(i))(
1124 | "http://example.com/test", result["items"][0]["properties"]["url"][i]
1125 | )
1126 |
1127 |
1128 | def test_filtered_roots():
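# filter_roots=True applies mf2py's built-in list of root class names to
# filter out; passing a set (as below) filters those class names instead.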
1129 | result = parse_fixture("filter_roots.html")
1130 | assert len(result["items"]) == 8
1131 |
1132 | result = parse_fixture("filter_roots.html", filter_roots=True)
1133 | assert len(result["items"]) == 1
1134 |
1135 | result = parse_fixture(
1136 | "filter_roots_custom.html", filter_roots={"foo", "bar", "bat", "baz"}
1137 | )
1138 | assert len(result["items"]) == 1
1139 |
1140 |
1141 | def test_metaformats_flag_false():
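# metaformats parsing is opt-in: without metaformats=True, a page whose only
# metadata is OGP <meta> tags produces no items at all.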
1142 | result = parse_fixture("metaformats_ogp.html")
1143 | assert result["items"] == []
1144 |
1145 |
1146 | def test_metaformats_title_only():
1147 | result = parse_fixture("base.html", metaformats=True)
1148 | assert result["items"] == [
1149 | {
1150 | "type": ["h-entry"],
1151 | "properties": {
1152 | "name": ["Hello World"],
1153 | },
1154 | "source": "metaformats",
1155 | }
1156 | ]
1157 |
1158 |
1159 | def test_metaformats_ogp():
1160 | result = parse_fixture("metaformats_ogp.html", metaformats=True)
1161 | assert result["items"] == [
1162 | {
1163 | "type": ["h-entry"],
1164 | "properties": {
1165 | "name": ["Titull foo"],
1166 | "summary": ["Descrypshun bar"],
1167 | "photo": ["http://example.com/baz.jpg"],
1168 | "audio": ["http://example.com/biff.mp3"],
1169 | "video": ["http://example.com/boff.mov"],
1170 | "author": ["http://tantek.com/me"],
1171 | "published": ["2023-01-02T03:04Z"],
1172 | "updated": ["2023-01-02T05:06Z"],
1173 | },
1174 | "source": "metaformats",
1175 | }
1176 | ]
1177 |
1178 |
1179 | def test_metaformats_twitter():
1180 | result = parse_fixture("metaformats_twitter.html", metaformats=True)
1181 | assert result["items"] == [
1182 | {
1183 | "type": ["h-entry"],
1184 | "properties": {
1185 | "name": ["Titull foo"],
1186 | "summary": ["Descrypshun bar"],
1187 | "photo": ["http://tantek.com/baz.jpg"],
1188 | },
1189 | "source": "metaformats",
1190 | }
1191 | ]
1192 |
1193 |
1194 | def test_metaformats_html_meta():
1195 | result = parse_fixture("metaformats_html_meta.html", metaformats=True)
1196 | assert result["items"] == [
1197 | {
1198 | "type": ["h-entry"],
1199 | "properties": {
1200 | "name": ["Hello World"],
1201 | "summary": ["Descrypshun bar"],
1202 | },
1203 | "source": "metaformats",
1204 | }
1205 | ]
1206 |
1207 |
1208 | def test_language():
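# lang is inherited from the closest ancestor that sets it; e-* content
# entries carry the language in effect for their own element.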
1209 | result = parse_fixture("language.html")
1210 | assert result["items"][0]["lang"] == "it"
1211 | assert result["items"][1]["lang"] == "it"
1212 | assert result["items"][1]["properties"]["content"][0]["lang"] == "en"
1213 | assert result["items"][1]["properties"]["content"][1]["lang"] == "it"
1214 | assert result["items"][2]["lang"] == "sv"
1215 | assert result["items"][2]["properties"]["content"][0]["lang"] == "en"
1216 | assert result["items"][2]["properties"]["content"][1]["lang"] == "sv"
1217 |
1218 |
1219 | def test_parser_object():
1220 | with open(os.path.join(TEST_DIR, "festivus.html")) as f:
1221 | p = Parser(doc=f)
1222 | assert len(p.to_dict(filter_by_type="h-card")) == 3
1223 | assert len(p.to_dict(filter_by_type="h-entry")) == 4
1224 | assert (
1225 | p.to_json(filter_by_type="h-card")
1226 | == '[{"type": ["h-card"], "properties": {"name": ["Jerry"]}}, {"type": '
1227 | '["h-card"], "properties": {"name": ["Frank"]}}, {"type": ["h-card"], '
1228 | '"properties": {"name": ["Cosmo"]}}]'
1229 | )
1230 |
--------------------------------------------------------------------------------