Hey Brian.
9 |
9 |
10 | Tom Morris
11 |
12 |
13 | fake
14 | +44 1234 567890
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/test/examples/value_name_whitespace.html:
--------------------------------------------------------------------------------
1 |
4 |
5 |
10 |
11 |
12 |
Hello
13 | World
14 |
15 |
16 |
19 |
20 |
24 |
25 |
26 |
Hello
27 | World
28 |
29 |
30 |
33 |
34 |
37 |
38 |
39 |
40 |
One
41 | Two
42 | Three
43 |
44 |
45 |
46 |
47 |
48 |
One
49 |
Two
50 |
Three
51 |
52 |
53 |
54 |
55 |
56 | Hello World
57 |
58 | one
59 | two
60 | three
61 |
62 |
63 |
64 |
65 |
66 |
67 | Correct name
68 |
69 | Correct summary
70 |
71 |
72 |
--------------------------------------------------------------------------------
/test/test_dom_addins.py:
--------------------------------------------------------------------------------
1 | from mf2py.parser import Parser
2 |
3 |
def test_getElementsByClassName():
    """Check that the parsed DOM contains exactly one ``u-url`` element.

    The fixture is opened in a ``with`` block so the file handle is closed
    deterministically rather than leaked until garbage collection.
    """
    with open("test/examples/person_with_url.html") as fixture:
        p = Parser(doc=fixture)
    dom = p.__doc__
    matches = dom.find_all(class_="u-url")
    assert len(matches) == 1
    assert matches[0]["class"] == ["u-url"]
10 |
--------------------------------------------------------------------------------
/test/test_parser.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | import sys
4 | from unittest import TestCase, mock
5 |
6 | import bs4
7 | from bs4 import BeautifulSoup
8 |
9 | from mf2py import Parser
10 |
# Lift unittest's diff truncation so failing comparisons are shown in full.
TestCase.maxDiff = None


# Directory (relative to the repository root) that holds the HTML fixtures.
TEST_DIR = "test/examples/"
15 |
16 |
def parse_fixture(path, **kwargs):
    """Parse the fixture at ``TEST_DIR/path`` and return the result as a dict.

    The file is parsed with the ``html5lib`` HTML parser; any extra keyword
    arguments are forwarded unchanged to ``Parser``.
    """
    fixture_path = os.path.join(TEST_DIR, path)
    with open(fixture_path) as fixture:
        parser = Parser(doc=fixture, html_parser="html5lib", **kwargs)
        return parser.to_dict()
21 |
22 |
def test_empty():
    """A parser built without any input still serializes to a dict."""
    p = Parser()
    # ``type(p) is not None`` can never fail, so assert on the instance itself.
    assert isinstance(p, Parser)
    assert isinstance(p.to_dict(), dict)
27 |
28 |
def test_open_file():
    """A parser can be built from an open file object."""
    with open(os.path.join(TEST_DIR, "empty.html")) as f:
        p = Parser(doc=f)

    assert p.__doc__ is not None
    # ``type(p) is not None`` can never fail, so assert on the instance itself.
    assert isinstance(p, Parser)
    assert isinstance(p.to_dict(), dict)
36 |
37 |
def test_doc_tag():
    """Strings, BeautifulSoup documents and BeautifulSoup tags are all parsed."""
    # The markup must contain an <article> carrying ``h-entry``: the checks
    # below read ``soup.article`` and expect "h-entry" among the item types.
    doc = """<article class="h-entry"></article>"""
    soup = BeautifulSoup(doc, "html5lib")
    for source in (doc, soup, soup.article):
        parsed = Parser(source).to_dict()
        assert "h-entry" in parsed["items"][0]["type"]
48 |
49 |
@mock.patch("requests.get")
def test_user_agent(getter):
    """The default user agent is sent on fetch and can be overridden.

    ``requests.get`` is patched, so no network request is made.
    """
    ua_expect = "mf2py - microformats2 parser for python"
    assert Parser.useragent.startswith(ua_expect)

    resp = mock.MagicMock()
    resp.content = b""
    resp.text = ""
    resp.headers = {}
    getter.return_value = resp

    Parser(url="http://example.com")
    getter.assert_called_with(
        "http://example.com", headers={"User-Agent": Parser.useragent}
    )

    # ``useragent`` is class-level state shared by every test. Remember the
    # real default (which only *starts with* ``ua_expect``) and restore it
    # even if the assertion below fails.
    default_useragent = Parser.useragent
    try:
        Parser.useragent = "something else"
        assert Parser.useragent == "something else"
    finally:
        Parser.useragent = default_useragent
70 |
71 |
def test_base():
    """Parsing ``base.html`` sets the parser URL to ``http://tantek.com/``."""
    fixture_path = os.path.join(TEST_DIR, "base.html")
    with open(fixture_path) as fixture:
        parser = Parser(doc=fixture)

    assert parser.__url__ == "http://tantek.com/"
77 |
78 |
def test_simple_parse():
    """The first item's properties consist solely of the expected name."""
    first_item = parse_fixture("simple_person_reference.html")["items"][0]
    assert first_item["properties"] == {"name": ["Frances Berriman"]}
82 |
83 |
def test_simple_person_reference_same_element():
    """The same-element fixture yields only the expected name property."""
    parsed = parse_fixture("simple_person_reference_same_element.html")
    first_item = parsed["items"][0]
    assert first_item["properties"] == {"name": ["Frances Berriman"]}
87 |
88 |
def test_person_with_url():
    """The first item carries both the expected name and URL."""
    props = parse_fixture("person_with_url.html")["items"][0]["properties"]
    assert props["name"] == ["Tom Morris"]
    assert props["url"] == ["http://tommorris.org/"]
93 |
94 |
def test_vcp():
    """The ``tel`` property of the value-class fixture is parsed as expected."""
    props = parse_fixture("value_class_person.html")["items"][0]["properties"]
    assert props["tel"] == ["+44 1234 567890"]
98 |
99 |
def test_multiple_root_classnames():
    """The fixture yields one item typed both ``h-entry`` and ``h-as-note``."""
    items = parse_fixture("nested_multiple_classnames.html")["items"]
    assert len(items) == 1
    # compared as sets because the order of the types does not matter
    assert set(items[0]["type"]) == {"h-entry", "h-as-note"}
105 |
106 |
def test_property_nested_microformat():
    """Nested microformats appear as property values of the single root item."""
    items = parse_fixture("nested_multiple_classnames.html")["items"]

    assert len(items) == 1
    props = items[0]["properties"]
    assert "author" in props

    author_props = props["author"][0]["properties"]
    assert author_props["name"][0] == "Tom Morris"
    assert props["reviewer"][0]["properties"]["name"][0] == "Tom Morris"
    assert author_props["adr"][0]["properties"]["city"][0] == "London"
126 |
127 |
def test_plain_child_microformat():
    """The single root item has exactly one child, named "Some Citation"."""
    items = parse_fixture("nested_multiple_classnames.html")["items"]

    assert len(items) == 1
    root = items[0]
    assert "children" in root
    children = root["children"]
    assert len(children) == 1
    assert children[0]["properties"]["name"][0] == "Some Citation"
135 |
136 |
def test_datetime_parsing():
    """Datetime properties of the first item match the expected strings."""
    props = parse_fixture("datetimes.html")["items"][0]["properties"]
    expectations = (
        ("start", 0, "2014-01-01T12:00:00+0000"),
        ("end", 0, "3014-01-01T18:00:00+0000"),
        ("duration", 0, "P1000Y"),
        ("updated", 0, "2011-08-26T00:01:21+0000"),
        ("updated", 1, "2011-08-26T00:01:21+0000"),
    )
    for prop, position, expected in expectations:
        assert props[prop][position] == expected
144 |
145 |
def test_datetime_vcp_parsing():
    """Check datetime values across the sixteen items of ``datetimes.html``."""
    items = parse_fixture("datetimes.html")["items"]
    assert len(items) == 16

    # (item index, property, position within the property, expected value)
    expectations = (
        (1, "published", 0, "3014-01-01 01:21Z"),
        (2, "updated", 0, "2014-03-11 09:55"),
        (3, "published", 0, "2014-01-30 15:28"),
        (4, "published", 0, "9999-01-14T11:52+0800"),
        (5, "published", 0, "2014-06-01 12:30-0600"),
        (8, "start", 0, "2014-06-01 12:30-0600"),
        (9, "start", 0, "2014-06-01 12:30-0600"),
        (10, "start", 0, "2014-06-01 00:30-0600"),
        (10, "end", 0, "2014-06-01 12:15"),
        (10, "start", 1, "2014-06-01 00:30-0600"),
        (10, "end", 1, "2014-06-01 12:15"),
        (11, "start", 0, "2016-03-02 00:30-0600"),
        (12, "start", 0, "2014-06-01 12:30-600"),
        (13, "start", 0, "2014-06-01 12:30+600"),
        (14, "start", 0, "2014-06-01 12:30Z"),
        (15, "start", 0, "2014-06-01 12:30-600"),
    )
    for index, prop, position, expected in expectations:
        assert items[index]["properties"][prop][position] == expected
165 |
166 |
def test_dt_end_implied_date():
    """Test that events with dt-start and dt-end use the implied date rule
    http://microformats.org/wiki/value-class-pattern#microformats2_parsers
    for times without dates"""
    items = parse_fixture("datetimes.html")["items"]

    # event without a timezone
    without_tz = items[6]["properties"]
    assert without_tz["start"][0] == "2014-05-21 18:30"
    assert without_tz["end"][0] == "2014-05-21 19:30"

    # event with a timezone
    with_tz = items[7]["properties"]
    assert with_tz["start"][0] == "2014-06-01 12:30-0600"
    assert with_tz["end"][0] == "2014-06-01 19:30-0600"
180 |
181 |
182 | def test_embedded_parsing():
183 | result = parse_fixture("embedded.html")
184 | assert (
185 | result["items"][0]["properties"]["content"][0]["html"]
186 | == "
Blah blah blah blah blah.
\n
Blah.
\n
Blah blah blah.
"
187 | )
188 | assert (
189 | result["items"][0]["properties"]["content"][0]["value"]
190 | == "Blah blah blah blah blah.\n\nBlah.\n\nBlah blah blah."
191 | )
192 |
193 |
def test_embedded_exposed_dom():
    """With ``expose_dom=True`` content carries a ``dom`` Tag and no ``html``."""
    parsed = parse_fixture("embedded.html", expose_dom=True)
    embedded = parsed["items"][0]["properties"]["content"][0]
    assert "html" not in embedded
    assert isinstance(embedded["dom"], bs4.element.Tag)
199 |
200 |
def test_hoisting_nested_hcard():
    """The nested h-card and h-cite become properties of the single h-entry."""
    author = {
        "properties": {"name": ["KP1"]},
        "type": ["h-card"],
        "value": "KP1",
    }
    in_reply_to = {
        "properties": {"name": ["KP"]},
        "type": ["h-cite"],
        "value": "KP",
    }
    expected = [
        {
            "properties": {"author": [author], "in-reply-to": [in_reply_to]},
            "type": ["h-entry"],
        }
    ]
    parsed = parse_fixture("nested_hcards.html")
    assert expected == parsed["items"]
221 |
222 |
def test_html_tag_class():
    """The first item is an ``h-feed`` whose first two children are named."""
    feed = parse_fixture("hfeed_on_html_tag.html")["items"][0]
    assert ["h-feed"] == feed["type"]

    for position, entry_name in enumerate(("entry1", "entry2")):
        assert [entry_name] == feed["children"][position]["properties"]["name"]
229 |
230 |
def test_string_strip():
    """The parsed name equals "Tom Morris" exactly."""
    props = parse_fixture("string_stripping.html")["items"][0]["properties"]
    assert "Tom Morris" == props["name"][0]
234 |
235 |
def test_template_parse():
    """The template-tag fixture produces no items."""
    items = parse_fixture("template_tag.html")["items"]
    assert 0 == len(items)
239 |
240 |
241 | def test_template_tag_inside_e_value():
242 | result = parse_fixture("template_tag_inside_e_value.html")
243 | assert (
244 | "This is a Test with a
template
tag after this:"
245 | == result["items"][0]["properties"]["content"][0]["html"]
246 | )
247 | assert (
248 | "This is a Test with a template tag after this:"
249 | == result["items"][0]["properties"]["content"][0]["value"]
250 | )
251 |
252 |
def test_ordering_dedup():
    """test that classes are deduped and alphabetically ordered"""

    parsed = parse_fixture("ordering_dedup.html")
    first = parsed["items"][0]
    props = first["properties"]
    rel_urls = parsed["rel-urls"]

    assert ["h-entry", "h-feed", "h-product", "h-x-test"] == first["type"]
    assert ["example.com", "example.com/2"] == props["url"]
    assert ["name", "URL name"] == props["name"]
    assert ["author", "bookmark", "me"] == rel_urls["example.com/rel"]["rels"]
    assert "de" == rel_urls["example.com/lang"]["hreflang"]
263 |
264 |
def test_class_names_format():
    """test that only classes with letters and possibly numbers in the vendor prefix part are used"""

    first = parse_fixture("class_names_format.html")["items"][0]
    assert ["h-feed", "h-p3k-entry", "h-x-test"] == first["type"]

    props = first["properties"]
    # per property family: names that must be present, then names that must not
    families = (
        (("url", "p3k-url"), ("Url", "-url", "url-")),
        (("name", "p3k-name"), ("nAme", "-name", "name-")),
    )
    for accepted, rejected in families:
        for key in accepted:
            assert key in props
        for key in rejected:
            assert key not in props
282 |
283 |
def test_area_uparsing():
    """The first item has url/name properties plus shape and coords keys."""
    first = parse_fixture("area.html")["items"][0]
    expected_props = {"url": ["http://suda.co.uk"], "name": ["Brian Suda"]}
    assert expected_props == first["properties"]
    for key in ("shape", "coords"):
        assert key in first
291 |
292 |
def test_src_equiv():
    """Every item exposes ``x-example`` with the same URL as first value."""
    parsed = parse_fixture("test_src_equiv.html")
    for entry in parsed["items"]:
        props = entry["properties"]
        assert "x-example" in props
        assert "http://example.org/" == props["x-example"][0]
298 |
299 |
def test_rels():
    """``rels`` and ``rel-urls`` of ``rel.html`` match the expected mappings."""
    expected_rels = {
        "in-reply-to": ["http://example.com/1", "http://example.com/2"],
        "author": ["http://example.com/a", "http://example.com/b"],
        "alternate": ["http://example.com/fr"],
        "home": ["http://example.com/fr"],
    }
    expected_rel_urls = {
        "http://example.com/1": {"text": "post 1", "rels": ["in-reply-to"]},
        "http://example.com/2": {"text": "post 2", "rels": ["in-reply-to"]},
        "http://example.com/a": {"text": "author a", "rels": ["author"]},
        "http://example.com/b": {"text": "author b", "rels": ["author"]},
        "http://example.com/fr": {
            "text": "French mobile homepage",
            "media": "handheld",
            "rels": ["alternate", "home"],
            "hreflang": "fr",
        },
    }

    parsed = parse_fixture("rel.html")
    assert expected_rels == parsed["rels"]
    assert expected_rel_urls == parsed["rel-urls"]
320 |
321 |
def test_alternates():
    """``alternates`` of ``rel.html`` holds the single expected entry."""
    french_homepage = {
        "url": "http://example.com/fr",
        "media": "handheld",
        "text": "French mobile homepage",
        "rel": "home",
        "hreflang": "fr",
    }
    parsed = parse_fixture("rel.html")
    assert [french_homepage] == parsed["alternates"]
333 |
334 |
def test_enclosures():
    """The enclosure link is reported in both ``rels`` and ``rel-urls``."""
    movie_url = "http://example.com/movie.mp4"
    parsed = parse_fixture("rel_enclosure.html")

    assert {"enclosure": [movie_url]} == parsed["rels"]

    expected_rel_urls = {
        movie_url: {
            "rels": ["enclosure"],
            "text": "my movie",
            "type": "video/mpeg",
        }
    }
    assert expected_rel_urls == parsed["rel-urls"]
345 |
346 |
def test_empty_href():
    """Every item's ``url`` equals the URL the parser was given."""
    base_url = "http://foo.com"
    parsed = parse_fixture("hcard_with_empty_url.html", url=base_url)

    for card in parsed["items"]:
        assert [base_url] == card["properties"]["url"]
352 |
353 |
def test_link_with_u_url():
    """The first item is an h-card with an empty name and the base URL."""
    expected_card = {
        "type": ["h-card"],
        "properties": {
            "name": [""],
            "url": ["http://foo.com/"],
        },
    }
    parsed = parse_fixture("link_with_u-url.html", url="http://foo.com")
    assert expected_card == parsed["items"][0]
363 |
364 |
def test_broken_url():
    """Check the relative, url and photo values of the broken-URL fixture."""
    parsed = parse_fixture("broken_url.html", url="http://example.com")
    props = parsed["items"][0]["properties"]

    assert props["relative"][0] == "http://example.com/foo.html"
    assert props["url"][0] == "http://www.[w3.org/"
    expected_photo = "http://www.w3].org/20[08/site/images/logo-w3c-mobile-lg"
    assert props["photo"][0] == expected_photo
375 |
376 |
377 | def test_complex_e_content():
378 | """When parsing h-* e-* properties, we should fold {"value":..., "html":...}
379 | into the parsed microformat object, instead of nesting it under an
380 | unnecessary second layer of "value":
381 | """
382 | result = parse_fixture("complex_e_content.html")
383 |
384 | assert {
385 | "type": ["h-entry"],
386 | "properties": {
387 | "content": [
388 | {
389 | "type": ["h-card"],
390 | "properties": {"name": ["Hello"]},
391 | "html": "
Hello
",
392 | "value": "Hello",
393 | }
394 | ],
395 | },
396 | } == result["items"][0]
397 |
398 |
399 | def test_relative_url_in_e():
400 | """When parsing e-* properties, make relative URLs absolute."""
401 | result = parse_fixture("relative_url_in_e.html")
402 |
403 | assert (
404 | '
Cat '
405 | '
'
406 | ) == result["items"][0]["properties"]["content"][0]["html"]
407 |
408 |
def test_nested_values():
    """When parsing nested microformats, check that value is the value of
    the simple property element"""
    expected_author = {
        "properties": {
            "name": ["Kyle"],
            "url": ["http://about.me/kyle"],
        },
        "value": "Kyle",
        "type": ["h-card"],
    }
    expected_like_of = {
        "properties": {
            "name": ["foobar"],
            "url": ["http://example.com/foobar"],
        },
        "value": "http://example.com/foobar",
        "type": ["h-cite"],
    }
    # the plain child is expected without a "value" key
    expected_child = {
        "properties": {
            "name": ["George"],
            "url": ["http://people.com/george"],
        },
        "type": ["h-card"],
    }

    root = parse_fixture("nested_values.html")["items"][0]

    assert expected_author == root["properties"]["author"][0]
    assert expected_like_of == root["properties"]["like-of"][0]
    assert expected_child == root["children"][0]
440 |
441 |
442 | # implied properties tests
443 |
444 |
def test_implied_name():
    """The first seven items all have "Tom Morris" as their first name."""
    items = parse_fixture("implied_properties/implied_properties.html")["items"]

    for index in range(7):
        assert items[index]["properties"]["name"][0] == "Tom Morris"
450 |
451 |
def test_implied_url_with_base_url():
    """Check ``url`` values of ``implied_properties.html`` given a base URL.

    This test used to be named ``test_implied_url``, the same as a later
    test in this module; the later definition replaced it, so it was never
    collected. It has a unique name so that it runs again.
    """
    result = parse_fixture(
        "implied_properties/implied_properties.html", url="http://foo.com/"
    )
    items = result["items"]
    assert items[1]["properties"]["url"][0] == "http://tommorris.org/"
    # img should not have a "url" property
    assert "url" not in items[4]["properties"]
    # href="" is relative to the base url
    assert items[5]["properties"]["url"][0] == "http://foo.com/"
461 |
462 |
def test_implied_photo():
    """Check which items of the implied-photo fixtures get a ``photo``."""
    items = parse_fixture("implied_properties/implied_photo.html")["items"]

    # items 0-11: exactly one photo each
    for index in range(12):
        found = items[index]["properties"]["photo"]
        assert len(found) == 1
        assert found[0] == "http://example.com/photo.jpg"

    # items 12-22: no photo at all
    for index in range(12, 23):
        assert "photo" not in items[index]["properties"]

    relative = parse_fixture("implied_properties/implied_photo_relative_url.html")
    relative_items = relative["items"]

    img_photo = relative_items[0]["properties"]["photo"][0]
    assert img_photo["value"] == "http://example.com/jane-img.jpeg"

    object_photo = relative_items[1]["properties"]["photo"][0]
    assert object_photo == "http://example.com/jane-object.jpeg"
485 |
486 |
def test_implied_url():
    """Check which items of ``implied_url.html`` get a ``url``."""
    items = parse_fixture("implied_properties/implied_url.html")["items"]

    # items 0-11: exactly one url each
    for index in range(12):
        found = items[index]["properties"]["url"]
        assert len(found) == 1
        assert found[0] == "http://example.com"

    # items 12-22: no url at all
    for index in range(12, 23):
        assert "url" not in items[index]["properties"]
498 |
499 |
def test_stop_implied_url():
    """testing that explicit properties cause implied url-parsing to be aborted"""

    items = parse_fixture("implied_properties/stop_implied_url.html")["items"]

    # items 0-5 must not have a url
    for index in range(6):
        assert "url" not in items[index]["properties"]

    # items 6-9 must have exactly this url
    for index in range(6, 10):
        assert items[index]["properties"]["url"] == ["http://example.com/"]
516 |
517 |
def test_implied_nested_photo():
    """Check the first ``photo`` value of items 2, 3, 4 and 6."""
    parsed = parse_fixture(
        "implied_properties/implied_properties.html", url="http://bar.org"
    )
    items = parsed["items"]
    photo_url = "http://tommorris.org/photo.png"

    def first_photo(index):
        return items[index]["properties"]["photo"][0]

    assert first_photo(2) == {"alt": "", "value": photo_url}
    assert first_photo(3) == photo_url
    assert first_photo(4) == {"alt": "Tom Morris", "value": photo_url}
    # src="" is relative to the base url
    assert first_photo(6) == "http://bar.org"
535 |
536 |
def test_implied_nested_photo_alt_name():
    """Item 3 has "Tom Morris" as its first name."""
    items = parse_fixture("implied_properties/implied_properties.html")["items"]
    assert items[3]["properties"]["name"][0] == "Tom Morris"
540 |
541 |
def test_implied_image():
    """Item 4 has the expected photo (with alt text) and name."""
    parsed = parse_fixture("implied_properties/implied_properties.html")
    props = parsed["items"][4]["properties"]

    expected_photo = {
        "alt": "Tom Morris",
        "value": "http://tommorris.org/photo.png",
    }
    assert props["photo"][0] == expected_photo
    assert props["name"][0] == "Tom Morris"
549 |
550 |
def test_implied_name_empty_alt():
    """An empty alt text should not prevent us from including other
    children in the implied name.
    """

    expected_card = {
        "type": ["h-card"],
        "properties": {
            "name": ["@kylewmahan"],
            "url": ["https://twitter.com/kylewmahan"],
            "photo": [{"alt": "", "value": "https://example.org/test.jpg"}],
        },
    }
    parsed = parse_fixture("implied_properties/implied_name_empty_alt.html")
    assert expected_card == parsed["items"][0]
567 |
568 |
def test_relative_datetime():
    """The first ``updated`` value is the expected date and time."""
    parsed = parse_fixture("implied_properties/implied_relative_datetimes.html")
    props = parsed["items"][0]["properties"]
    assert props["updated"][0] == "2015-01-02 05:06"
572 |
573 |
def test_stop_implied_name_nested_h():
    """The first item of the nested-h fixture has no ``name`` property."""
    parsed = parse_fixture("implied_properties/stop_implied_name_nested_h.html")
    props = parsed["items"][0]["properties"]
    assert "name" not in props
577 |
578 |
def test_stop_implied_name_e_content():
    """The first item of the e-content fixture has no ``name`` property."""
    parsed = parse_fixture("implied_properties/stop_implied_name_e_content.html")
    props = parsed["items"][0]["properties"]
    assert "name" not in props
582 |
583 |
def test_stop_implied_name_p_content():
    """The first item of the p-content fixture has no ``name`` property."""
    parsed = parse_fixture("implied_properties/stop_implied_name_p_content.html")
    props = parsed["items"][0]["properties"]
    assert "name" not in props
587 |
588 |
def test_implied_properties_silo_pub():
    """The first item of the silo.pub fixture has no ``name`` property."""
    parsed = parse_fixture("implied_properties/implied_properties_silo_pub.html")
    first = parsed["items"][0]

    # An implied name ('@kylewmahan on Twitter' after whitespace
    # normalisation) used to be checked here; under the new rules no
    # implied name is expected.
    assert "name" not in first["properties"]
600 |
601 |
def test_simple_person_reference_implied():
    """The first item's properties consist solely of the expected name."""
    parsed = parse_fixture("implied_properties/simple_person_reference_implied.html")
    first = parsed["items"][0]
    assert first["properties"] == {"name": ["Frances Berriman"]}
605 |
606 |
def test_implied_name_alt():
    """The first child of the first item is the expected h-card."""
    expected_card = {
        "type": ["h-card"],
        "properties": {
            "name": ["Avatar of Stephen"],
            "photo": [{"alt": "Avatar of", "value": "avatar.jpg"}],
        },
    }
    parsed = parse_fixture("implied_properties/implied_name_alt.html")
    assert parsed["items"][0]["children"][0] == expected_card
616 |
617 |
def test_value_name_whitespace():
    """Check the content value and name of each item in the fixture."""
    items = parse_fixture("value_name_whitespace.html")["items"]

    def check(index, expected_value, expected_name):
        props = items[index]["properties"]
        assert props["content"][0]["value"] == expected_value
        assert props["name"][0] == expected_name

    for index in range(3):
        check(index, "Hello World", "Hello World")

    for index in range(3, 7):
        check(index, "Hello\nWorld", "Hello\nWorld")

    check(7, "Hello\n\nWorld", "Hello\n\nWorld")
    check(8, "One\nTwo\nThree", "One\nTwo\nThree")
    check(9, "One\n\nTwo\n\nThree", "One\n\nTwo\n\nThree")
    check(
        10,
        "Hello World one\n two\n three\n ",
        "Hello World one\n two\n three\n ",
    )
    # only here does the name differ from the content value
    check(11, "Correct name Correct summary", "Correct name")
654 |
655 |
656 | # backcompat tests
657 |
658 |
def test_backcompat_hentry():
    """The backcompat hentry fixture parses to an ``h-entry`` with author,
    name and content."""
    first = parse_fixture("backcompat/hentry.html")["items"][0]
    props = first["properties"]

    assert "h-entry" in first["type"]
    assert "Tom Morris" == props["author"][0]["properties"]["name"][0]
    assert "A Title" == props["name"][0]
    assert "Some Content" == props["content"][0]["value"]
668 |
669 |
def test_backcompat_hproduct():
    """The backcompat hproduct fixture parses to a single ``h-product``."""
    items = parse_fixture("backcompat/hproduct.html")["items"]
    assert 1 == len(items)

    product = items[0]
    props = product["properties"]
    assert ["h-product"] == product["type"]
    assert ["bullshit"] == props["category"]
    assert ["Quacktastic Products"] == props["brand"]
    assert ["#BULLSHIT-001"] == props["identifier"]
    expected_description = "Magical tasty sugar pills that don't do anything."
    assert expected_description == props["description"][0]
    assert ["Tom's Magical Quack Tincture"] == props["name"]
682 |
683 |
def test_backcompat_hproduct_nested_hreview():
    """The first child of the first item is typed ``h-review``."""
    parsed = parse_fixture("backcompat/hproduct_hreview_nested.html")
    first_child = parsed["items"][0]["children"][0]
    assert ["h-review"] == first_child["type"]
687 |
688 |
def test_backcompat_hreview_nested_card_event_product():
    """The review's ``item`` property holds an event, a card and a product."""
    review = parse_fixture("backcompat/hreview_nested_card_event_product.html")[
        "items"
    ][0]
    assert ["h-review"] == review["type"]

    nested = review["properties"]["item"]
    assert 3 == len(nested)

    # expected (type, url, name) in the order the nested items appear
    expectations = (
        ("h-event", "http://example.com/event-url", "event name"),
        ("h-card", "http://example.com/card-url", "card name"),
        ("h-product", "http://example.com/product-url", "product name"),
    )
    for entry, (mf_type, url, name) in zip(nested, expectations):
        assert [mf_type] == entry["type"]
        assert [url] == entry["properties"]["url"]
        assert [name] == entry["properties"]["name"]
709 |
710 |
def test_backcompat_feed_with_rel_bookmark():
    """Confirm that rel=bookmark inside of an h-entry is converted
    to u-url.

    This test used to be named ``test_backcompat_rel_bookmark``, the same
    as a later test in this module; the later definition replaced it, so
    it was never collected. It has a unique name so that it runs again.
    """
    result = parse_fixture("backcompat/feed_with_rel_bookmark.html")
    expected_urls = (
        "/2014/11/24/jump-rope",
        "/2014/11/23/graffiti",
        "/2014/11/21/earth",
        "/2014/11/19/labor",
    )
    for ii, url in enumerate(expected_urls):
        assert ["h-entry"] == result["items"][ii]["type"]
        assert [url] == result["items"][ii]["properties"]["url"]
726 |
727 |
def test_backcompat_rel_bookmark():
    """Confirm that rel=bookmark inside of an hentry and hreview is converted
    to a u-url and original u-url is ignored
    """

    fixtures = (
        "backcompat/hentry_with_rel_bookmark.html",
        "backcompat/hreview_with_rel_tag_bookmark.html",
    )
    expected_urls = [
        "https://example.com/bookmark",
        "https://example.com/bookmark-url",
    ]

    # parse every fixture up front, then check each result
    parsed_fixtures = [parse_fixture(name) for name in fixtures]

    for parsed in parsed_fixtures:
        assert expected_urls == parsed["items"][0]["properties"]["url"]
745 |
746 |
def test_backcompat_rel_tag():
    """Confirm that rel=tag inside of an hentry is converted
    to a p-category and the last path segment of the href is used.
    """

    fixtures = (
        "backcompat/hentry_with_rel_tag.html",
        "backcompat/hfeed_with_rel_tag.html",
        "backcompat/hrecipe_with_rel_tag.html",
        "backcompat/hreview_with_rel_tag_bookmark.html",
    )
    expected_categories = ["cat", "dog", "mountain lion", "mouse", "meerkat"]

    # parse every fixture up front, then check each result
    parsed_fixtures = [parse_fixture(name) for name in fixtures]
    for parsed in parsed_fixtures:
        first_props = parsed["items"][0]["properties"]
        assert expected_categories == first_props["category"]
764 |
765 |
def test_backcompat_rel_tag_entry_title():
    """Confirm that other backcompat properties on a rel=tag are parsed"""

    parsed = parse_fixture("backcompat/hentry_with_rel_tag_entry_title.html")
    props = parsed["items"][0]["properties"]
    assert ["cat"] == props["category"]
    assert ["rhinoceros"] == props["name"]
772 |
773 |
def test_backcompat_rel_multiple_root():
    """Confirm that rel=tag and rel=bookmark inside of an hentry+hreview is parsed correctly"""

    parsed = parse_fixture("backcompat/hreview_hentry_with_rel_tag_bookmark.html")
    items = parsed["items"]

    assert len(items) == 1
    root = items[0]
    for mf_type in ("h-entry", "h-review"):
        assert mf_type in root["type"]

    props = root["properties"]
    expected_categories = ["cat", "dog", "mountain lion", "mouse", "meerkat"]
    assert expected_categories == props["category"]
    expected_urls = [
        "https://example.com/bookmark",
        "https://example.com/bookmark-url",
    ]
    assert expected_urls == props["url"]
790 |
791 |
def test_backcompat_ignore_mf1_root_if_mf2_present():
    """Confirm that mf1 root class is ignored if another mf2 root class is present."""
    parsed = parse_fixture("backcompat/ignore_mf1_root_if_mf2_present.html")
    types = parsed["items"][0]["type"]
    assert "h-entry" not in types
    assert "h-event" in types
797 |
798 |
def test_backcompat_no_implied_properties_mf1_root():
    """Confirm that mf1 root class does not have implied properties"""
    parsed = parse_fixture("backcompat/ignore_mf1_root_if_mf2_present.html")
    first = parsed["items"][0]
    props = first["properties"]

    # NOTE(review): "h-entry" is looked up in the properties and "name" in
    # the types, which looks swapped -- confirm the intent before changing it.
    assert "h-entry" not in props
    assert "name" not in first["type"]
    assert "url" not in props
    assert "photo" not in props
806 |
807 |
def test_backcompat_ignore_mf2_properties_in_mf1_root():
    """Confirm that mf2 properties are ignored in mf1 root class"""
    parsed = parse_fixture("backcompat/ignore_mf2_properties_in_mf1_root.html")
    props = parsed["items"][0]["properties"]
    assert "Correct name" == props["name"][0]
    assert "Correct summary" == props["summary"][0]
813 |
814 |
def test_backcompat_ignore_mf1_properties_in_mf2_root():
    """Confirm that mf1 properties are ignored in mf2 root class"""
    parsed = parse_fixture("backcompat/ignore_mf1_properties_in_mf2_root.html")
    props = parsed["items"][0]["properties"]
    assert "Correct name" == props["name"][0]
    assert "Correct summary" == props["summary"][0]
820 |
821 |
def test_backcompat_nested_mf2_in_mf1():
    """Confirm that mf2 roots nested inside mf1 root are parsed"""
    feed = parse_fixture("backcompat/nested_mf2_in_mf1.html")["items"][0]
    assert "h-feed" == feed["type"][0]

    entry = feed["children"][0]
    assert "h-entry" == entry["type"][0]
    assert "Correct name" == entry["properties"]["name"][0]
    assert "Correct summary" == entry["properties"]["summary"][0]
832 |
833 |
def test_backcompat_nested_mf1_in_mf2():
    """Confirm that mf1 roots nested inside mf2 root are parsed"""
    feed = parse_fixture("backcompat/nested_mf1_in_mf2.html")["items"][0]
    assert "h-feed" == feed["type"][0]

    entry = feed["children"][0]
    assert "h-entry" == entry["type"][0]
    assert "Correct name" == entry["properties"]["name"][0]
    assert "Correct summary" == entry["properties"]["summary"][0]
844 |
845 |
def test_backcompat_nested_mf1_in_mf2_e_content():
    """Confirm that mf1 roots nested inside mf2 root e-content are parsed as authored.

    Checks the ``html`` and ``value`` of the outer mf2 entry's first content
    property, then the type, name and summary of the mf1 entry parsed as the
    outer entry's first child.
    """
    result = parse_fixture("backcompat/nested_mf1_in_mf2_e_content.html")

    mf2_entry = result["items"][0]
    mf1_entry = mf2_entry["children"][0]

    # NOTE(review): the expected-HTML literal below appears to have had its
    # markup stripped when this listing was produced (only text and "\n"
    # escapes remain, split across lines) -- restore it from the repository
    # before relying on this assertion.
    assert (
        '
\nCorrect name\n\nCorrect summary\n
'
        == mf2_entry["properties"]["content"][0]["html"]
    )

    # The plain-text value joins the two text runs with a single space.
    assert (
        "Correct name Correct summary" == mf2_entry["properties"]["content"][0]["value"]
    )

    # The nested mf1 root is still exposed as a child item of the mf2 entry.
    assert "h-entry" == mf1_entry["type"][0]
    assert "Correct name" == mf1_entry["properties"]["name"][0]
    assert "Correct summary" == mf1_entry["properties"]["summary"][0]
865 |
866 |
def test_backcompat_hentry_content_html():
    """Confirm that mf1 entry-content html is parsed as authored without mf2 replacements.

    Compares the ``html`` of the first item's first content property with
    the expected markup string.
    """
    result = parse_fixture("backcompat/hentry_content_html.html")

    entry = result["items"][0]

    # NOTE(review): the expected-HTML literal below appears to have had its
    # markup stripped when this listing was produced (only the text content
    # survives, split across lines) -- restore it from the repository before
    # relying on this assertion.
    assert (
        '
This is a summary
\n
This is mytag inside content.
'
        == entry["properties"]["content"][0]["html"]
    )
877 |
878 |
def test_whitespace_with_tags_inside_property():
    """Only leading/trailing whitespace of textContent is trimmed, never inner.

    Regression test for https://github.com/microformats/mf2py/issues/112
    """
    first_item = parse_fixture("tag_whitespace_inside_p_value.html")["items"][0]
    expected_properties = {"name": ["foo bar"]}
    assert first_item["properties"] == expected_properties
886 |
887 |
def test_plaintext_p_whitespace():
    """Check the plain-text content value of the first three fixture items."""
    items = parse_fixture("plaintext_p_whitespace.html")["items"]
    expected_values = ("foo\nbar baz", "foo\nbar baz", "foo bar\nbaz")
    for index, expected in enumerate(expected_values):
        assert items[index]["properties"]["content"][0]["value"] == expected
893 |
894 |
def test_plaintext_img_whitespace():
    """Check the plain-text content value of three items that contain images."""
    items = parse_fixture("plaintext_img_whitespace.html")["items"]
    expected_values = (
        "selfie At some tourist spot",
        "At another tourist spot",
        "https://example.com/photo.jpg At yet another tourist spot",
    )
    for index, expected in enumerate(expected_values):
        assert items[index]["properties"]["content"][0]["value"] == expected
909 |
910 |
def test_photo_with_alt():
    """Confirm that alt text in img is parsed as a u-* property and implied photo.

    The fixture is parsed twice -- once through ``parse_fixture`` and once
    through a ``Parser`` built directly on the open file -- and both results
    are checked item by item.
    """

    path = "img_with_alt.html"

    result = parse_fixture(path)

    with open(os.path.join(TEST_DIR, path)) as f:
        exp_result = Parser(doc=f, html_parser="html5lib").to_dict()

    def props(parsed, index):
        # Properties dict of the index-th top-level item.
        return parsed["items"][index]["properties"]

    def reply(parsed, index):
        # First "in-reply-to" value of the index-th top-level item.
        return props(parsed, index)["in-reply-to"][0]

    photo_url = "/photo.jpg"

    # simple img with u-*
    assert props(result, 0)["photo"][0] == photo_url
    assert props(exp_result, 0)["photo"][0] == photo_url

    assert props(result, 1)["url"][0] == {"alt": "alt text", "value": photo_url}
    assert props(exp_result, 1)["url"][0]["value"] == photo_url
    assert props(exp_result, 1)["url"][0]["alt"] == "alt text"

    assert reply(result, 2) == {"alt": "", "value": photo_url}
    assert reply(exp_result, 2)["value"] == photo_url
    assert reply(exp_result, 2)["alt"] == ""

    # img with u-* and h-* example
    assert "h-cite" in reply(result, 3)["type"]
    assert reply(result, 3)["properties"]["photo"][0] == photo_url
    assert reply(result, 3)["value"] == photo_url
    assert "alt" not in reply(result, 3)

    assert "h-cite" in reply(exp_result, 3)["type"]
    assert reply(exp_result, 3)["properties"]["photo"][0] == photo_url
    assert reply(exp_result, 3)["value"] == photo_url
    assert "alt" not in reply(exp_result, 3)

    # Items 4 and 5 differ only in the alt text carried by the image.
    for index, alt_text in ((4, "alt text"), (5, "")):
        assert "h-cite" in reply(result, index)["type"]
        assert reply(result, index)["properties"]["photo"][0] == {
            "alt": alt_text,
            "value": photo_url,
        }
        assert reply(result, index)["value"] == photo_url
        assert "alt" in reply(result, index)

        assert "h-cite" in reply(exp_result, index)["type"]
        assert reply(exp_result, index)["properties"]["photo"][0]["value"] == photo_url
        assert reply(exp_result, index)["value"] == photo_url
        assert reply(exp_result, index)["properties"]["photo"][0]["alt"] == alt_text
        assert reply(exp_result, index)["alt"] == alt_text
1009 |
1010 |
def test_photo_with_srcset():
    """Check the ``srcset`` mapping parsed for each photo in the fixtures.

    Covers width ("480w") and density ("1x") descriptors, URLs containing
    commas, and -- in the second fixture -- full example.com URLs.
    """

    def srcset_of(parsed, index):
        # srcset mapping of the first photo of the index-th item.
        return parsed["items"][index]["properties"]["photo"][0]["srcset"]

    parsed = parse_fixture("img_with_srcset.html")

    assert srcset_of(parsed, 0) == {
        "480w": "elva-fairy-480w.jpg",
        "800w": "elva-fairy-800w.jpg",
    }
    assert srcset_of(parsed, 1) == {
        "1x": "elva-fairy-320w.jpg",
        "1.5x": "elva-fairy-480w.jpg",
        "2x": "elva-fairy-640w.jpg",
    }
    assert srcset_of(parsed, 1)["2x"] != "elva-fairy-2w.jpg"

    comma_pair = {
        "1x": "elva-fairy,320w.jpg",
        "1.5x": "elva-fairy,480w.jpg",
    }
    for index in range(2, 7):
        assert srcset_of(parsed, index) == comma_pair

    assert srcset_of(parsed, 7) == {
        "1x": "elva-fairy,320w.jpg",
    }
    assert srcset_of(parsed, 8) == {
        "1x": "elva-fairy,320w.jpg",
        "1.5x": "elva-fairy,480w.jpg",
        "2x": "elva-fairy,640w.jpg",
    }

    parsed_with_base = parse_fixture("img_with_srcset_with_base.html")

    assert srcset_of(parsed_with_base, 0) == {
        "480w": "https://example.com/elva-fairy-480w.jpg",
        "800w": "https://example.com/elva-fairy-800w.jpg",
    }
1047 |
1048 |
def test_parse_id():
    """Check which parsed items carry an ``id`` key and what its value is."""
    root = parse_fixture("parse_id.html")["items"][0]
    first_child, second_child = root["children"][0], root["children"][1]

    assert root["id"] == "recentArticles"
    assert first_child["id"] == "article"
    assert "id" not in second_child
    assert root["properties"]["author"][0]["id"] == "theAuthor"
1055 |
1056 |
1057 | # unicode tests
1058 |
1059 |
def get_all_files():
    """Return every file found under ``TEST_DIR`` as a path relative to it.

    Files are listed in the order produced by ``os.walk``.
    """
    collected = []

    for root, _subdirs, names in os.walk(TEST_DIR):
        relative_root = os.path.relpath(root, TEST_DIR)
        collected.extend(os.path.join(relative_root, name) for name in names)

    return collected
1069 |
1070 |
def assert_unicode_everywhere(obj):
    """Assert recursively that no ``bytes`` object appears anywhere in *obj*.

    Dict keys and values and list elements are all inspected; any other value
    only has to not be ``bytes`` itself. Raises ``AssertionError`` naming the
    offending key or value.
    """
    # A bytes object is neither dict nor list, so checking it up front is
    # equivalent to checking it after the container branches.
    assert not isinstance(obj, bytes), "value=%r; type=%r" % (obj, type(obj))

    if isinstance(obj, dict):
        for key, value in obj.items():
            assert not isinstance(key, bytes), "key=%r; type=%r" % (key, type(key))
            assert_unicode_everywhere(value)
        return

    if isinstance(obj, list):
        for element in obj:
            assert_unicode_everywhere(element)
1081 |
1082 |
def check_unicode(filename, jsonblob):
    """Assert that *jsonblob* contains no ``bytes`` keys or values.

    *filename* is accepted for the caller's convenience but is not used here.
    """
    assert_unicode_everywhere(jsonblob)
1085 |
1086 |
def test_unicode_everywhere():
    """Parse every fixture file and check that the result contains no bytes."""

    for fixture in get_all_files():
        check_unicode(fixture, parse_fixture(fixture))
1093 |
1094 |
def test_input_tree_integrity():
    """Parsing a BS4 soup must not leak modifications into that soup.

    Each fixture is loaded into a BeautifulSoup tree, serialised, handed to
    ``Parser``, and serialised again; the two serialisations must be equal.
    """

    for path in get_all_files():
        with open(os.path.join(TEST_DIR, path)) as f:
            soup = BeautifulSoup(f, features="lxml")
            before = soup.prettify()
            # Only the constructor call is exercised; the parser is not used.
            Parser(doc=soup, html_parser="lxml")
            after = soup.prettify()
            compare = make_labelled_cmp("tree_integrity_" + path)
            compare(before, after)
1105 |
1106 |
def make_labelled_cmp(label):
    """Build an equality-asserting function tagged with *label*.

    The returned callable takes two values, asserts they are equal, and has
    *label* stored on its ``description`` attribute.
    """

    def compare(html1, html2):
        assert html1 == html2

    setattr(compare, "description", label)
    return compare
1113 |
1114 |
def test_all_u_cases():
    """Every u-* variation in the fixture must yield the same resolved URL.

    The first item is expected to hold exactly ``URL_COUNT`` url values, each
    equal to ``http://example.com/test``.
    """

    URL_COUNT = 28
    urls = parse_fixture("u_all_cases.html")["items"][0]["properties"]["url"]

    assert len(urls) == URL_COUNT
    for index, url in enumerate(urls):
        compare = make_labelled_cmp("all_u_cases_" + str(index))
        compare("http://example.com/test", url)
1126 |
1127 |
def test_filtered_roots():
    """Check the top-level item count with and without ``filter_roots``."""
    # (fixture, extra parser options, expected number of top-level items)
    cases = [
        ("filter_roots.html", {}, 8),
        ("filter_roots.html", {"filter_roots": True}, 1),
        (
            "filter_roots_custom.html",
            {"filter_roots": {"foo", "bar", "bat", "baz"}},
            1,
        ),
    ]
    for fixture, options, expected_count in cases:
        parsed = parse_fixture(fixture, **options)
        assert len(parsed["items"]) == expected_count
1139 |
1140 |
def test_metaformats_flag_false():
    """Without the ``metaformats`` option the OGP fixture yields no items."""
    items = parse_fixture("metaformats_ogp.html")["items"]
    assert items == []
1144 |
1145 |
def test_metaformats_title_only():
    """With ``metaformats=True``, base.html yields one h-entry holding only a name."""
    expected_entry = {
        "type": ["h-entry"],
        "properties": {
            "name": ["Hello World"],
        },
        "source": "metaformats",
    }
    parsed = parse_fixture("base.html", metaformats=True)
    assert parsed["items"] == [expected_entry]
1157 |
1158 |
def test_metaformats_ogp():
    """With ``metaformats=True``, the OGP fixture yields one fully populated h-entry."""
    expected_properties = {
        "name": ["Titull foo"],
        "summary": ["Descrypshun bar"],
        "photo": ["http://example.com/baz.jpg"],
        "audio": ["http://example.com/biff.mp3"],
        "video": ["http://example.com/boff.mov"],
        "author": ["http://tantek.com/me"],
        "published": ["2023-01-02T03:04Z"],
        "updated": ["2023-01-02T05:06Z"],
    }
    expected_entry = {
        "type": ["h-entry"],
        "properties": expected_properties,
        "source": "metaformats",
    }
    parsed = parse_fixture("metaformats_ogp.html", metaformats=True)
    assert parsed["items"] == [expected_entry]
1177 |
1178 |
def test_metaformats_twitter():
    """With ``metaformats=True``, the Twitter fixture yields one h-entry."""
    expected_entry = {
        "type": ["h-entry"],
        "properties": {
            "name": ["Titull foo"],
            "summary": ["Descrypshun bar"],
            "photo": ["http://tantek.com/baz.jpg"],
        },
        "source": "metaformats",
    }
    parsed = parse_fixture("metaformats_twitter.html", metaformats=True)
    assert parsed["items"] == [expected_entry]
1192 |
1193 |
def test_metaformats_html_meta():
    """With ``metaformats=True``, the HTML-meta fixture yields one h-entry."""
    expected_entry = {
        "type": ["h-entry"],
        "properties": {
            "name": ["Hello World"],
            "summary": ["Descrypshun bar"],
        },
        "source": "metaformats",
    }
    parsed = parse_fixture("metaformats_html_meta.html", metaformats=True)
    assert parsed["items"] == [expected_entry]
1206 |
1207 |
def test_language():
    """Check the ``lang`` reported on items and on their content values."""
    items = parse_fixture("language.html")["items"]

    # (item index, item lang, langs of that item's content values in order)
    expectations = [
        (0, "it", []),
        (1, "it", ["en", "it"]),
        (2, "sv", ["en", "sv"]),
    ]
    for index, item_lang, content_langs in expectations:
        item = items[index]
        assert item["lang"] == item_lang
        for position, content_lang in enumerate(content_langs):
            assert item["properties"]["content"][position]["lang"] == content_lang
1217 |
1218 |
def test_parser_object():
    """Exercise ``to_dict`` / ``to_json`` with ``filter_by_type`` on one Parser."""
    expected_cards_json = (
        '[{"type": ["h-card"], "properties": {"name": ["Jerry"]}}, {"type": '
        '["h-card"], "properties": {"name": ["Frank"]}}, {"type": ["h-card"], '
        '"properties": {"name": ["Cosmo"]}}]'
    )
    with open(os.path.join(TEST_DIR, "festivus.html")) as f:
        p = Parser(doc=f)
        cards = p.to_dict(filter_by_type="h-card")
        assert len(cards) == 3
        entries = p.to_dict(filter_by_type="h-entry")
        assert len(entries) == 4
        assert p.to_json(filter_by_type="h-card") == expected_cards_json
1230 |
--------------------------------------------------------------------------------
/test/test_suite.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import json
3 | import os.path
4 | import sys
5 |
6 | from test_parser import check_unicode
7 |
8 | import mf2py
9 |
10 |
def test_mf2tests():
    """Run mf2py over every HTML/JSON pair found under ./testsuite/tests.

    For each ``*.json`` file three directory levels below
    ``./testsuite/tests``, the sibling ``.html`` file is parsed with
    ``mf2py.parse`` and the result is checked for bytes and compared with the
    decoded JSON. If the JSON file cannot be decoded, a "bad file: ..."
    string is compared instead, so the failure names the broken file.
    """
    pattern = os.path.join(".", "testsuite", "tests", "*", "*", "*.json")
    for jsonfile in glob.glob(pattern):
        # Swap the "json" extension for "html" to find the input document.
        htmlfile = jsonfile[:-4] + "html"
        with open(htmlfile) as f:
            p = mf2py.parse(doc=f, url="http://example.com")
        check_unicode(htmlfile, p)
        with open(jsonfile) as jsonf:
            try:
                s = json.load(jsonf)
            except ValueError as err:
                # json.JSONDecodeError and UnicodeDecodeError are both
                # ValueError subclasses. The original concatenated a str with
                # an exception *type*, which raised TypeError and masked the
                # decoding problem.
                s = "bad file: %s (%s)" % (jsonfile, err)
        check_mf2(htmlfile, p, s)
24 |
25 |
def check_mf2(htmlfile, p, s):
    """Assert that parsed output *p* equals the expected structure *s*.

    *htmlfile* is used as the assertion message so that a failure identifies
    which fixture produced the mismatch.
    """
    # TODO ignore extra keys in p that are not in s
    assert p == s, htmlfile
29 |
--------------------------------------------------------------------------------