├── .github
│   └── workflows
│       └── python-package.yml
├── .gitignore
├── .pre-commit-config.yaml
├── CHANGELOG.txt
├── LICENSE.txt
├── MANIFEST.in
├── README.rst
├── mypy.ini
├── pyproject.toml
├── pytest.ini
├── setup.cfg
├── setup.py
├── src
│   └── metadata_parser
│       ├── __init__.py
│       └── py.typed
├── tests
│   ├── __init__.py
│   ├── html_scaffolds
│   │   ├── charset_a.html
│   │   ├── charset_b.html
│   │   ├── charset_c.html
│   │   ├── duplicates.html
│   │   └── simple.html
│   ├── test_document_parsing.py
│   ├── test_ip_tracking.py
│   ├── test_responses.py
│   ├── test_sessions.py
│   └── test_url_parsing.py
├── tox.ini
└── types.txt
/.github/workflows/python-package.yml:
--------------------------------------------------------------------------------
1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
3 |
4 | name: Python package
5 |
6 | on:
7 | push:
8 | branches: [ main ]
9 | pull_request:
10 | branches: [ main ]
11 |
12 | jobs:
13 | build:
14 | runs-on: ${{ matrix.os }}
15 | strategy:
16 | matrix:
17 | os:
18 | - "ubuntu-22.04"
19 | python-version:
20 | - "3.7"
21 | - "3.8"
22 | - "3.9"
23 | - "3.10"
24 | - "3.11"
25 | - "3.12"
26 | - "3.13"
27 | steps:
28 | - uses: actions/checkout@v3
29 | - name: Set up Python ${{ matrix.python-version }}
30 | uses: actions/setup-python@v4
31 | with:
32 | python-version: ${{ matrix.python-version }}
33 | - name: Install dependencies
34 | run: |
35 | python -m pip install --upgrade pip
36 | pip install --upgrade tox setuptools flake8 pytest
37 | pip list
38 | - name: Test with pytest
39 | run: |
40 | tox -e py -- ${{ matrix.pytest-args }}
41 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | tests/private/*
2 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | # See https://pre-commit.com for more information
2 | # See https://pre-commit.com/hooks.html for more hooks
3 | repos:
4 | - repo: https://github.com/psf/black
5 | rev: 24.8.0
6 | hooks:
7 | - id: black
8 | - repo: https://github.com/pycqa/flake8
9 | rev: 7.1.1
10 | hooks:
11 | - id: flake8
--------------------------------------------------------------------------------
/CHANGELOG.txt:
--------------------------------------------------------------------------------
1 | 1.0 (unreleased)
2 | 1.0 will include an api overhaul and remove all deprecations
3 |
4 |
5 | 0.13.0
6 | * drop py36; no CI test options remain for it, due to github's deprecation of ubuntu-20.04
7 | * `_coerce_validate_strategy` (invoked by `get_metadatas`) will now raise a
8 | ValueError if a string other than "all" is submitted. The only valid
9 | string is "all"; otherwise a list of strings - excluding "all" - must be
10 | submitted. Warnings of this have been emitted for several years.
11 | * __init__(`search_head_only`) now defaults to False
12 | * `UrlParserCacheable` has been extended to accept a `urlparser` argument.
13 | This defaults to `urlparse` and expects the same signature.
14 | * __init__(`cached_urlparser`) has new deprecations to standardize the API (see the example at the end of this entry)
15 | submitting an Int to set max_items is deprecated; instead:
16 | cached_urlparser=True
17 | cached_urlparser_maxitems=int
18 | submitting 0 is deprecated; instead:
19 | cached_urlparser=False
20 | or
21 | cached_urlparser_maxitems=0
22 | cached_urlparser=False
23 | * __init__(`cached_urlparser_maxitems`) has been added
24 | * the next release is likely to be 1.0
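For example, a minimal sketch of the non-deprecated forms described above
(the maxitems value of 50 is illustrative):

    import metadata_parser

    # enable url-parse caching with an explicit size limit
    parser = metadata_parser.MetadataParser(
        cached_urlparser=True,
        cached_urlparser_maxitems=50,
    )

    # disable url-parse caching entirely
    parser = metadata_parser.MetadataParser(cached_urlparser=False)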
25 |
26 | 0.12.3
27 | * pin "BeautifulSoup4<4.15.0"
28 | * See `https://git.launchpad.net/beautifulsoup/tree/CHANGELOG`
29 | > 4.13.0 (20250202)
30 | > These things now give DeprecationWarnings when you try to use them,
31 | and are scheduled to be removed in Beautiful Soup 4.15.0.
32 | * fixes #47
33 |
34 | 0.12.2
35 | * Support Python 3.13 via `legacy-cgi` package.
36 | Thank you, https://github.com/Dryusdan.
37 | See:
38 | https://github.com/jvanasco/metadata_parser/pull/44
39 | https://github.com/jvanasco/metadata_parser/issues/43
40 | * updated pre-commit-config
41 |
42 | 0.12.1
43 | * typing
44 | * added `METADATA_PARSER_FUTURE` environment variable
45 | `export METADATA_PARSER_FUTURE=1` to enable
46 | * is_parsed_valid_url can accept a ParseResultBytes object now
47 |
48 | 0.12.0
49 | * drop python 2.7
50 | * initial typing support
51 |
52 | 0.11.0 | UNRELEASED
53 |
54 | * BREAKING CHANGES
55 | Due to the following breaking changes, the version was bumped to 0.11.0
56 | * `MetadataParser.fetch_url` now returns a third item.
57 |
58 | * COMPATIBLE CHANGES
59 | The following changes are backwards compatible to the 0.10.x releases
60 | * a test-suite for an application leveraging `metadata_parser` experienced
61 | some issues due to changes in the Responses package used to mock tests.
62 | to better facilitate that, the following change was made:
63 |
64 | MetadataParser now has 2 subclassable attributes for items that should
65 | or should not be parsed:
66 |
67 | + _content_types_parse = ("text/html",)
68 | + _content_types_noparse = ("application/json",)
69 |
70 | Previously, these values were hardcoded into the logic (see the sketch at the end of this entry).
71 | * some error log messages were reformatted for clarity
72 | * some error log messages were incorrectly reformatted by black
73 | * added logging for NotParseable situations involving redirects
74 | * added a `.response` attribute to NotParsable errors to help debug
75 | redirects
76 | * added a new ResponseHistory class to track redirects
77 | * it is computed and returned during `MetadataParser.fetch_url`
78 | * `MetadataParser.parse()` optionally accepts it, and will stash
79 | it into ParsedResult
80 | * `ParsedResult`
81 | * ResponseHistory is not stashed in the metadata stash, but in a new namespace
82 | * `.response_history` will either be `ResponseHistory` or None
83 | * improving docstrings
84 | * added `decode_html` helper
85 | * extended MetadataParser to allow registration of a default_encoder for results
86 | * style cleanup
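For example, a minimal sketch of subclassing the content-type attributes noted
above (the extra XHTML content type is illustrative, not a package default):

    import metadata_parser

    class LenientParser(metadata_parser.MetadataParser):
        # also parse XHTML responses, in addition to the default "text/html"
        _content_types_parse = ("text/html", "application/xhtml+xml")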
87 |
88 | 0.10.5
89 | packaging fixes
90 | migrated 'types.txt' out of distribution; it remains in github source
91 | updated some log lines with the url
92 | introduced some new log lines
93 | added `METADATA_PARSER__DISABLE_TLDEXTRACT` env
94 | merged, but reverted PR#34 which addresses Issue#32
95 |
96 |
97 | 0.10.4
98 | * black via pre-commit
99 | * upgraded black; 20.8b1
100 | * integrated with pre-commit
101 | * github actions and tox
102 | * several test files were not in git!
103 |
104 | 0.10.3
105 | updated docs on bad data
106 | black formatting
107 | added pyproject.toml
108 | moved BeautifulSoup generation into its own method, so anyone can subclass to customize
109 | :fixes: https://github.com/jvanasco/metadata_parser/issues/25
110 | some internal variable changes thanks to flake8
111 |
112 | 0.10.2
113 | added some docs on encoding
114 |
115 | 0.10.1
116 | clarifying some inline docs
117 | BREAKING CHANGE: `fetch_url` now returns a tuple of `(html, encoding)`
118 | now tracking in ParsedResult: encoding
119 | ParsedResult.metadata['_internal']['encoding'] = resp.encoding.lower() if resp.encoding else None
120 | `.parse` now accepts `html_encoding`
121 | refactored url fetching to use context managers
122 | refactored url fetching to only insert our hooks when needed
123 | adjusted test harness to close socket connections
124 |
125 | 0.10.0
126 | better Python3 support by using the six library
127 |
128 | 0.9.23
129 | added tests for url entities
130 | better grabbing of the charset
131 | better grabbing of some edge cases
132 |
133 | 0.9.22
134 | removed internal calls to the deprecated `get_metadata`, replacing them with `get_metadatas`.
135 | this will avoid emitting a deprecation warning, allowing users to migrate more easily
136 |
137 | 0.9.21
138 | * requests_toolbelt is now required
139 | ** this is to solve PR#16 / Issue#21
140 | ** the toolbelt and built-in versions of get_encodings_from_content required different workarounds
141 | * the output of urlparse is now cached onto the parser instance.
142 | ** perhaps this will be global cache in the future
143 | * MetadataParser now accepts `cached_urlparser`
144 | ** default: True
145 |    options: True: use an instance of UrlParserCacheable(maxitems=30)
146 |           : INT: use an instance of UrlParserCacheable(maxitems=cached_urlparser)
147 | : None/False/0 - use native urlparse
148 | : other truthy values - use as a custom urlparse
149 |
150 | * addressing issue #17 (https://github.com/jvanasco/metadata_parser/issues/17) where `get_link_` logic does not handle schemeless urls.
151 | ** `MetadataParser.get_metadata_link` will now try to upgrade schemeless links (e.g. urls that start with "//")
152 | ** `MetadataParser.get_metadata_link` will now check values against `FIELDS_REQUIRE_HTTPS` in certain situations to see if the value is valid for http
153 | ** `MetadataParser.schemeless_fields_upgradeable` is a tuple of the fields which can be upgraded. this defaults to a package definition, but can be changed on a per-parser basis.
154 | The defaults are:
155 | 'image',
156 | 'og:image', 'og:image:url', 'og:audio', 'og:video',
157 | 'og:image:secure_url', 'og:audio:secure_url', 'og:video:secure_url',
158 | ** `MetadataParser.schemeless_fields_disallow` is a tuple of the fields which can not be upgraded. this defaults to a package definition, but can be changed on a per-parser basis.
159 | The defaults are:
160 | 'canonical',
161 | 'og:url',
162 | ** `MetadataParser.get_url_scheme()` is a new method to expose the scheme of the active url
163 | ** `MetadataParser.upgrade_schemeless_url()` is a new method to upgrade schemeless links
164 | it accepts two arguments: url and field (optional); see the example at the end of this entry
165 | if present, the field is checked against the package tuple FIELDS_REQUIRE_HTTPS to see if the value is valid for http
166 | 'og:image:secure_url',
167 | 'og:audio:secure_url',
168 | 'og:video:secure_url',
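For example, a minimal sketch of the schemeless upgrade, mirroring the
"image-https-noscheme" case in tests/test_document_parsing.py (the document and
urls are illustrative):

    import metadata_parser

    html = '<html><head><meta property="og:image" content="//example.com/img.gif"/></head></html>'
    page = metadata_parser.MetadataParser(url="https://example.com", html=html)
    page.get_metadata_link("og:image")  # "https://example.com/img.gif"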
169 |
170 | 0.9.20
171 | * support for deprecated `twitter:label` and `twitter:data` metatags, which use "value" instead of "content".
172 | * new param to `__init__` and `parse`: `support_malformed` (default `None`).
173 | if true, will support malformed parsing (such as consulting "value" instead of "content").
174 | functionality extended from PR #13 (https://github.com/jvanasco/metadata_parser/pull/13) from https://github.com/amensouissi
175 |
176 | 0.9.19
177 | * addressing https://github.com/jvanasco/metadata_parser/issues/12
178 | on pages with duplicate metadata keys, additional elements were ignored
179 | when parsing the document; duplicate data was not kept.
180 | * `MetadataParser.get_metadata` will always return a single string (or None)
181 | * `MetadataParser.get_metadatas` has been introduced. this will always return an array. (example below)
182 | * the internal parsed_metadata store will now store data in a mix of arrays and strings, keeping it backwards compatible
183 | * This new version benches slightly slower because of the mixed format but preserves a smaller footprint.
184 | * the parsed result now contains a version record for tracking the format `_v`.
185 | * standardized single/double quoting
186 | * cleaned up some lines
187 | * the library will try to coerce strategy= arguments into the right format
188 | * when getting dublin core data, the result could be either a string or a dict. there's no good way to handle this.
189 | * added tests for encoders
190 | * greatly expanded tests
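For example, a minimal sketch of the two accessors against a document with two
author meta tags, as in tests/html_scaffolds/duplicates.html (the html variable
is assumed to hold such a document):

    import metadata_parser

    page = metadata_parser.MetadataParser(url=None, html=html)
    page.get_metadata("author", strategy=["meta"])   # "meta.author:1"
    page.get_metadatas("author", strategy=["meta"])  # ["meta.author:1", "meta.author:2"]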
191 |
192 | 0.9.18
193 | * removed a stray debug line
194 |
195 | 0.9.17
196 | * added `retry_dropped_without_headers` option
197 |
198 | 0.9.16
199 | * added `fix_unicode_url()`
200 | * Added `allow_unicode_url` (default True) to the following calls:
201 | `MetadataParser.get_url_canonical`
202 | `MetadataParser.get_url_opengraph`
203 | `MetadataParser.get_discrete_url`
204 | This functionality will try to recode canonical urls with unicode data into percent-encoded streams
205 |
206 | 0.9.15
207 | * Python3 support returned
208 |
209 | 0.9.14
210 | * added some more tests to ensure encoding detected correctly
211 | * stash the soup sooner when parsing, to aid in debugging
212 |
213 | 0.9.13
214 | * doing some work to guess encoding...
215 | * internal: now using `resp` instead of `r`, it is easier for pdb debugging
216 | * the peername check was changed to be a hook, so it can be processed more immediately
217 | * the custom session redirect test was altered
218 | * changed the DummyResponse encoding fallback to `ENCODING_FALLBACK` which is latin-1 (not utf-8)
219 | this is somewhat backwards incompatible with this library, but maintains compatibility with the underlying `requests` library
220 |
221 | 0.9.12
222 | * added more attributes to DummyResponse:
223 | ** `content`
224 | ** `headers`
225 |
226 | 0.9.11
227 | * some changes to how we handle upgrading bad canonicals
228 | upgrades will no longer happen IF they specify a bad domain.
229 | upgrades from localhost will still transfer over
230 |
231 | 0.9.10
232 | * slight reorder internally of TLD extract support
233 |
234 | 0.9.9
235 | * inspecting `requests` errors for a response and using it if possible
236 | * this will now try to validate urls if the `tldextract` library is present.
237 | this feature can be disabled with a global toggle
238 |
239 | import metadata_parser
240 | metadata_parser.USE_TLDEXTRACT = False
241 |
242 | 0.9.7
243 | * changed some internal variable names to better clarify the difference between a hostname and a netloc
244 |
245 | 0.9.7
246 | updated the following functions to test for RFC valid characters in the url string
247 | some websites, even BIG PROFESSIONAL ONES, will put html in here.
248 | idiots? amateurs? lazy? doesn't matter, they're now our problem. well, not anymore.
249 | * get_url_canonical
250 | * get_url_opengraph
251 | * get_metadata_link
252 |
253 | 0.9.6
254 | this is being held for an update to the `requests` library
255 | * made the following arguments to `MetadataParser.fetch_url()` default to None - which will then default to the class setting. they are all passed-through to `requests.get`
256 | ** `ssl_verify`
257 | ** `allow_redirects`
258 | ** `requests_timeout`
259 | * removed `force_parse` kwarg from `MetadataParser.parser`
260 | * added 'metadata_parser.RedirectDetected' class. if allow_redirects is False, a detected redirect will raise this.
261 | * added 'metadata_parser.NotParsableRedirect' class. if allow_redirects is False, a detected redirect will raise this if missing a Location.
262 | * added `requests_session` argument to `MetadataParser`
263 | * starting to use httpbin for some tests
264 | * detecting JSON documents
265 | * extended NotParseable exceptions with the MetadataParser instance as `metadataParser`
266 | * added `only_parse_http_ok` which defaults to True (legacy). submitting False will allow non-http200 responses to be parsed.
267 | * shuffled `fetch_url` logic around. it will now process more data before a potential error.
268 | * working on support for custom request sessions that can better handle redirects (requires patch or future version of requests)
269 | * caching the peername onto the response object as `_mp_peername` [ _m(etadata)p(arser)_peername ]. this will allow it to be calculated in a redirect session hook. (see tests/sessions.py)
270 | * added `defer_fetch` argument to `MetadataParser.__init__`, default ``False``. If ``True``, this will overwrite the instance's `deferred_fetch` method to actually fetch the url. this strategy allows for the `page` to be defined and response history caught. Under this situation, a 301 redirecting to a 500 can be observed; in the previous versions only the 500 would be caught.
271 | * starting to encapsulate everything into a "parsed result" class
272 | * fixed opengraph minimum check
273 | * added `MetadataParser.is_redirect_unique`
274 | * added `DummyResponse.history`
275 |
276 | 0.9.5
277 | * failing to load a document into BeautifulSoup will now catch the BS error and raise NotParsable
278 |
279 | 0.9.4
280 | * created `MetadataParser.get_url_canonical`
281 | * created `MetadataParser.get_url_opengraph`
282 | * `MetadataParser.get_discrete_url` now calls `get_url_canonical` and `get_url_opengraph`
283 |
284 | 0.9.3
285 | * fixed packaging error. removed debug "print" statements
286 |
287 | 0.9.2
288 | * upgrade nested local canonical rels correctly
289 |
290 | 0.9.1
291 | * added a new `_internal` storage namespace to the `MetadataParser.metadata` payload.
292 | this simply stashes the `MetadataParser.url` and `MetadataParser.url_actual` attributes to make objects easier to encode for debugging
293 | * the twitter parsing was incorrectly looking for 'value' not 'content' as in the current spec
294 | * tracking the shortlink on a page
295 |
296 | 0.9.0
297 | - This has a default behavior change regarding `get_discrete_url()` .
298 | - `is_parsed_valid_url()` did not correctly handle `require_public_netloc=True`, and would allow for `localhost` values to pass
299 | - new kwarg `allow_localhosts` added to
300 | * is_parsed_valid_url
301 | * is_url_valid
302 | * url_to_absolute_url
303 | * MetadataParser.__init__
304 | * MetadataParser.absolute_url
305 | * MetadataParser.get_discrete_url
306 | * MetadataParser.get_metadata_link
307 | - new method `get_fallback_url`
308 | - `url_to_absolute_url` will return `None` if not supplied with a fallback and test url. Previously an error in parsing would occur
309 | - `url_to_absolute_url` tries to do a better job at determining the intended url when given a malformed url.
310 |
311 | 0.8.3
312 | - packaging fixes
313 |
314 | 0.8.2
315 | - incorporated fix in https://github.com/jvanasco/metadata_parser/pull/10 to handle windows support of socket objects
316 | - cleaned up some tests
317 | - added `encode_ascii` helper
318 | - added git-ignored `tests/private` directory for non-public tests
319 | - added an `encoder` argument to `get_metadata` for encoding values
320 |
321 | 0.8.1
322 | added 2 new properties to a computed MetadataParser object:
323 | is_redirect = None
324 | is_redirect_same_host = None
325 | in the case of redirects, we only have the peername available for the final URL (not the source)
326 | if a response is a redirect, it may not be for the same host -- and the peername would correspond to the destination URL -- not the origin
327 |
328 | 0.8.0
329 | this bump introduces 2 new arguments and some changed behavior:
330 |
331 | - `search_head_only=None`. previously the meta/og/etc data was only searched in the document head (where expected as per HTML specs).
332 | after indexing millions of pages, many appeared to implement this incorrectly or have html that is so off specification that
333 | parsing libraries can't correctly read it (for example, Twitter.com).
334 | This is currently implemented to default from None to True, but future versions will default to `False`.
335 | This is marked for a future default of `search_head_only=False`
336 |
337 | - `raise_on_invalid`. default False. If True, this will raise a new exception: InvalidDocument if the response
338 | does not look like a proper html document
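For example, a minimal sketch of the two new arguments (the fetch target is
illustrative):

    import metadata_parser

    page = metadata_parser.MetadataParser(
        url="https://example.com",
        search_head_only=False,  # search the full document, not just the head
        raise_on_invalid=True,   # raise InvalidDocument if the response does not look like html
    )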
339 |
340 |
341 |
342 | 0.7.4
343 | - more aggressive attempts to get the peername.
344 |
345 | 0.7.3
346 | - this will now try to cache the `peername` of the request (ie, the remote server) onto the peername attribute
347 |
348 | 0.7.2
349 | - applying a `strip()` to the "title". bad authors/cms often have whitespace.
350 |
351 | 0.7.1
352 | - added kwargs to docstrings
353 | - `get_metadata_link` behavior has been changed as follows:
354 | * if an encoded uri is present (starts with `data:image/`)
355 | ** this will return None by default
356 | ** if a kwarg of `allow_encoded_uri=True` is submitted, will return the encoded url (without a url prefix)
357 |
358 | 0.7.0
359 | - merged https://github.com/jvanasco/metadata_parser/pull/9 from xethorn
360 | - nested all commands to `log` under `__debug__` to avoid calls on production when PYTHONOPTIMIZE is set
361 |
362 | 0.6.18
363 | - migrated version string into __init__.py
364 |
365 | 0.6.17
366 | - added a new `DummyResponse` class to mimic popular attributes of a `requests.response` object when parsing from HTML files
367 |
368 | 0.6.16
369 | - incorporated pull8 (https://github.com/jvanasco/metadata_parser/pull/8) which fixes issue5 (https://github.com/jvanasco/metadata_parser/issues/5) with comments
370 |
371 | 0.6.15
372 | - fixed README which used old api in the example
373 |
374 | 0.6.14
375 | - there was a typo and another bug in BeautifulSoup parsing that still passed some tests; they have been fixed. todo- migrate tests to public repo
376 |
377 | 0.6.13
378 | - trying to integrate a "safe read"
379 |
380 | 0.6.12
381 | - now passing "stream=True" to requests.get. this will fetch the headers first, before looping through the response. we can avoid many issues with this approach
382 |
383 | 0.6.11
384 | - now correctly validating urls with ports. had to restructure a lot of the url validation
385 |
386 | 0.6.10
387 | - changed how some nodes are inspected. this should lead to fewer errors
388 |
389 | 0.6.9
390 | - added a new method `get_metadata_link()`, which applies link transformations to a metadata in an attempt to ensure a valid link
391 |
392 | 0.6.8
393 | - added a kwarg `requests_timeout` to proxy a timeout value to `requests.get()`
394 |
395 | 0.6.7
396 | - added a lockdown to `is_parsed_valid_url` titled `http_only` -- requires http/https for the scheme
397 |
398 | 0.6.6
399 | - protecting against bad doctypes, like nasa.gov
400 | -- added `force_doctype` to __init__. defaults to False. this will change the doctype to get around bs4/lxml issues
401 | -- this is defaulted to False.
402 |
403 | 0.6.5
404 | - keeping the parsed BS4 document; a user may wish to perform further operations on it.
405 | -- `MetadataParser.soup` attribute holds BS4 document
406 |
407 | 0.6.4
408 | - flake8 fixes. purely cosmetic.
409 |
410 | 0.6.3
411 | - no changes. `sdist upload` was picking up a reference file that wasn't in github; that file killed the distribution install
412 |
413 | 0.6.2
414 | - formatting fixes via flake8
415 |
416 | 0.6.1
417 | - Lightweight, but functional, url validation
418 | -- new 'init' argument (defaults to True) : `require_public_netloc`
419 | -- this will ensure a url's hostname/netloc is either an IPV4 or "public DNS" name
420 | -- if the url is entirely numeric, requires it to be IPV4
421 | -- if the url is alphanumeric, requires a TLD + Domain ( exception is "localhost" )
422 | -- this is NOT RFC compliant, but designed for "Real Life" use cases.
423 |
424 | 0.6.0
425 | - Several fixes to improve support of canonical and absolute urls
426 | -- replaced REGEX parsing of urls with `urlparse` parsing and inspection; too many edge cases got in
427 | -- refactored `MetadataParser.absolute_url`, now proxies a call to new function `url_to_absolute_url`
428 | -- refactored `MetadataParser.get_discrete_url`, now cleaner and leaner.
429 | -- refactored how some tests run, so there is cleaner output
430 |
431 |
432 | 0.5.8
433 | - trying to fix some issues with distribution
434 |
435 | 0.5.7
436 | - trying to parse unparsable pages was creating an error
437 | -- `MetadataParser.init` now accepts `only_parse_file_extensions` -- list of the only file extensions to parse
438 | -- `MetadataParser.init` now accepts `force_parse_invalid_content_type` -- forces to parse invalid content
439 | -- `MetadataParser.fetch_url` will only parse "text/html" content by default
440 |
441 | 0.5.6
442 | - trying to ensure we return a valid url in get_discrete_url()
443 | - adding in some proper unit tests; migrating from the private demos slowly ( the private demos hit a lot of internal files and public urls ; wouldn't be proper to make these public )
444 | - setting `self.url_actual = url` on __init__. this will get overridden on a `fetch`, but allows for a fallback on html docs passed through
445 |
446 |
447 | 0.5.5
448 | - Dropped BS3 support
449 | - test Python3 support ( support added by Paul Bonser [ https://github.com/pib ] )
450 |
451 |
452 | 0.5.4
453 | - Pull Request - https://github.com/jvanasco/metadata_parser/pull/1
454 | Credit to Paul Bonser [ https://github.com/pib ]
455 |
456 | 0.5.3
457 | - added a few `.strip()` calls to clean up metadata values
458 |
459 | 0.5.2
460 | - fixed an issue on html title parsing. the old method incorrectly regexed on a BS4 tag, not tag contents, creating character encoding issues.
461 |
462 | 0.5.1
463 | - missed the ssl_verify command
464 |
465 | 0.5.0
466 | - migrated to the requests library
467 |
468 | 0.4.13
469 | - trapping all errors in httplib and urllib2 ; raising as a NotParsable and sticking the original error into the `raised` attribute.
470 | this will allow for cleaner error handling
471 | - we *really* need to move to requests.py
472 |
473 | 0.4.12
474 | - created a workaround for sharethis hashbang urls, which urllib2 doesn't like
475 | - we need to move to requests.py
476 |
477 | 0.4.11
478 | - added more relaxed controls for parsing safe files
479 |
480 | 0.4.10
481 | - fixed force_parse arg on init
482 | - added support for more filetypes
483 |
484 | 0.4.9
485 | - support for gzip documents that pad with extra data ( spec allows, python module doesn't )
486 | - ensure proper document format
487 |
488 | 0.4.8
489 | - added support for twitter's own og style markup
490 | - cleaned up the beautifulsoup finds for og data
491 | - moved 'try' from encapsulating 'for' blocks to encapsulating the inner loop. this will pull more data out if an error occurs.
492 |
493 | 0.4.7
494 | - cleaned up some code
495 |
496 | 0.4.6
497 | - realized that some servers return gzip content, despite this client not advertising that it accepts that content ; fixed by using some ideas from mark pilgrim's feedparser. metadata_parser now advertises gzip and zlib, and processes it as needed
498 |
499 | 0.4.5
500 | - fixed a bug that prevented toplevel directories from being parsed
501 |
502 | 0.4.4
503 | - made redirect/masked/shortened links have better dereferenced url support
504 |
505 | 0.4.2
506 | - Wrapped title tag traversal with an AttributeException try block
507 | - Wrapped canonical tag lookup with a KeyError try block, defaulting to 'href' then 'content'
508 | - Added support for `url_actual` and `url_info` , which persist the data from the urllib2.urlopen object's `geturl()` and `info()`
509 | - `get_discrete_url` and `absolute_url` use the underlying url_actual data
510 | - added support for passing data and headers into urllib2 requests
511 |
512 | 0.4.1
513 | Initial Release
514 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | Copyright (c) 2012-2018, Jonathan Vanasco
138 |
139 |
--------------------------------------------------------------------------------
/tests/html_scaffolds/simple.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
37 |
38 |
--------------------------------------------------------------------------------
/tests/test_document_parsing.py:
--------------------------------------------------------------------------------
1 | # stdlib
2 | import os
3 | from typing import Dict
4 | import unittest
5 | import warnings
6 |
7 | # local
8 | import metadata_parser
9 |
10 | # ==============================================================================
11 |
12 |
13 | # this bit lets us run the tests directly during development
14 | _tests_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
15 | if _tests_dir.endswith("metadata_parser"):
16 | _tests_dir = os.path.join(_tests_dir, "tests")
17 | _examples_dir = os.path.join(_tests_dir, "html_scaffolds")
18 |
19 | # cache these lazily
20 | CACHED_FILESYSTEM_DOCUMENTS = {}
21 |
22 |
23 | doc_base = """<html><head>%(head)s</head><body></body></html>"""
24 |
25 | docs: Dict = {
26 | "good-canonical-absolute": {
27 | "url-real": """http://example.com""",
28 | "head": {
29 | "url-canonical": """http://example.com/canonical.html""",
30 | "url-og": None,
31 | },
32 | "expected": {"get_discrete_url()": "http://example.com/canonical.html"},
33 | },
34 | "good-og-absolute": {
35 | "url-real": """http://example.com""",
36 | "head": {"url-canonical": None, "url-og": """http://example.com/og.html"""},
37 | "expected": {"get_discrete_url()": "http://example.com/og.html"},
38 | },
39 | "good-canonical-noscheme-http": {
40 | "url-real": """http://example.com""",
41 | "head": {"url-canonical": """//example.com/canonical.html""", "url-og": None},
42 | "expected": {"get_discrete_url()": "http://example.com/canonical.html"},
43 | },
44 | "good-og-noscheme-http": {
45 | "url-real": """http://example.com""",
46 | "head": {"url-canonical": None, "url-og": """//example.com/og.html"""},
47 | "expected": {"get_discrete_url()": "http://example.com/og.html"},
48 | },
49 | "good-canonical-noscheme-https": {
50 | "url-real": """https://example.com""",
51 | "head": {"url-canonical": """//example.com/canonical.html""", "url-og": None},
52 | "expected": {"get_discrete_url()": "https://example.com/canonical.html"},
53 | },
54 | "good-og-noscheme-https": {
55 | "url-real": """https://example.com""",
56 | "head": {"url-canonical": None, "url-og": """//example.com/og.html"""},
57 | "expected": {"get_discrete_url()": "https://example.com/og.html"},
58 | },
59 | "good-canonical-relative": {
60 | "url-real": """http://example.com""",
61 | "head": {"url-canonical": """canonical.html""", "url-og": None},
62 | "expected": {"get_discrete_url()": "http://example.com/canonical.html"},
63 | },
64 | "good-canonical-relative_alt": {
65 | "url-real": """http://example.com""",
66 | "head": {"url-canonical": """/canonical.html""", "url-og": None},
67 | "expected": {"get_discrete_url()": "http://example.com/canonical.html"},
68 | },
69 | "good-og-relative_alt": {
70 | "url-real": """http://example.com""",
71 | "head": {"url-canonical": None, "url-og": """/og.html"""},
72 | "expected": {"get_discrete_url()": "http://example.com/og.html"},
73 | },
74 | "bad-canonical": {
75 | "url-real": """http://example.com/one-two-three.html""",
76 | "head": {"url-canonical": """...""", "url-og": None},
77 | "expected": {"get_discrete_url()": "http://example.com/one-two-three.html"},
78 | },
79 | "bad-canonical2": {
80 | "url-real": """http://example.com/one-two-three.html""",
81 | "head": {"url-canonical": """http://""", "url-og": None},
82 | "expected": {"get_discrete_url()": "http://example.com/one-two-three.html"},
83 | },
84 | "bad-canonical3": {
85 | "url-real": """http://example.com/one-two-three.html""",
86 | "head": {"url-canonical": """http://contentcreation""", "url-og": None},
87 | "expected": {"get_discrete_url()": "http://example.com/one-two-three.html"},
88 | },
89 | "bad-og": {
90 | "url-real": """http://example.com/one-two-three.html""",
91 | "head": {"url-canonical": None, "url-og": """..."""},
92 | "expected": {"get_discrete_url()": "http://example.com/one-two-three.html"},
93 | },
94 | "image-https": {
95 | "url-real": """https://example.com""",
96 | "head": {
97 | "url-canonical": """https://example.com/canonical.html""",
98 | "url-og": None,
99 | "url-og:image": """https://example.com/img.gif""",
100 | },
101 | "expected": {"og:image": """https://example.com/img.gif"""},
102 | },
103 | "image-https-noscheme": {
104 | "url-real": """https://example.com""",
105 | "head": {
106 | "url-canonical": """https://example.com/canonical.html""",
107 | "url-og": None,
108 | "url-og:image": """//example.com/img.gif""",
109 | },
110 | "expected": {"og:image": """https://example.com/img.gif"""},
111 | },
112 | "image-https-noscheme-secure": {
113 | "url-real": """https://example.com""",
114 | "head": {
115 | "url-canonical": """https://example.com/canonical.html""",
116 | "url-og": None,
117 | "url-og:image:secure_url": """//example.com/img.gif""",
118 | },
119 | "expected": {"og:image:secure_url": """https://example.com/img.gif"""},
120 | },
121 | "image-http": {
122 | "url-real": """http://example.com""",
123 | "head": {
124 | "url-canonical": """http://example.com/canonical.html""",
125 | "url-og": None,
126 | "url-og:image": """http://example.com/img.gif""",
127 | },
128 | "expected": {"og:image": """http://example.com/img.gif"""},
129 | },
130 | "image-http-noscheme": {
131 | "url-real": """http://example.com""",
132 | "head": {
133 | "url-canonical": """http://example.com/canonical.html""",
134 | "url-og": None,
135 | "url-og:image": """//example.com/img.gif""",
136 | },
137 | "expected": {"og:image": """http://example.com/img.gif"""},
138 | },
139 | "image-http-noscheme-secure": {
140 | "url-real": """http://example.com""",
141 | "head": {
142 | "url-canonical": """//example.com/canonical.html""",
143 | "url-og": None,
144 | "url-og:image:secure_url": """//example.com/img.gif""",
145 | },
146 | "expected": {"og:image:secure_url": None},
147 | },
148 | }
149 |
150 |
151 | def encoder_capitalizer(decoded):
152 | if type(decoded) is dict:
153 | return {k.upper(): v.upper() for k, v in decoded.items()}
154 | return decoded.upper()
155 |
156 |
157 | # setup the test_docs with html bodies
158 | for test in list(docs.keys()):
159 | head = ""
160 | if "url-og" in docs[test]["head"]:
161 | if docs[test]["head"]["url-og"] is not None:
162 | head += (
163 |                 """<meta property="og:url" content="%s"/>"""
164 | % docs[test]["head"]["url-og"]
165 | )
166 | if "url-canonical" in docs[test]["head"]:
167 | if docs[test]["head"]["url-canonical"] is not None:
168 | head += (
169 |                 """<link rel="canonical" href="%s"/>"""
170 | % docs[test]["head"]["url-canonical"]
171 | )
172 | if "url-og:image" in docs[test]["head"]:
173 | if docs[test]["head"]["url-og:image"] is not None:
174 | head += (
175 |                 """<meta property="og:image" content="%s"/>"""
176 | % docs[test]["head"]["url-og:image"]
177 | )
178 | if "url-og:image:secure_url" in docs[test]["head"]:
179 | if docs[test]["head"]["url-og:image:secure_url"] is not None:
180 | head += (
181 |                 """<meta property="og:image:secure_url" content="%s"/>"""
182 | % docs[test]["head"]["url-og:image:secure_url"]
183 | )
184 | custom_vars = {"head": head}
185 | docs[test]["doc"] = doc_base % custom_vars
186 |
187 |
188 | def _docs_test(test_names):
189 | errors = []
190 | for test in test_names:
191 | tests = []
192 | url = docs[test]["url-real"]
193 | parsed = metadata_parser.MetadataParser(url=url, html=docs[test]["doc"])
194 | if "get_discrete_url()" in docs[test]["expected"]:
195 | tests.append("get_discrete_url()")
196 | url_expected = docs[test]["expected"]["get_discrete_url()"]
197 | url_retrieved = parsed.get_discrete_url()
198 | if url_retrieved != url_expected:
199 | errors.append([test, "get_discrete_url()", url_expected, url_retrieved])
200 | if "og:image" in docs[test]["expected"]:
201 | tests.append("og:image")
202 | url_expected = docs[test]["expected"]["og:image"]
203 | url_retrieved = parsed.get_metadata_link("og:image")
204 | if url_retrieved != url_expected:
205 | errors.append([test, "og:image", url_expected, url_retrieved])
206 | if "og:image:secure_url" in docs[test]["expected"]:
207 | tests.append("og:image:secure_url")
208 | url_expected = docs[test]["expected"]["og:image:secure_url"]
209 | url_retrieved = parsed.get_metadata_link("og:image:secure_url")
210 | if url_retrieved != url_expected:
211 | errors.append(
212 | [test, "og:image:secure_url", url_expected, url_retrieved]
213 | )
214 | if not tests:
215 | raise ValueError("No tests!")
216 | return errors
217 |
218 |
219 | def _docs_test_parser(test_names, cached_urlparser, cached_urlparser_maxitems=None):
220 | errors = []
221 | for test in test_names:
222 | tests = []
223 | url = docs[test]["url-real"]
224 | kwargs = {}
225 | if cached_urlparser != "*no-kwarg":
226 | kwargs["cached_urlparser"] = cached_urlparser
227 | if cached_urlparser_maxitems is not None:
228 | kwargs["cached_urlparser_maxitems"] = cached_urlparser_maxitems
229 | parsed = metadata_parser.MetadataParser(
230 | url=url, html=docs[test]["doc"], **kwargs
231 | )
232 | if "get_discrete_url()" in docs[test]["expected"]:
233 | tests.append("get_discrete_url()")
234 | url_expected = docs[test]["expected"]["get_discrete_url()"]
235 | url_retrieved = parsed.get_discrete_url()
236 | if url_retrieved != url_expected:
237 | errors.append([test, "get_discrete_url()", url_expected, url_retrieved])
238 | if not tests:
239 | raise ValueError("No tests!")
240 | return errors
241 |
242 |
243 | class TestHtmlDocument(unittest.TestCase):
244 | """
245 |     python -m unittest tests.test_document_parsing.TestHtmlDocument.test_get_discrete_url__good_relative
246 |     python -m unittest tests.test_document_parsing.TestHtmlDocument.test_get_discrete_url__good_absolute
247 |     python -m unittest tests.test_document_parsing.TestHtmlDocument.test_get_discrete_url__bad
248 | """
249 |
250 | def test_get_discrete_url__good_relative(self):
251 | errors = _docs_test(
252 | [
253 | "good-canonical-relative",
254 | "good-canonical-relative_alt",
255 | "good-og-relative_alt",
256 | ]
257 | )
258 | if errors:
259 | raise ValueError(errors)
260 |
261 | def test_get_discrete_url__good_absolute(self):
262 | errors = _docs_test(["good-canonical-absolute", "good-og-absolute"])
263 | if errors:
264 | raise ValueError(errors)
265 |
266 | def test_get_discrete_url__good_noscheme(self):
267 | errors = _docs_test(
268 | [
269 | "good-canonical-noscheme-http",
270 | "good-og-noscheme-http",
271 | "good-canonical-noscheme-https",
272 | "good-og-noscheme-https",
273 | ]
274 | )
275 | if errors:
276 | raise ValueError(errors)
277 |
278 | def test_get_discrete_url__bad(self):
279 | errors = _docs_test(
280 | ["bad-canonical", "bad-canonical2", "bad-canonical3", "bad-og"]
281 | )
282 | if errors:
283 | raise ValueError(errors)
284 |
285 | def test_get_image(self):
286 | errors = _docs_test(
287 | [
288 | "image-http-noscheme-secure",
289 | "image-https-noscheme-secure",
290 | "image-http",
291 | "image-https",
292 | "image-http-noscheme",
293 | "image-https-noscheme",
294 | ]
295 | )
296 | if errors:
297 | raise ValueError(errors)
298 |
299 |
300 | class TestEncoders(unittest.TestCase):
301 | """
302 |     python -m unittest tests.test_document_parsing.TestEncoders
303 | """
304 |
305 | _data = {
306 | "unicode_whitespace": {
307 | "raw": """Example line with\xa0unicode whitespace.""",
308 | "ascii": """Example line with unicode whitespace.""",
309 | },
310 | "unicode_chars": {
311 | "raw": """Example line with\xc2\xa0unicode chars.""",
312 | "ascii": """Example line withA unicode chars.""",
313 | },
314 | "decode_html_encoder": {
315 |             "html": """<html><head><meta name="description" content="Foo Bar, &quot;Biz Bang Bash.&quot;"/></head></html>""",
316 |             "parsed": "Foo Bar, &quot;Biz Bang Bash.&quot;",
317 | "decoded": 'Foo Bar, "Biz Bang Bash."',
318 | },
319 | }
320 |
321 | def _make_raw(self, data_option):
322 | # create a parsed result, and inject raw data.
323 | # data coming through beautifulsoup will be parsed differently
324 | parsed = metadata_parser.MetadataParser()
325 | parsed.metadata["meta"]["title"] = self._data[data_option]["raw"]
326 | return parsed
327 |
328 | def _make_html(self, data_option, default_encoder=None):
329 | # data coming through beautifulsoup is parsed by that library
330 | parsed = metadata_parser.MetadataParser(
331 | html=self._data[data_option]["html"],
332 | force_doctype=True,
333 | default_encoder=default_encoder,
334 | )
335 | return parsed
336 |
337 | def test_unicode_whitespace(self):
338 | parsed = self._make_raw("unicode_whitespace")
339 | # title_raw = parsed.get_metadata('title')
340 | title_ascii = parsed.get_metadata("title", encoder=metadata_parser.encode_ascii)
341 | self.assertEqual(title_ascii, self._data["unicode_whitespace"]["ascii"])
342 |
343 | def test_unicode_chars(self):
344 | parsed = self._make_raw("unicode_chars")
345 | # title_raw = parsed.get_metadata('title')
346 | title_ascii = parsed.get_metadata("title", encoder=metadata_parser.encode_ascii)
347 | self.assertEqual(title_ascii, self._data["unicode_chars"]["ascii"])
348 |
349 | def test_decode_html_encoder(self):
350 | parsed = self._make_html("decode_html_encoder")
351 | parsed_description = parsed.get_metadata("description")
352 |
353 | decoded_direct = metadata_parser.decode_html(parsed_description)
354 | self.assertEqual(decoded_direct, self._data["decode_html_encoder"]["decoded"])
355 |
356 | decoded_decoder = parsed.get_metadata(
357 | "description", encoder=metadata_parser.decode_html
358 | )
359 | self.assertEqual(decoded_decoder, self._data["decode_html_encoder"]["decoded"])
360 |
361 | def test_default_encoder(self):
362 | """
363 | ensure the default decoder is invoked
364 | """
365 | parsed_with_default = self._make_html(
366 | "decode_html_encoder", default_encoder=metadata_parser.decode_html
367 | )
368 | parsed_no_default = self._make_html("decode_html_encoder")
369 |
370 | # does the default_decoder work?
371 | decoded_default = parsed_with_default.get_metadata("description")
372 | self.assertEqual(decoded_default, self._data["decode_html_encoder"]["decoded"])
373 |
374 | # does the no decoder work as expected?
375 | not_decoded = parsed_no_default.get_metadata("description")
376 | self.assertEqual(not_decoded, self._data["decode_html_encoder"]["parsed"])
377 |
378 | # can we override the default_decoder to get RAW?
379 | decoded_override = parsed_with_default.get_metadata(
380 | "description", encoder="raw"
381 | )
382 | self.assertEqual(decoded_override, self._data["decode_html_encoder"]["parsed"])
383 |
384 | # can we override the default_decoder to get something else?
385 | # ensure these 2 aren't equal, otherwise the next bit doesn't really test!
386 | self.assertNotEqual(
387 | self._data["decode_html_encoder"]["parsed"],
388 | self._data["decode_html_encoder"]["parsed"].upper(),
389 | )
390 | decoded_override = parsed_with_default.get_metadata(
391 | "description", encoder=lambda x: x.upper()
392 | )
393 | self.assertEqual(
394 | decoded_override, self._data["decode_html_encoder"]["parsed"].upper()
395 | )
396 |
397 |
398 | class TestDocumentParsing(unittest.TestCase):
399 | """
400 |     python -m unittest tests.test_document_parsing.TestDocumentParsing
401 |     python -m unittest tests.test_document_parsing.TestDocumentParsing.test_simple_html
402 |     python -m unittest tests.test_document_parsing.TestDocumentParsing.test_html_urls
403 |     python -m unittest tests.test_document_parsing.TestDocumentParsing.test_complex_html
404 |     python -m unittest tests.test_document_parsing.TestDocumentParsing.test_charsets
405 | """
406 |
407 | def _MakeOne(self, filename):
408 | """lazy cache of files as needed"""
409 | global CACHED_FILESYSTEM_DOCUMENTS
410 | if filename not in CACHED_FILESYSTEM_DOCUMENTS:
411 | CACHED_FILESYSTEM_DOCUMENTS[filename] = open(
412 | os.path.join(_examples_dir, filename)
413 | ).read()
414 | return CACHED_FILESYSTEM_DOCUMENTS[filename]
415 |
416 | def test_simple_html(self):
417 | """this tests simple.html to have certain fields"""
418 | html = self._MakeOne("simple.html")
419 | parsed = metadata_parser.MetadataParser(url=None, html=html)
420 | self.assertEqual(
421 | parsed.metadata["meta"]["article:publisher"],
422 | "https://www.example.com/meta/property=article:publisher",
423 | )
424 | self.assertEqual(parsed.metadata["meta"]["author"], "meta.author")
425 | self.assertEqual(parsed.metadata["meta"]["description"], "meta.description")
426 | self.assertEqual(parsed.metadata["meta"]["keywords"], "meta.keywords")
427 | self.assertEqual(
428 | parsed.metadata["meta"]["og:description"], "meta.property=og:description"
429 | )
430 | self.assertEqual(
431 | parsed.metadata["meta"]["og:image"],
432 | "https://www.example.com/meta/property=og:image",
433 | )
434 | self.assertEqual(
435 | parsed.metadata["meta"]["og:site_name"], "meta.property=og:site_name"
436 | )
437 | self.assertEqual(parsed.metadata["meta"]["og:title"], "meta.property=og:title")
438 | self.assertEqual(parsed.metadata["meta"]["og:type"], "meta.property=og:type")
439 | self.assertEqual(
440 | parsed.metadata["meta"]["og:url"],
441 | "https://www.example.com/meta/property=og:url",
442 | )
443 | self.assertEqual(
444 | parsed.metadata["meta"]["twitter:card"], "meta.name=twitter:card"
445 | )
446 | self.assertEqual(
447 | parsed.metadata["meta"]["twitter:description"],
448 | "meta.name=twitter:description",
449 | )
450 | self.assertEqual(
451 | parsed.metadata["meta"]["twitter:image:src"],
452 | "https://example.com/meta/name=twitter:image:src",
453 | )
454 | self.assertEqual(
455 | parsed.metadata["meta"]["twitter:site"], "meta.name=twitter:site"
456 | )
457 | self.assertEqual(
458 | parsed.metadata["meta"]["twitter:title"], "meta.name=twitter:title"
459 | )
460 | self.assertEqual(
461 | parsed.metadata["meta"]["twitter:url"],
462 | "https://example.com/meta/name=twitter:url",
463 | )
464 | self.assertEqual(
465 | parsed.metadata["og"]["description"], "meta.property=og:description"
466 | )
467 | self.assertEqual(
468 | parsed.metadata["og"]["image"],
469 | "https://www.example.com/meta/property=og:image",
470 | )
471 | self.assertEqual(
472 | parsed.metadata["og"]["site_name"], "meta.property=og:site_name"
473 | )
474 | self.assertEqual(parsed.metadata["og"]["title"], "meta.property=og:title")
475 | self.assertEqual(parsed.metadata["og"]["type"], "meta.property=og:type")
476 | self.assertEqual(
477 | parsed.metadata["og"]["url"], "https://www.example.com/meta/property=og:url"
478 | )
479 | self.assertEqual(
480 | parsed.metadata["page"]["canonical"],
481 | "http://example.com/meta/rel=canonical",
482 | )
483 | self.assertEqual(
484 | parsed.metadata["page"]["shortlink"],
485 | "http://example.com/meta/rel=shortlink",
486 | )
487 | self.assertEqual(parsed.metadata["page"]["title"], "title")
488 | self.assertEqual(parsed.metadata["twitter"]["card"], "meta.name=twitter:card")
489 | self.assertEqual(
490 | parsed.metadata["twitter"]["description"], "meta.name=twitter:description"
491 | )
492 | self.assertEqual(
493 | parsed.metadata["twitter"]["image:src"],
494 | "https://example.com/meta/name=twitter:image:src",
495 | )
496 | self.assertEqual(parsed.metadata["twitter"]["site"], "meta.name=twitter:site")
497 | self.assertEqual(parsed.metadata["twitter"]["title"], "meta.name=twitter:title")
498 | self.assertEqual(
499 | parsed.metadata["twitter"]["url"],
500 | "https://example.com/meta/name=twitter:url",
501 | )
502 | self.assertEqual(
503 | parsed.metadata["twitter"]["data"], "meta.name=twitter:data||value"
504 | )
505 | self.assertNotIn("label", parsed.metadata["twitter"])
506 | self.assertEqual(parsed.is_opengraph_minimum(), True)
507 |
508 | def test_html_urls(self):
509 | """this tests simple.html to have certain fields"""
510 | html = self._MakeOne("simple.html")
511 | parsed = metadata_parser.MetadataParser(url=None, html=html)
512 | # by default we do og_first
513 | self.assertEqual(
514 | parsed.get_discrete_url(), "https://www.example.com/meta/property=og:url"
515 | )
516 | self.assertEqual(
517 | parsed.get_discrete_url(canonical_first=True, og_first=False),
518 | "http://example.com/meta/rel=canonical",
519 | )
520 | self.assertEqual(
521 | parsed.get_url_opengraph(), "https://www.example.com/meta/property=og:url"
522 | )
523 | self.assertEqual(
524 | parsed.get_url_canonical(), "http://example.com/meta/rel=canonical"
525 | )
526 |
527 | def test_encoding_fallback(self):
528 | """this tests simple.html to have certain fields"""
529 |         html = """<html><head></head><body>body</body></html>"""
530 | parsed = metadata_parser.MetadataParser(url=None, html=html)
531 | # typing scope
532 | assert parsed.response is not None
533 | self.assertEqual(parsed.response.encoding, "ISO-8859-1")
534 |
535 | def test_encoding_declared(self):
536 |         html = """<html><head><meta charset="UTF-8"></head><body>body</body></html>"""
537 | parsed = metadata_parser.MetadataParser(url=None, html=html)
538 | # typing scope
539 | assert parsed.response is not None
540 | self.assertEqual(parsed.response.encoding, "UTF-8")
541 |
542 | def test_complex_html(self):
543 | """
544 | this tests duplicates.html to have certain fields
545 |
546 | this also ensures some legacy behavior is supported
547 |
548 | such as calling both:
549 | * `parsed.parsed_result.get_metadatas`
550 | * `parsed.get_metadatas`
551 | """
552 | html = self._MakeOne("duplicates.html")
553 | parsed = metadata_parser.MetadataParser(url=None, html=html)
554 |
555 | # this is just a property and should be the same object
556 | self.assertIs(parsed.metadata, parsed.parsed_result.metadata)
557 |
558 |         # we should be tracking the version now
559 | self.assertIn("_v", parsed.metadata)
560 |
561 | # it should be the same version
562 | self.assertEqual(parsed.metadata_version, metadata_parser.ParsedResult._version)
563 | self.assertEqual(
564 | parsed.parsed_result.metadata_version, metadata_parser.ParsedResult._version
565 | )
566 |
567 | # we have 3 og:image entries in this file
568 | _computed_link = parsed.get_metadata_link("image", strategy=["og"])
569 | assert _computed_link == "https://www.example.com/meta/property=og:image"
570 | _all_og_images = parsed.get_metadatas("og:image")
571 | assert _all_og_images is not None
572 | assert len(_all_og_images) == 3
573 | assert "https://www.example.com/meta/property=og:image" in _all_og_images
574 | # bs4 cleans up the ampersand internally into an entity, but prints it deserialized by default
575 | assert (
576 | "https://www.example.com/meta?property=og:image&duplicate=1"
577 | in _all_og_images
578 | )
579 | assert (
580 | "https://www.example.com/meta?property=og:image&duplicate=2"
581 | in _all_og_images
582 | )
583 |
584 | # -----
585 | # this is a duplicate element and should be stored in the metadata dict as a list
586 | _citation_authors = [
587 | "citation_author:1",
588 | "citation_author:2",
589 | "citation_author:3",
590 | ]
591 | # these should be lists
592 | self.assertEqual(parsed.metadata["meta"]["citation_author"], _citation_authors)
593 | self.assertEqual(
594 | parsed.parsed_result.get_metadatas("citation_author", ["meta"]),
595 | _citation_authors,
596 | )
597 | self.assertEqual(
598 | parsed.get_metadatas("citation_author", ["meta"]), _citation_authors
599 | )
600 | # this is a string
601 | self.assertEqual(
602 | parsed.parsed_result.get_metadata("citation_author", ["meta"]),
603 | _citation_authors[0],
604 | )
605 | self.assertEqual(
606 | parsed.get_metadata("citation_author", ["meta"]), _citation_authors[0]
607 | )
608 |
609 | _meta_authors = ["meta.author:1", "meta.author:2"]
610 | # these should be lists
611 | self.assertEqual(parsed.metadata["meta"]["author"], _meta_authors)
612 | self.assertEqual(
613 | parsed.parsed_result.get_metadatas("author", ["meta"]), _meta_authors
614 | )
615 | self.assertEqual(parsed.get_metadatas("author", ["meta"]), _meta_authors)
616 | # this is a string
617 | self.assertEqual(
618 | parsed.parsed_result.get_metadata("author", ["meta"]), _meta_authors[0]
619 | )
620 | self.assertEqual(parsed.get_metadata("author", ["meta"]), _meta_authors[0])
621 |
622 | _meta_kws = ["meta.keywords:1", "meta.keywords:2"]
623 | # these should be lists
624 | self.assertEqual(parsed.metadata["meta"]["keywords"], _meta_kws)
625 | self.assertEqual(
626 | parsed.parsed_result.get_metadatas("keywords", ["meta"]), _meta_kws
627 | )
628 | self.assertEqual(parsed.get_metadatas("keywords", ["meta"]), _meta_kws)
629 | # this is a string
630 | self.assertEqual(
631 | parsed.parsed_result.get_metadata("keywords", ["meta"]), _meta_kws[0]
632 | )
633 | self.assertEqual(parsed.get_metadata("keywords", ["meta"]), _meta_kws[0])
634 |
635 | # -----
636 | # this is a single element and should be stored in the metadata dict as a string
637 | _description = "meta.description"
638 |
639 | # these should be lists
640 | self.assertEqual(
641 | parsed.parsed_result.get_metadatas("description", ["meta"]), [_description]
642 | )
643 | self.assertEqual(parsed.get_metadatas("description", ["meta"]), [_description])
644 |
645 | # this is a string
646 | self.assertEqual(parsed.metadata["meta"]["description"], _description)
647 | self.assertEqual(
648 | parsed.parsed_result.get_metadata("description", ["meta"]), _description
649 | )
650 | self.assertEqual(parsed.get_metadata("description", ["meta"]), _description)
651 |
652 | # -----
653 | # dc creator has a language variant
654 | # 'dc': {'Creator': [{'content': 'Plato'},
655 | # {'content': 'Platon', 'lang': 'fr'}],
656 |
657 | self.assertIn("Creator", parsed.metadata["dc"])
658 | dc_creator = parsed.metadata["dc"]["Creator"]
659 | # so this should be a list
660 | self.assertIs(type(dc_creator), list)
661 | # with a length of 2
662 | self.assertEqual(len(dc_creator), 2)
663 | self.assertIs(type(dc_creator[0]), dict)
664 | self.assertIs(type(dc_creator[1]), dict)
665 | self.assertIn("content", dc_creator[0])
666 | self.assertEqual(dc_creator[0]["content"], "Plato")
667 | self.assertIn("content", dc_creator[1])
668 | self.assertEqual(dc_creator[1]["content"], "Platon")
669 | self.assertIn("lang", dc_creator[1])
670 | self.assertEqual(dc_creator[1]["lang"], "fr")
671 |
672 | # -----
673 | # dc subject has a scheme variant
674 | # 'Subject': [{'content': 'heart attack'},
675 | # {'content': 'Myocardial Infarction; Pericardial Effusion',
676 | # 'scheme': 'MESH'},
677 | # {'content': 'vietnam war'},
678 | # {'content': 'Vietnamese Conflict, 1961-1975',
679 | # 'scheme': 'LCSH'},
680 | # {'content': 'Friendship'},
681 | # {'content': '158.25', 'scheme': 'ddc'}]},
682 | dcSubjectsExpected = [
683 | {"content": "heart attack"},
684 | {
685 | "content": "Myocardial Infarction; Pericardial Effusion",
686 | "scheme": "MESH",
687 | },
688 | {"content": "vietnam war"},
689 | {"content": "Vietnamese Conflict, 1961-1975", "scheme": "LCSH"},
690 | {"content": "Friendship"},
691 | {"content": "158.25", "scheme": "ddc"},
692 | ]
693 | self.assertIn("Subject", parsed.metadata["dc"])
694 | dc_subject = parsed.metadata["dc"]["Subject"]
695 | self.assertIs(type(dc_subject), list)
696 | self.assertEqual(len(dc_subject), len(dcSubjectsExpected))
697 | for idx, _expected in enumerate(dc_subject):
698 | self.assertIs(type(dc_subject[idx]), dict)
699 | self.assertEqual(
700 | len(dc_subject[idx].keys()), len(dcSubjectsExpected[idx].keys())
701 | )
702 | self.assertEqual(
703 | sorted(dc_subject[idx].keys()), sorted(dcSubjectsExpected[idx].keys())
704 | )
705 | for _key in dc_subject[idx].keys():
706 | self.assertEqual(dc_subject[idx][_key], dcSubjectsExpected[idx][_key])
707 |
708 | # -----
709 | # dc TestMixedCandidates1
710 | # handle the ordering of results
711 | # the raw info tested is the same as the above Subject test...
712 | dcTestMixedCandidates1aExpected = {"content": "Friendship"}
713 | self.assertIn("TestMixedCandidates1a", parsed.metadata["dc"])
714 | dc_mixed_candidates = parsed.metadata["dc"]["TestMixedCandidates1a"]
715 | self.assertIs(type(dc_mixed_candidates), dict)
716 | self.assertEqual(
717 | len(dc_mixed_candidates.keys()), len(dcTestMixedCandidates1aExpected.keys())
718 | )
719 | self.assertEqual(
720 | sorted(dc_mixed_candidates.keys()),
721 | sorted(dcTestMixedCandidates1aExpected.keys()),
722 | )
723 | for _key in dc_mixed_candidates.keys():
724 | self.assertEqual(
725 | dc_mixed_candidates[_key], dcTestMixedCandidates1aExpected[_key]
726 | )
727 | # but we need to test get_metadata and get_metadatas
728 | with self.assertRaises(ValueError) as cm:
729 | parsed.get_metadata("TestMixedCandidates1a", strategy="dc")
730 | self.assertEqual(
731 | cm.exception.args[0], "If `strategy` is not a `list`, it must be 'all'."
732 | )
733 |
734 | self.assertEqual(
735 | parsed.get_metadata("TestMixedCandidates1a", strategy=["dc"]), "Friendship"
736 | )
737 | self.assertEqual(
738 | parsed.get_metadatas("TestMixedCandidates1a", strategy=["dc"]),
739 | [dcTestMixedCandidates1aExpected],
740 | )
741 | self.assertEqual(
742 | parsed.get_metadata(
743 | "TestMixedCandidates1a", strategy=["dc"], encoder=encoder_capitalizer
744 | ),
745 | "FRIENDSHIP",
746 | )
747 | self.assertEqual(
748 | parsed.get_metadatas(
749 | "TestMixedCandidates1a", strategy=["dc"], encoder=encoder_capitalizer
750 | ),
751 | [{"CONTENT": "FRIENDSHIP"}],
752 | )
753 |
754 | # 1b
755 | dcTestMixedCandidates1bExpected = {"content": "158.25", "scheme": "ddc"}
756 | self.assertIn("TestMixedCandidates1b", parsed.metadata["dc"])
757 | dc_mixed_candidates = parsed.metadata["dc"]["TestMixedCandidates1b"]
758 | self.assertIs(type(dc_mixed_candidates), dict)
759 | self.assertEqual(
760 | len(dc_mixed_candidates.keys()), len(dcTestMixedCandidates1bExpected.keys())
761 | )
762 | self.assertEqual(
763 | sorted(dc_mixed_candidates.keys()),
764 | sorted(dcTestMixedCandidates1bExpected.keys()),
765 | )
766 | for _key in dc_mixed_candidates.keys():
767 | self.assertEqual(
768 | dc_mixed_candidates[_key], dcTestMixedCandidates1bExpected[_key]
769 | )
770 | # but we need to test get_metadata and get_metadatas
771 | self.assertEqual(
772 | parsed.get_metadata("TestMixedCandidates1b", strategy=["dc"]), "158.25"
773 | )
774 | self.assertEqual(
775 | parsed.get_metadatas("TestMixedCandidates1b", strategy=["dc"]),
776 | [dcTestMixedCandidates1bExpected],
777 | )
778 | self.assertEqual(
779 | parsed.get_metadata(
780 | "TestMixedCandidates1b", strategy=["dc"], encoder=encoder_capitalizer
781 | ),
782 | "158.25",
783 | )
784 | self.assertEqual(
785 | parsed.get_metadatas(
786 | "TestMixedCandidates1b", strategy=["dc"], encoder=encoder_capitalizer
787 | ),
788 | [{"CONTENT": "158.25", "SCHEME": "DDC"}],
789 | )
790 |
791 | # -----
792 | # dc TestMixedCandidates2
793 | # handle the ordering of results
794 | # the raw info tested is the same as the above Subject test...
795 | dcTestMixedCandidates2aExpected = [
796 | {"content": "158.25", "scheme": "ddc"},
797 | {"content": "Friendship"},
798 | ]
799 | self.assertIn("TestMixedCandidates2a", parsed.metadata["dc"])
800 | dc_mixed_candidates = parsed.metadata["dc"]["TestMixedCandidates2a"]
801 | self.assertIs(type(dc_mixed_candidates), list)
802 | self.assertEqual(len(dc_mixed_candidates), len(dcTestMixedCandidates2aExpected))
803 | for idx, _expected in enumerate(dc_mixed_candidates):
804 | self.assertIs(type(dc_mixed_candidates[idx]), dict)
805 | self.assertEqual(
806 | len(dc_mixed_candidates[idx].keys()),
807 | len(dcTestMixedCandidates2aExpected[idx].keys()),
808 | )
809 | self.assertEqual(
810 | sorted(dc_mixed_candidates[idx].keys()),
811 | sorted(dcTestMixedCandidates2aExpected[idx].keys()),
812 | )
813 | for _key in dc_mixed_candidates[idx].keys():
814 | self.assertEqual(
815 | dc_mixed_candidates[idx][_key],
816 | dcTestMixedCandidates2aExpected[idx][_key],
817 | )
818 | # but we need to test get_metadata and get_metadatas
819 |
820 | self.assertEqual(
821 | parsed.get_metadata("TestMixedCandidates2a", strategy=["dc"]), "Friendship"
822 | )
823 | self.assertEqual(
824 | parsed.get_metadatas("TestMixedCandidates2a", strategy=["dc"]),
825 | dcTestMixedCandidates2aExpected,
826 | )
827 | self.assertEqual(
828 | parsed.get_metadata(
829 | "TestMixedCandidates2a", strategy=["dc"], encoder=encoder_capitalizer
830 | ),
831 | "FRIENDSHIP",
832 | )
833 | self.assertEqual(
834 | parsed.get_metadatas(
835 | "TestMixedCandidates2a", strategy=["dc"], encoder=encoder_capitalizer
836 | ),
837 | [{"CONTENT": "158.25", "SCHEME": "DDC"}, {"CONTENT": "FRIENDSHIP"}],
838 | )
839 |
840 | # 2b
841 | dcTestMixedCandidates2bExpected = [
842 | {"content": "Friendship"},
843 | {"content": "158.25", "scheme": "ddc"},
844 | ]
845 | self.assertIn("TestMixedCandidates2b", parsed.metadata["dc"])
846 | dc_mixed_candidates = parsed.metadata["dc"]["TestMixedCandidates2b"]
847 | self.assertIs(type(dc_mixed_candidates), list)
848 | self.assertEqual(len(dc_mixed_candidates), len(dcTestMixedCandidates2bExpected))
849 | for idx, _expected in enumerate(dc_mixed_candidates):
850 | self.assertIs(type(dc_mixed_candidates[idx]), dict)
851 | self.assertEqual(
852 | len(dc_mixed_candidates[idx].keys()),
853 | len(dcTestMixedCandidates2bExpected[idx].keys()),
854 | )
855 | self.assertEqual(
856 | sorted(dc_mixed_candidates[idx].keys()),
857 | sorted(dcTestMixedCandidates2bExpected[idx].keys()),
858 | )
859 | for _key in dc_mixed_candidates[idx].keys():
860 | self.assertEqual(
861 | dc_mixed_candidates[idx][_key],
862 | dcTestMixedCandidates2bExpected[idx][_key],
863 | )
864 | # but we need to test get_metadata and get_metadatas
865 | self.assertEqual(
866 | parsed.get_metadata("TestMixedCandidates2b", strategy=["dc"]), "Friendship"
867 | )
868 | self.assertEqual(
869 | parsed.get_metadatas("TestMixedCandidates2b", strategy=["dc"]),
870 | dcTestMixedCandidates2bExpected,
871 | )
872 | self.assertEqual(
873 | parsed.get_metadata(
874 | "TestMixedCandidates2b", strategy=["dc"], encoder=encoder_capitalizer
875 | ),
876 | "FRIENDSHIP",
877 | )
878 | self.assertEqual(
879 | parsed.get_metadatas(
880 | "TestMixedCandidates2b", strategy=["dc"], encoder=encoder_capitalizer
881 | ),
882 | [{"CONTENT": "FRIENDSHIP"}, {"CONTENT": "158.25", "SCHEME": "DDC"}],
883 | )
884 |
885 | # ok, mixedfield tests:
886 | # TestMixedField0
887 | self.assertEqual(parsed.get_metadata("TestMixedField0", strategy=["dc"]), None)
888 | self.assertEqual(
889 | parsed.get_metadata("TestMixedField0", strategy=["meta"]),
890 | "meta:TestMixedField0",
891 | )
892 | self.assertEqual(
893 | parsed.get_metadata("TestMixedField0", strategy="all"),
894 | {"meta": "meta:TestMixedField0"},
895 | )
896 | self.assertEqual(
897 | parsed.get_metadata(
898 | "TestMixedField0", strategy=["dc"], encoder=encoder_capitalizer
899 | ),
900 | None,
901 | )
902 | self.assertEqual(
903 | parsed.get_metadata(
904 | "TestMixedField0", strategy=["meta"], encoder=encoder_capitalizer
905 | ),
906 | "META:TESTMIXEDFIELD0",
907 | )
908 | self.assertEqual(
909 | parsed.get_metadata(
910 | "TestMixedField0", strategy="all", encoder=encoder_capitalizer
911 | ),
912 | {"meta": "META:TESTMIXEDFIELD0"},
913 | )
914 | self.assertEqual(parsed.get_metadatas("TestMixedField0", strategy=["dc"]), None)
915 | self.assertEqual(
916 | parsed.get_metadatas("TestMixedField0", strategy=["meta"]),
917 | ["meta:TestMixedField0"],
918 | )
919 | self.assertEqual(
920 | parsed.get_metadatas("TestMixedField0", strategy="all"),
921 | {"meta": ["meta:TestMixedField0"]},
922 | )
923 | self.assertEqual(
924 | parsed.get_metadatas(
925 | "TestMixedField0", strategy=["dc"], encoder=encoder_capitalizer
926 | ),
927 | None,
928 | )
929 | self.assertEqual(
930 | parsed.get_metadatas(
931 | "TestMixedField0", strategy=["meta"], encoder=encoder_capitalizer
932 | ),
933 | ["META:TESTMIXEDFIELD0"],
934 | )
935 | self.assertEqual(
936 | parsed.get_metadatas(
937 | "TestMixedField0", strategy="all", encoder=encoder_capitalizer
938 | ),
939 | {"meta": ["META:TESTMIXEDFIELD0"]},
940 | )
941 |
942 | # TestMixedField1
943 | self.assertEqual(
944 | parsed.get_metadata("TestMixedField1", strategy=["dc"]),
945 | "dc:TestMixedField1",
946 | )
947 | self.assertEqual(
948 | parsed.get_metadata("TestMixedField1", strategy=["meta"]),
949 | "meta:TestMixedField1",
950 | )
951 | self.assertEqual(
952 | parsed.get_metadata("TestMixedField1", strategy="all"),
953 | {"meta": "meta:TestMixedField1", "dc": "dc:TestMixedField1"},
954 | )
955 | self.assertEqual(
956 | parsed.get_metadata(
957 | "TestMixedField1", strategy=["dc"], encoder=encoder_capitalizer
958 | ),
959 | "DC:TESTMIXEDFIELD1",
960 | )
961 | self.assertEqual(
962 | parsed.get_metadata(
963 | "TestMixedField1", strategy=["meta"], encoder=encoder_capitalizer
964 | ),
965 | "META:TESTMIXEDFIELD1",
966 | )
967 | self.assertEqual(
968 | parsed.get_metadata(
969 | "TestMixedField1", strategy="all", encoder=encoder_capitalizer
970 | ),
971 | {"meta": "META:TESTMIXEDFIELD1", "dc": "DC:TESTMIXEDFIELD1"},
972 | )
973 | self.assertEqual(
974 | parsed.get_metadatas("TestMixedField1", strategy=["dc"]),
975 | [{"content": "dc:TestMixedField1"}],
976 | )
977 | self.assertEqual(
978 | parsed.get_metadatas("TestMixedField1", strategy=["meta"]),
979 | ["meta:TestMixedField1"],
980 | )
981 | self.assertEqual(
982 | parsed.get_metadatas("TestMixedField1", strategy="all"),
983 | {
984 | "meta": ["meta:TestMixedField1"],
985 | "dc": [{"content": "dc:TestMixedField1"}],
986 | },
987 | )
988 | self.assertEqual(
989 | parsed.get_metadatas(
990 | "TestMixedField1", strategy=["dc"], encoder=encoder_capitalizer
991 | ),
992 | [{"CONTENT": "DC:TESTMIXEDFIELD1"}],
993 | )
994 | self.assertEqual(
995 | parsed.get_metadatas(
996 | "TestMixedField1", strategy=["meta"], encoder=encoder_capitalizer
997 | ),
998 | ["META:TESTMIXEDFIELD1"],
999 | )
1000 | self.assertEqual(
1001 | parsed.get_metadatas(
1002 | "TestMixedField1", strategy="all", encoder=encoder_capitalizer
1003 | ),
1004 | {
1005 | "meta": ["META:TESTMIXEDFIELD1"],
1006 | "dc": [{"CONTENT": "DC:TESTMIXEDFIELD1"}],
1007 | },
1008 | )
1009 | # TestMixedField2
1010 | self.assertEqual(
1011 | parsed.get_metadata("TestMixedField2", strategy=["dc"]),
1012 | "dc:TestMixedField2",
1013 | )
1014 | self.assertEqual(
1015 | parsed.get_metadata("TestMixedField2", strategy=["meta"]),
1016 | "meta:TestMixedField2",
1017 | )
1018 | self.assertEqual(
1019 | parsed.get_metadata("TestMixedField2", strategy="all"),
1020 | {"meta": "meta:TestMixedField2", "dc": "dc:TestMixedField2"},
1021 | )
1022 | self.assertEqual(
1023 | parsed.get_metadata(
1024 | "TestMixedField2", strategy=["dc"], encoder=encoder_capitalizer
1025 | ),
1026 | "DC:TESTMIXEDFIELD2",
1027 | )
1028 | self.assertEqual(
1029 | parsed.get_metadata(
1030 | "TestMixedField2", strategy=["meta"], encoder=encoder_capitalizer
1031 | ),
1032 | "META:TESTMIXEDFIELD2",
1033 | )
1034 | self.assertEqual(
1035 | parsed.get_metadata(
1036 | "TestMixedField2", strategy="all", encoder=encoder_capitalizer
1037 | ),
1038 | {"meta": "META:TESTMIXEDFIELD2", "dc": "DC:TESTMIXEDFIELD2"},
1039 | )
1040 | self.assertEqual(
1041 | parsed.get_metadatas("TestMixedField2", strategy=["dc"]),
1042 | [
1043 | {"content": "dc:TestMixedField2"},
1044 | {"content": "dc:TestMixedField2.ddc", "scheme": "ddc"},
1045 | ],
1046 | )
1047 | self.assertEqual(
1048 | parsed.get_metadatas("TestMixedField2", strategy=["meta"]),
1049 | ["meta:TestMixedField2"],
1050 | )
1051 | self.assertEqual(
1052 | parsed.get_metadatas("TestMixedField2", strategy="all"),
1053 | {
1054 | "meta": ["meta:TestMixedField2"],
1055 | "dc": [
1056 | {"content": "dc:TestMixedField2"},
1057 | {"content": "dc:TestMixedField2.ddc", "scheme": "ddc"},
1058 | ],
1059 | },
1060 | )
1061 | self.assertEqual(
1062 | parsed.get_metadatas(
1063 | "TestMixedField2", strategy=["dc"], encoder=encoder_capitalizer
1064 | ),
1065 | [
1066 | {"CONTENT": "DC:TESTMIXEDFIELD2"},
1067 | {"CONTENT": "DC:TESTMIXEDFIELD2.DDC", "SCHEME": "DDC"},
1068 | ],
1069 | )
1070 | self.assertEqual(
1071 | parsed.get_metadatas(
1072 | "TestMixedField2", strategy=["meta"], encoder=encoder_capitalizer
1073 | ),
1074 | ["META:TESTMIXEDFIELD2"],
1075 | )
1076 | self.assertEqual(
1077 | parsed.get_metadatas(
1078 | "TestMixedField2", strategy="all", encoder=encoder_capitalizer
1079 | ),
1080 | {
1081 | "meta": ["META:TESTMIXEDFIELD2"],
1082 | "dc": [
1083 | {"CONTENT": "DC:TESTMIXEDFIELD2"},
1084 | {"CONTENT": "DC:TESTMIXEDFIELD2.DDC", "SCHEME": "DDC"},
1085 | ],
1086 | },
1087 | )
1088 |
1089 | # TestMixedField3
1090 | self.assertEqual(
1091 | parsed.get_metadata("TestMixedField3", strategy=["dc"]),
1092 | "dc:TestMixedField3",
1093 | )
1094 | self.assertEqual(
1095 | parsed.get_metadata("TestMixedField3", strategy=["meta"]),
1096 | "meta:TestMixedField3",
1097 | )
1098 | self.assertEqual(
1099 | parsed.get_metadata("TestMixedField3", strategy="all"),
1100 | {"meta": "meta:TestMixedField3", "dc": "dc:TestMixedField3"},
1101 | )
1102 | self.assertEqual(
1103 | parsed.get_metadata(
1104 | "TestMixedField3", strategy=["dc"], encoder=encoder_capitalizer
1105 | ),
1106 | "DC:TESTMIXEDFIELD3",
1107 | )
1108 | self.assertEqual(
1109 | parsed.get_metadata(
1110 | "TestMixedField3", strategy=["meta"], encoder=encoder_capitalizer
1111 | ),
1112 | "META:TESTMIXEDFIELD3",
1113 | )
1114 | self.assertEqual(
1115 | parsed.get_metadata(
1116 | "TestMixedField3", strategy="all", encoder=encoder_capitalizer
1117 | ),
1118 | {"meta": "META:TESTMIXEDFIELD3", "dc": "DC:TESTMIXEDFIELD3"},
1119 | )
1120 | self.assertEqual(
1121 | parsed.get_metadatas("TestMixedField3", strategy=["dc"]),
1122 | [{"content": "dc:TestMixedField3"}],
1123 | )
1124 | self.assertEqual(
1125 | parsed.get_metadatas("TestMixedField3", strategy=["meta"]),
1126 | ["meta:TestMixedField3"],
1127 | )
1128 | self.assertEqual(
1129 | parsed.get_metadatas("TestMixedField3", strategy="all"),
1130 | {
1131 | "meta": ["meta:TestMixedField3"],
1132 | "dc": [{"content": "dc:TestMixedField3"}],
1133 | },
1134 | )
1135 | self.assertEqual(
1136 | parsed.get_metadatas(
1137 | "TestMixedField3", strategy=["dc"], encoder=encoder_capitalizer
1138 | ),
1139 | [{"CONTENT": "DC:TESTMIXEDFIELD3"}],
1140 | )
1141 | self.assertEqual(
1142 | parsed.get_metadatas(
1143 | "TestMixedField3", strategy=["meta"], encoder=encoder_capitalizer
1144 | ),
1145 | ["META:TESTMIXEDFIELD3"],
1146 | )
1147 | self.assertEqual(
1148 | parsed.get_metadatas(
1149 | "TestMixedField3", strategy="all", encoder=encoder_capitalizer
1150 | ),
1151 | {
1152 | "meta": ["META:TESTMIXEDFIELD3"],
1153 | "dc": [{"CONTENT": "DC:TESTMIXEDFIELD3"}],
1154 | },
1155 | )
1156 |
1157 | self.assertEqual(parsed.get_metadata("news_keywords", strategy=["meta"]), "")
1158 | self.assertEqual(
1159 | parsed.get_metadata("auto-publish", strategy=["meta"]), "timely"
1160 | )
1161 | self.assertEqual(
1162 | parsed.get_metadata("article:modified_time", strategy=["meta"]),
1163 | "2017-10-11 01:01:01",
1164 | )
1165 | self.assertEqual(
1166 | parsed.get_metadata("msapplication-tap-highlight", strategy=["meta"]), "no"
1167 | )
1168 | self.assertEqual(
1169 | parsed.get_metadata("google-site-verification", strategy=["meta"]),
1170 | "123123123",
1171 | )
1172 | self.assertEqual(
1173 | parsed.get_metadata("twitter:data1", strategy=["meta"]), "8 min read"
1174 | )
1175 | self.assertEqual(
1176 | parsed.get_metadata("google", strategy=["meta"]), "notranslate"
1177 | )
1178 | self.assertEqual(parsed.get_metadata("news_keywords", strategy=["meta"]), "")
1179 | self.assertEqual(
1180 | parsed.get_metadatas("viewport", strategy=["meta"]),
1181 | [
1182 | "width=device-width,initial-scale=1,maximum-scale=1,user-scalable=no",
1183 | "width=device-width, initial-scale=1, maximum-scale=1",
1184 | ],
1185 | )
1186 | self.assertEqual(
1187 | parsed.get_metadata("thumbnail", strategy=["meta"]),
1188 | "https://example.com/path/to/image.jpg",
1189 | )
1190 | self.assertEqual(
1191 | parsed.get_metadata_link("thumbnail", strategy=["meta"]),
1192 | "https://example.com/path/to/image.jpg",
1193 | )
1194 | self.assertEqual(
1195 | parsed.get_metadata("thumbnail-2", strategy=["meta"]),
1196 | "//example.com/path/to/image.jpg",
1197 | )
1198 | self.assertEqual(
1199 | parsed.get_metadata_link("thumbnail-2", strategy=["meta"]), None
1200 | )
1201 | self.assertEqual(
1202 | parsed.get_metadata("thumbnail-3", strategy=["meta"]), "/path/to/image.jpg"
1203 | )
1204 | self.assertEqual(
1205 | parsed.get_metadata_link("thumbnail-3", strategy=["meta"]), None
1206 | )
1207 |
1208 | # this should error!
1209 | with self.assertRaises(ValueError) as cm:
1210 | parsed.get_metadatas("canonical", strategy=["all"])
1211 | self.assertEqual(
1212 | cm.exception.args[0], 'Submit "all" as a `str`, not in a `list`.'
1213 | )
1214 |
1215 | # ok, now test the return types
1216 |         # some behavior was changed in the 0.7 release
1217 |
1218 | # get_metadata - single section
1219 | self.assertEqual(
1220 | parsed.get_metadata("canonical", strategy=["page"]),
1221 | "http://example.com/meta/rel=canonical",
1222 | )
1223 | self.assertEqual(parsed.get_metadata("canonical", strategy=["meta"]), None)
1224 | self.assertEqual(
1225 | parsed.get_metadata("canonical", strategy="all"),
1226 | {"page": "http://example.com/meta/rel=canonical"},
1227 | )
1228 |
1229 | # get_metadatas - single section
1230 | self.assertEqual(
1231 | parsed.get_metadatas("canonical", strategy=["page"]),
1232 | [
1233 | "http://example.com/meta/rel=canonical",
1234 | ],
1235 | )
1236 | self.assertEqual(parsed.get_metadatas("canonical", strategy=["meta"]), None)
1237 | self.assertEqual(
1238 | parsed.get_metadatas("canonical", strategy="all"),
1239 | {
1240 | "page": [
1241 | "http://example.com/meta/rel=canonical",
1242 | ]
1243 | },
1244 | )
1245 |
1246 | # get_metadata - multiple section
1247 | self.assertEqual(
1248 | parsed.get_metadata("description", strategy=["meta"]), "meta.description"
1249 | )
1250 | self.assertEqual(
1251 | parsed.get_metadata("description", strategy="all"),
1252 | {
1253 | "og": "meta.property=og:description",
1254 | "meta": "meta.description",
1255 | "twitter": "meta.name=twitter:description",
1256 | },
1257 | )
1258 | # get_metadatas - multiple section
1259 | self.assertEqual(
1260 | parsed.get_metadatas("description", strategy=["meta"]), ["meta.description"]
1261 | )
1262 | self.assertEqual(
1263 | parsed.get_metadatas("description", strategy="all"),
1264 | {
1265 | "og": [
1266 | "meta.property=og:description",
1267 | ],
1268 | "meta": [
1269 | "meta.description",
1270 | ],
1271 | "twitter": ["meta.name=twitter:description"],
1272 | },
1273 | )
1274 |
1275 | # multiple candidates!
1276 | self.assertEqual(
1277 | parsed.get_metadata("keywords", strategy=["meta"]), "meta.keywords:1"
1278 | )
1279 | self.assertEqual(
1280 | parsed.get_metadatas("keywords", strategy=["meta"]),
1281 | ["meta.keywords:1", "meta.keywords:2"],
1282 | )
1283 |
1284 | def test_malformed_twitter(self):
1285 | """
1286 | this tests simple.html to have certain fields
1287 | python -munittest tests.document_parsing.TestDocumentParsing.test_malformed_twitter
1288 | """
1289 | html = self._MakeOne("simple.html")
1290 |
1291 | # the default behavior is to not support malformed
1292 | # that means we should consult 'value' for data and 'label'
1293 | # in `simple.html`, "label" (incorrectly) uses "content" and "data" uses "label"
1294 | parsed = metadata_parser.MetadataParser(url=None, html=html)
1295 | self.assertEqual(
1296 | parsed.metadata["twitter"]["data"], "meta.name=twitter:data||value"
1297 | )
1298 | self.assertNotIn("label", parsed.metadata["twitter"])
1299 | self.assertNotIn("invalid", parsed.metadata["twitter"])
1300 |
1301 | # now with `support_malformed` support we will load the label!
1302 | parsed2 = metadata_parser.MetadataParser(
1303 | url=None, html=html, support_malformed=True
1304 | )
1305 | self.assertEqual(
1306 | parsed2.metadata["twitter"]["data"], "meta.name=twitter:data||value"
1307 | )
1308 | self.assertEqual(
1309 | parsed2.metadata["twitter"]["label"], "meta.name=twitter:label||content"
1310 | )
1311 | self.assertEqual(
1312 | parsed2.metadata["twitter"]["invalid"], "meta.name=twitter:invalid"
1313 | )
1314 |
1315 | # try it with dupes...
1316 | html_dupes = self._MakeOne("duplicates.html")
1317 | parsed_dupe = metadata_parser.MetadataParser(url=None, html=html_dupes)
1318 | # two items for each of data/label, but label is empty strings
1319 | self.assertEqual(
1320 | parsed_dupe.metadata["twitter"]["data"],
1321 | ["meta.name=twitter:data||value,1", "meta.name=twitter:data||value,2"],
1322 | )
1323 |         self.assertNotIn("label", parsed_dupe.metadata["twitter"])
1324 |
1325 | # everyone is happy when metadata is malformed!
1326 | parsed_dupe = metadata_parser.MetadataParser(
1327 | url=None, html=html_dupes, support_malformed=True
1328 | )
1329 | self.assertEqual(
1330 | parsed_dupe.metadata["twitter"]["data"],
1331 | ["meta.name=twitter:data||value,1", "meta.name=twitter:data||value,2"],
1332 | )
1333 | self.assertEqual(
1334 | parsed_dupe.metadata["twitter"]["label"],
1335 | [
1336 | "meta.name=twitter:label||content,1",
1337 | "meta.name=twitter:label||content,2",
1338 | ],
1339 | )
1340 |
1341 | def test_charsets(self):
1342 | """
1343 | python -m unittest tests.document_parsing.TestDocumentParsing.test_charsets
1344 | """
1345 | a_html = self._MakeOne("charset_a.html")
1346 | a_parsed = metadata_parser.MetadataParser(url=None, html=a_html)
1347 | self.assertEqual(
1348 | a_parsed.metadata["meta"]["content-type"], "text/html; charset=UTF-8"
1349 | )
1350 |
1351 | b_html = self._MakeOne("charset_b.html")
1352 | b_parsed = metadata_parser.MetadataParser(url=None, html=b_html)
1353 | self.assertEqual(b_parsed.metadata["meta"]["charset"], "UTF-8")
1354 |
1355 | c_html = self._MakeOne("charset_c.html")
1356 | c_parsed = metadata_parser.MetadataParser(url=None, html=c_html)
1357 | self.assertEqual(c_parsed.metadata["meta"]["charset"], "UTF-8")
1358 |
1359 |
1360 | class Test_UrlParserCacheable(unittest.TestCase):
1361 | """
1362 | python -m unittest tests.document_parsing.Test_UrlParserCacheable
1363 | """
1364 |
1365 | def test__default(self):
1366 | """MetadataParser()"""
1367 | errors = _docs_test_parser(
1368 | [
1369 | "good-canonical-relative",
1370 | "good-canonical-relative_alt",
1371 | "good-og-relative_alt",
1372 | ],
1373 | "*no-kwarg",
1374 | )
1375 | if errors:
1376 | raise ValueError(errors)
1377 |
1378 | def test__True(self):
1379 | """MetadataParser(cached_urlparser=True)"""
1380 | errors = _docs_test_parser(
1381 | [
1382 | "good-canonical-relative",
1383 | "good-canonical-relative_alt",
1384 | "good-og-relative_alt",
1385 | ],
1386 | True,
1387 | )
1388 | if errors:
1389 | raise ValueError(errors)
1390 |
1391 | def test__Int_1(self):
1392 | """MetadataParser(cached_urlparser=1)"""
1393 | with warnings.catch_warnings(record=True) as warned:
1394 | warnings.simplefilter("always")
1395 | errors = _docs_test_parser(
1396 | [
1397 | "good-canonical-relative",
1398 | "good-canonical-relative_alt",
1399 | "good-og-relative_alt",
1400 | ],
1401 | 1,
1402 | )
1403 | if errors:
1404 | raise ValueError(errors)
1405 | assert len(warned) >= 1
1406 | _found = False
1407 | for w in warned:
1408 | if isinstance(w.message, FutureWarning):
1409 | if w.message.args[0].startswith(
1410 | "Supplying an int to `cached_urlparser` to set maxitems is deprecated."
1411 | ):
1412 | _found = True
1413 | assert (
1414 | "Supply `cached_urlparser=True, cached_urlparser_maxitems=int` instead."
1415 | in w.message.args[0]
1416 | )
1417 | assert _found is True
1418 |
1419 | def test__Int_0(self):
1420 |         """MetadataParser(cached_urlparser=0)"""
1421 | with warnings.catch_warnings(record=True) as warned:
1422 | warnings.simplefilter("always")
1423 | errors = _docs_test_parser(
1424 | [
1425 | "good-canonical-relative",
1426 | "good-canonical-relative_alt",
1427 | "good-og-relative_alt",
1428 | ],
1429 | 0,
1430 | )
1431 | if errors:
1432 | raise ValueError(errors)
1433 | assert len(warned) >= 1
1434 | _found = False
1435 | for w in warned:
1436 | if isinstance(w.message, FutureWarning):
1437 | if w.message.args[0].startswith(
1438 | "Supplying `0` to `cached_urlparser` to set maxitems is deprecated."
1439 | ):
1440 | _found = True
1441 | assert (
1442 | "Supply `cached_urlparser=False` instead"
1443 | in w.message.args[0]
1444 | )
1445 | assert _found is True
1446 |
1447 | def test__None(self):
1448 | errors = _docs_test_parser(
1449 | [
1450 | "good-canonical-relative",
1451 | "good-canonical-relative_alt",
1452 | "good-og-relative_alt",
1453 | ],
1454 | None,
1455 | )
1456 | if errors:
1457 | raise ValueError(errors)
1458 |
1459 | def test__False(self):
1460 | errors = _docs_test_parser(
1461 | [
1462 | "good-canonical-relative",
1463 | "good-canonical-relative_alt",
1464 | "good-og-relative_alt",
1465 | ],
1466 | False,
1467 | )
1468 | if errors:
1469 | raise ValueError(errors)
1470 |
1471 | def test__CustomParser(self):
1472 | custom_parser_obj = metadata_parser.UrlParserCacheable()
1473 | custom_parser = custom_parser_obj.urlparse
1474 | errors = _docs_test_parser(
1475 | [
1476 | "good-canonical-relative",
1477 | "good-canonical-relative_alt",
1478 | "good-og-relative_alt",
1479 | ],
1480 | custom_parser,
1481 | )
1482 | if errors:
1483 | raise ValueError(errors)
1484 |
1485 |
1486 | class Test_UrlParserCacheable_MaxItems(unittest.TestCase):
1487 |
1488 | def test__default(self):
1489 | """MetadataParser()"""
1490 | errors = _docs_test_parser(
1491 | [
1492 | "good-canonical-relative",
1493 | "good-canonical-relative_alt",
1494 | "good-og-relative_alt",
1495 | ],
1496 | "*no-kwarg",
1497 | cached_urlparser_maxitems=1,
1498 | )
1499 | if errors:
1500 | raise ValueError(errors)
1501 |
1502 | def test__True(self):
1503 |         # this should succeed; `cached_urlparser=True` accepts `cached_urlparser_maxitems`
1504 | errors = _docs_test_parser(
1505 | [
1506 | "good-canonical-relative",
1507 | "good-canonical-relative_alt",
1508 | "good-og-relative_alt",
1509 | ],
1510 | True,
1511 | cached_urlparser_maxitems=1,
1512 | )
1513 | if errors:
1514 | raise ValueError(errors)
1515 |
1516 | def test__False(self):
1517 | # this should fail
1518 | with self.assertRaises(ValueError) as cm:
1519 | errors = _docs_test_parser(
1520 | [
1521 | "good-canonical-relative",
1522 | "good-canonical-relative_alt",
1523 | "good-og-relative_alt",
1524 | ],
1525 | False,
1526 | cached_urlparser_maxitems=1,
1527 | )
1528 | if errors:
1529 | raise ValueError(errors)
1530 | assert isinstance(cm.exception, ValueError)
1531 | assert (
1532 | cm.exception.args[0]
1533 | == "`cached_urlparser_maxitems` requires `cached_urlparser=True`"
1534 | )
1535 |
1536 | def test__Int_1(self):
1537 | # this should fail
1538 | with self.assertRaises(ValueError) as cm:
1539 | errors = _docs_test_parser(
1540 | [
1541 | "good-canonical-relative",
1542 | "good-canonical-relative_alt",
1543 | "good-og-relative_alt",
1544 | ],
1545 | 1,
1546 | cached_urlparser_maxitems=1,
1547 | )
1548 | if errors:
1549 | raise ValueError(errors)
1550 | assert isinstance(cm.exception, ValueError)
1551 | assert (
1552 | cm.exception.args[0]
1553 | == "`cached_urlparser_maxitems` requires `cached_urlparser=True`"
1554 | )
1555 |
1556 | def test__Int_0(self):
1557 | # this should fail
1558 | with self.assertRaises(ValueError) as cm:
1559 | errors = _docs_test_parser(
1560 | [
1561 | "good-canonical-relative",
1562 | "good-canonical-relative_alt",
1563 | "good-og-relative_alt",
1564 | ],
1565 | 0,
1566 | cached_urlparser_maxitems=1,
1567 | )
1568 | if errors:
1569 | raise ValueError(errors)
1570 | assert isinstance(cm.exception, ValueError)
1571 | assert (
1572 | cm.exception.args[0]
1573 | == "`cached_urlparser_maxitems` requires `cached_urlparser=True`"
1574 | )
1575 |
1576 | def test__None(self):
1577 | # this should fail
1578 | with self.assertRaises(ValueError) as cm:
1579 | errors = _docs_test_parser(
1580 | [
1581 | "good-canonical-relative",
1582 | "good-canonical-relative_alt",
1583 | "good-og-relative_alt",
1584 | ],
1585 | None,
1586 | cached_urlparser_maxitems=1,
1587 | )
1588 | if errors:
1589 | raise ValueError(errors)
1590 | assert isinstance(cm.exception, ValueError)
1591 | assert (
1592 | cm.exception.args[0]
1593 | == "`cached_urlparser_maxitems` requires `cached_urlparser=True`"
1594 | )
1595 |
1596 | def test__CustomParser(self):
1597 | # this should fail
1598 | custom_parser_obj = metadata_parser.UrlParserCacheable()
1599 | custom_parser = custom_parser_obj.urlparse
1600 | with self.assertRaises(ValueError) as cm:
1601 | errors = _docs_test_parser(
1602 | [
1603 | "good-canonical-relative",
1604 | "good-canonical-relative_alt",
1605 | "good-og-relative_alt",
1606 | ],
1607 | custom_parser,
1608 | cached_urlparser_maxitems=1,
1609 | )
1610 | if errors:
1611 | raise ValueError(errors)
1612 | assert isinstance(cm.exception, ValueError)
1613 | assert (
1614 | cm.exception.args[0]
1615 | == "`cached_urlparser_maxitems` requires `cached_urlparser=True`"
1616 | )
1617 |
--------------------------------------------------------------------------------
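
A minimal sketch (not a file in this repository) of the `get_metadata` / `get_metadatas` behavior the assertions above exercise; the HTML snippet and the uppercasing encoder are illustrative stand-ins, not fixtures from the test suite:

    # illustrative sketch -- not part of the repository
    import metadata_parser

    html = (
        '<html><head>'
        '<meta name="description" content="meta.description"/>'
        '<meta property="og:description" content="og description"/>'
        '</head><body></body></html>'
    )
    parsed = metadata_parser.MetadataParser(url=None, html=html)

    # get_metadata returns a single candidate; a list narrows the sections searched
    print(parsed.get_metadata("description", strategy=["meta"]))  # "meta.description"

    # get_metadatas returns every candidate; strategy="all" keys results by section
    print(parsed.get_metadatas("description", strategy="all"))
    # e.g. {"meta": ["meta.description"], "og": ["og description"]}

    # an encoder post-processes each value before it is returned
    def upper(value):
        return value.upper() if isinstance(value, str) else value

    print(parsed.get_metadata("description", strategy=["meta"], encoder=upper))
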
/tests/test_ip_tracking.py:
--------------------------------------------------------------------------------
1 | # stdlib
2 | import unittest
3 |
4 | # local
5 | import metadata_parser
6 |
7 | # ==============================================================================
8 |
9 |
10 | class TestIpLookups(unittest.TestCase):
11 | """"""
12 |
13 | def test_ip_lookup(self):
14 | """
15 | this is using the live internet
16 |
17 | todo: use httpbin
18 | """
19 | url = "https://example.com/"
20 | page = metadata_parser.MetadataParser(url=url)
21 | self.assertTrue(page.peername)
22 |
--------------------------------------------------------------------------------
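
`test_ip_lookup` above relies on the response hook that records the address the socket actually connected to. A minimal sketch of reading it back, assuming network access (the URL is just a placeholder):

    # illustrative sketch -- not part of the repository
    import metadata_parser

    page = metadata_parser.MetadataParser(url="https://example.com/")
    # peer address recorded from the underlying socket, e.g. an (ip, port) pair
    print(page.peername)
    # the same information, read off the response object itself
    print(metadata_parser.get_response_peername(page.response))
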
/tests/test_responses.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # stdlib
4 | import unittest
5 |
6 | # pypi
7 | import requests
8 | import responses
9 |
10 | # local
11 | from metadata_parser import derive_encoding__hook
12 |
13 | # ==============================================================================
14 |
15 |
16 | URLS_HEADER = {
17 | "https://example.com/header=none": (None, "ISO-8859-1", "♥"),
18 | "https://example.com/header=ISO-8859-1": ("ISO-8859-1", "ISO-8859-1", "♥"),
19 | "https://example.com/header=utf-8": ("utf-8", "utf-8", "♥"),
20 | "https://example.com/header=UTF-8": ("UTF-8", "UTF-8", "♥"),
21 | }
22 | URLS_META = {
23 | "https://example.com/content_type=none": (None, "ISO-8859-1", "♥"),
24 | "https://example.com/content_type=ISO-8859-1": (
25 | "ISO-8859-1",
26 | "ISO-8859-1",
27 | "♥",
28 | ),
29 | "https://example.com/content_type=utf-8": ("utf-8", "utf-8", "♥"),
30 | "https://example.com/content_type=UTF-8": ("UTF-8", "UTF-8", "♥"),
31 | }
32 |
33 |
34 | class TestMockedResponse(unittest.TestCase):
35 | def test_simple_encoding_found(self):
36 | """these tests just check to see we derive the right content with `derive_encoding__hook`"""
37 |
38 | requests_session = requests.Session()
39 | requests_session.hooks["response"].append(derive_encoding__hook)
40 |
41 | with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
42 | # track results to this
43 | to_test = {}
44 |
45 | # set up the header tests
46 | for url in URLS_HEADER.keys():
47 | (_header, _expected, _body_char) = URLS_HEADER[url]
48 | _content_type = "text/html"
49 | if _header:
50 | _content_type = "text/html; charset=%s" % _header
51 | _body = "%s" % _body_char
52 | rsps.add(
53 | responses.GET,
54 | url,
55 | body=_body,
56 | status=200,
57 | content_type=_content_type,
58 | )
59 | to_test[url] = (_expected, _body)
60 |
61 | # set up the meta tests
62 | for url in URLS_META.keys():
63 | (_header, _expected, _body_char) = URLS_META[url]
64 | _body = "%s" % _body_char
65 | if _header:
66 | _body = (
67 |                         '<html><head><meta http-equiv="Content-Type" content="text/html; charset=%s"/></head><body>%s</body></html>'
68 | % (_header, _body_char)
69 | )
70 | rsps.add(
71 | responses.GET, url, body=_body, status=200, content_type="text/html"
72 | )
73 | to_test[url] = (_expected, _body)
74 |
75 | for url in to_test:
76 | (_expected, _body) = to_test[url]
77 | r = requests_session.get(url)
78 | self.assertEqual(r.status_code, 200)
79 | self.assertEqual(r.encoding, _expected)
80 | self.assertEqual(r.text, _body)
81 |
--------------------------------------------------------------------------------
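
The mocked responses above verify `derive_encoding__hook`, which sets `response.encoding` from the Content-Type header or, failing that, from a charset declaration found in the body. A minimal sketch of attaching it to a live session, assuming network access (the URL is a placeholder):

    # illustrative sketch -- not part of the repository
    import requests
    from metadata_parser import derive_encoding__hook

    session = requests.Session()
    session.hooks["response"].append(derive_encoding__hook)

    r = session.get("https://example.com/")
    print(r.encoding)   # derived from the header or an in-body charset declaration
    print(r.text[:80])
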
/tests/test_sessions.py:
--------------------------------------------------------------------------------
1 | # stdlib
2 | from typing import Optional
3 | import unittest
4 |
5 | # pypi
6 | from httpbin import app as httpbin_app
7 | import pytest_httpbin.serve
8 | import requests
9 |
10 | # local
11 | import metadata_parser
12 |
13 | # ==============================================================================
14 |
15 |
16 | class SessionRedirect(requests.Session):
17 | num_checked = None
18 |
19 | def get_redirect_target(self, resp):
20 | # previous versions cached this for later use, but now we use a hook
21 | # cached_peername = metadata_parser.get_response_peername(resp)
22 | def _get():
23 | if self.num_checked is None:
24 | self.num_checked = 0
25 | self.num_checked += 1
26 | if resp.is_redirect:
27 | return resp.headers["location"]
28 | if resp.status_code == 200:
29 | # some servers will do a 200 but put a redirect header in there. WTF
30 | dumb_redirect = resp.headers.get("location")
31 | if dumb_redirect:
32 | return dumb_redirect
33 | return None
34 |
35 | # --
36 | if not hasattr(resp, "_redirect_target"):
37 | resp._redirect_target = _get()
38 | return resp._redirect_target
39 |
40 |
41 | class TestSessionsHttpBin(unittest.TestCase):
42 | def setUp(self):
43 | self.httpbin_server = pytest_httpbin.serve.Server(application=httpbin_app)
44 | self.httpbin_server.start()
45 |
46 | def tearDown(self):
47 | self.httpbin_server.stop()
48 | try:
49 | # we're not invoking `pytest_httpbin.serve.Server` in the standard way
50 | # our implementation was copied off another project
51 | # the `_server` is a wsgiref server, and in Py3 simply calling
52 | # `stop()` wil shutdown the server, but it will not `close()` any
53 | # lingering sockets. this explicitly does that.
54 | self.httpbin_server._server.socket.close()
55 | except Exception as exc: # noqa: F841
56 | pass
57 |
58 | def test_no_session(self):
59 | """just checking for args"""
60 | url = self.httpbin_server.url + "/html"
61 | page = metadata_parser.MetadataParser(url=url)
62 | assert page
63 | assert page.url == url
64 |
65 | def test_simple_session(self):
66 | """just checking for args"""
67 | url = self.httpbin_server.url + "/html"
68 | with requests.Session() as s:
69 | page = metadata_parser.MetadataParser(url=url, requests_session=s)
70 | assert page
71 | assert page.url == url
72 |
73 | def test_custom_session(self):
74 | """just checking for a custom session"""
75 | num_redirects = 4
76 | url = self.httpbin_server.url + "/redirect/%s" % num_redirects
77 | with SessionRedirect() as s:
78 | page: Optional[metadata_parser.MetadataParser]
79 | try:
80 | page = metadata_parser.MetadataParser(url=url, requests_session=s)
81 | except metadata_parser.NotParsableJson as e:
82 | page = e.metadataParser
83 | # typing scope
84 | assert page is not None
85 | assert page.response is not None
86 | # we end on get
87 | self.assertEqual(page.response.url, self.httpbin_server.url + "/get")
88 | # the session should have checked the following responses: redirects + final
89 | self.assertEqual(num_redirects + 1, s.num_checked)
90 | self.assertEqual(num_redirects, len(page.response.history))
91 |
92 | # make sure that we tracked the peername. httpbin will encode
93 | self.assertTrue(metadata_parser.get_response_peername(page.response))
94 | for h in page.response.history:
95 | self.assertTrue(metadata_parser.get_response_peername(h))
96 |
--------------------------------------------------------------------------------
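
`SessionRedirect` above shows that a caller-supplied `requests.Session` is used for every hop, and that the peername hook runs on each response in the redirect history. A minimal sketch of handing the parser a pre-configured session, assuming network access (the header value and URL are illustrative):

    # illustrative sketch -- not part of the repository
    import requests
    import metadata_parser

    with requests.Session() as s:
        s.headers.update({"User-Agent": "my-crawler/1.0"})
        page = metadata_parser.MetadataParser(
            url="https://example.com/", requests_session=s
        )
        print(page.response.url)        # final URL after any redirects
        print(page.get_discrete_url())  # canonical/og URL resolved from the document
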
/tests/test_url_parsing.py:
--------------------------------------------------------------------------------
1 | # -*- coding=utf-8 -*-
2 |
3 | # stdlib
4 | import unittest
5 | from urllib.parse import ParseResult
6 | from urllib.parse import ParseResultBytes
7 | from urllib.parse import urlparse
8 |
9 | # local
10 | import metadata_parser
11 |
12 | # ==============================================================================
13 |
14 |
15 | URLS_VALID = [
16 | "http://example.com",
17 | "http://example.com/",
18 | "http://example.com/one",
19 | "http://example.com/one/two.html",
20 | "http://foo.example.com",
21 | "http://example.com:80",
22 | "http://example.com:80/",
23 | "http://example.com:80/one",
24 | "http://example.com:80/one/two.html",
25 | "http://192.168.1.1",
26 | "http://192.168.1.1/",
27 | "http://192.168.1.1:80",
28 | "http://192.168.1.1:8080",
29 | "http://192.168.1.1:80/",
30 | "http://192.168.1.1:8080/",
31 | "http://192.168.1.1:80/a.html",
32 | "http://192.168.1.1:8080/a.html",
33 | "https://example.com",
34 | "https://example.com/",
35 | "https://example.com/one",
36 | "https://example.com/one/two.html",
37 | "https://foo.example.com",
38 | "https://example.com:80",
39 | "https://example.com:80/",
40 | "https://example.com:80/one",
41 | "https://example.com:80/one/two.html",
42 | "https://192.168.1.1",
43 | "https://192.168.1.1/",
44 | "https://192.168.1.1:80",
45 | "https://192.168.1.1:8080",
46 | "https://192.168.1.1:80/",
47 | "https://192.168.1.1:8080/",
48 | "https://192.168.1.1:80/a.html",
49 | "https://192.168.1.1:8080/a.html",
50 | ]
51 |
52 | URLS_VALID_CONDITIONAL = [
53 | "http://localhost",
54 | "http://localhost:80",
55 | "http://localhost:8000",
56 | "http://localhost/foo",
57 | "http://localhost:80/foo",
58 | "http://localhost:8000/foo",
59 | "https://localhost",
60 | "https://localhost:80",
61 | "https://localhost:8000",
62 | "https://localhost/foo",
63 | "https://localhost:80/foo",
64 | "https://localhost:8000/foo",
65 | "http://127.0.0.1",
66 | "http://127.0.0.1:80",
67 | "http://127.0.0.1:8000",
68 | "http://127.0.0.1/foo",
69 | "http://127.0.0.1:80/foo",
70 | "http://127.0.0.1:8000/foo",
71 | "https://127.0.0.1",
72 | "https://127.0.0.1:80",
73 | "https://127.0.0.1:8000",
74 | "https://127.0.0.1/foo",
75 | "https://127.0.0.1:80/foo",
76 | "https://127.0.0.1:8000/foo",
77 | "http://0.0.0.0",
78 | "http://0.0.0.0:80",
79 | "http://0.0.0.0:8000",
80 | "http://0.0.0.0/foo",
81 | "http://0.0.0.0:80/foo",
82 | "http://0.0.0.0:8000/foo",
83 | "https://0.0.0.0",
84 | "https://0.0.0.0:80",
85 | "https://0.0.0.0:8000",
86 | "https://0.0.0.0/foo",
87 | "https://0.0.0.0:80/foo",
88 | "https://0.0.0.0:8000/foo",
89 | ]
90 |
91 | URLS_INVALID = [
92 | "http://example_com",
93 | "http://example_com/",
94 | "http://example_com/one",
95 | "http://999.999.999.999/",
96 | "http://999.999.999.999.999/",
97 | "http://999.999.999.999.999:8080:8080",
98 | "https://example_com",
99 | "https://example_com/",
100 | "https://example_com/one",
101 | "https://999.999.999.999/",
102 | "https://999.999.999.999.999/",
103 | "https://999.999.999.999.999:8080:8080",
104 | ]
105 |
106 |
107 | RFC_REGEX_VALID = [
108 | """http://user:password@one.example.com/foo/bar;one=two&three=four?foo=bar&biz=bash#foo"""
109 | ]
110 |
111 | RFC_REGEX_INVALID = ["""
Then l""", """ccurl" style="display:none;" """] 112 | 113 | 114 | class TestUrlRfcValid(unittest.TestCase): 115 | """ 116 | python -m unittest tests.url_parsing.TestUrlRfcValid 117 | 118 | Ensures URLs contain rfc valid components 119 | """ 120 | 121 | def test_urls_valid(self): 122 | for i in RFC_REGEX_VALID: 123 | matched = metadata_parser.RE_rfc3986_valid_characters.match(i) 124 | self.assertTrue(matched) 125 | 126 | def test_urls_invalid(self): 127 | for i in RFC_REGEX_INVALID: 128 | matched = metadata_parser.RE_rfc3986_valid_characters.match(i) 129 | self.assertTrue(matched is None) 130 | 131 | 132 | class TestUrlParsing(unittest.TestCase): 133 | """ 134 | python -m unittest tests.url_parsing.TestUrls 135 | 136 | Ensures URLs are parsed correctly as valid/invalid 137 | """ 138 | 139 | def test_urls_valid(self): 140 | for i in URLS_VALID: 141 | parsed = urlparse(i) 142 | self.assertTrue(metadata_parser.is_parsed_valid_url(parsed)) 143 | 144 | def test_urls_invalid(self): 145 | for i in URLS_INVALID: 146 | parsed = urlparse(i) 147 | self.assertFalse(metadata_parser.is_parsed_valid_url(parsed)) 148 | 149 | def test_urls_valid_conditional(self): 150 | for i in URLS_VALID_CONDITIONAL: 151 | parsed = urlparse(i) 152 | self.assertFalse( 153 | metadata_parser.is_parsed_valid_url( 154 | parsed, require_public_netloc=True, allow_localhosts=False 155 | ) 156 | ) 157 | self.assertTrue( 158 | metadata_parser.is_parsed_valid_url( 159 | parsed, require_public_netloc=False, allow_localhosts=True 160 | ) 161 | ) 162 | 163 | 164 | class TestAbsoluteUpgrades(unittest.TestCase): 165 | """ 166 | python -m unittest tests.url_parsing.TestAbsoluteUpgrades 167 | 168 | Ensures URLs are parsed correctly as valid/invalid 169 | """ 170 | 171 | def test_none_returns_none(self): 172 | absolute = metadata_parser.url_to_absolute_url(None, url_fallback=None) 173 | self.assertEqual(absolute, None) 174 | 175 | def test_nothing(self): 176 | absolute = metadata_parser.url_to_absolute_url( 177 | "http://example.com", url_fallback="http://example.com" 178 | ) 179 | self.assertEqual(absolute, "http://example.com") 180 | 181 | def test_upgrade(self): 182 | absolute = metadata_parser.url_to_absolute_url( 183 | "a.html", url_fallback="http://example.com" 184 | ) 185 | self.assertEqual(absolute, "http://example.com/a.html") 186 | 187 | def test_fallback(self): 188 | absolute = metadata_parser.url_to_absolute_url( 189 | None, url_fallback="http://example.com" 190 | ) 191 | self.assertEqual(absolute, "http://example.com") 192 | 193 | 194 | class _DocumentCanonicalsMixin(object): 195 | def _MakeOne(self, url): 196 | """generates a canonical document""" 197 | doc_base = """
%(head)s""" 198 | canonical_base = """""" 199 | _canonical_html = canonical_base % {"canonical": url} 200 | _doc_html = doc_base % {"head": _canonical_html} 201 | return _doc_html 202 | 203 | 204 | class TestDocumentCanonicals(unittest.TestCase, _DocumentCanonicalsMixin): 205 | """ 206 | python -m unittest tests.url_parsing.TestDocumentCanonicals 207 | """ 208 | 209 | def test_canonical_simple(self): 210 | """someone did their job""" 211 | url = None 212 | rel_canonical = "https://example.com/canonical" 213 | rel_expected = "https://example.com/canonical" 214 | html_doc = self._MakeOne(rel_canonical) 215 | parsed = metadata_parser.MetadataParser(url=url, html=html_doc) 216 | parsed_url = parsed.get_discrete_url() 217 | self.assertEqual(parsed_url, rel_expected) 218 | 219 | def test_canonical_upgrade(self): 220 | """someone else did their job. not as good, but did their job""" 221 | url = "https://example.com" 222 | rel_canonical = "/canonical" 223 | rel_expected = "https://example.com/canonical" 224 | html_doc = self._MakeOne(rel_canonical) 225 | parsed = metadata_parser.MetadataParser(url=url, html=html_doc) 226 | parsed_url = parsed.get_discrete_url() 227 | self.assertEqual(parsed_url, rel_expected) 228 | 229 | def test_upgrade_invalid_root(self): 230 | """ 231 | you had one job... 232 | """ 233 | url = "https://example.com" 234 | rel_canonical = "http://localhost:8080" 235 | rel_expected = "https://example.com" 236 | html_doc = self._MakeOne(rel_canonical) 237 | parsed = metadata_parser.MetadataParser(url=url, html=html_doc) 238 | parsed_url = parsed.get_discrete_url() 239 | self.assertEqual(parsed_url, rel_expected) 240 | 241 | def test_upgrade_utf8_path(self): 242 | """ 243 | you had one job... but you didn't read the RFC you shitty third rate enterprise cms 244 | """ 245 | url = "https://example.com" 246 | rel_canonical = r"https://example.com/canonical-ü" 247 | rel_expected = r"https://example.com/canonical-%C3%BC" 248 | html_doc = self._MakeOne(rel_canonical) 249 | parsed = metadata_parser.MetadataParser( 250 | url=url, 251 | html=html_doc, 252 | derive_encoding=False, 253 | default_encoding="utf-8", 254 | html_encoding="utf-8", 255 | ) 256 | parsed_url = parsed.get_discrete_url() 257 | self.assertEqual(parsed_url, rel_expected) 258 | 259 | def test_upgrade_invalid_file(self): 260 | """ 261 | you had one job... 262 | if someone lists the canonical as an invalid domain, remount the right domain 263 | 264 | python -m unittest tests.url_parsing.TestDocumentCanonicals.test_upgrade_invalid_file 265 | """ 266 | url = "https://example.com/a" 267 | rel_canonical = "http://localhost:8080" 268 | rel_expected = "https://example.com" 269 | html_doc = self._MakeOne(rel_canonical) 270 | parsed = metadata_parser.MetadataParser(url=url, html=html_doc) 271 | parsed_url = parsed.get_discrete_url() 272 | self.assertEqual(parsed_url, rel_expected) 273 | 274 | def test_upgrade_invalid_file_b(self): 275 | """ 276 | you had one job... 277 | if someone lists the canonical as a different file on an invalid domain, remount the right domain 278 | """ 279 | url = "https://example.com/a" 280 | rel_canonical = "http://localhost:8080/b" 281 | rel_expected = "https://example.com/b" 282 | html_doc = self._MakeOne(rel_canonical) 283 | parsed = metadata_parser.MetadataParser(url=url, html=html_doc) 284 | parsed_url = parsed.get_discrete_url() 285 | self.assertEqual(parsed_url, rel_expected) 286 | 287 | def test_readme_scenario(self): 288 | """ 289 | you had one job... 
290 | if someone lists the canonical as an invalid LOCAL domain, remount the right domain 291 | 292 | python -m unittest tests.url_parsing.TestDocumentCanonicals.test_readme_scenario 293 | """ 294 | url = "https://example.com/a" 295 | rel_canonical = "http://localhost:8000/alt-path/to/foo" 296 | rel_expected = "https://example.com/alt-path/to/foo" 297 | rel_expected_legacy = rel_canonical 298 | html_doc = self._MakeOne(rel_canonical) 299 | parsed = metadata_parser.MetadataParser(url=url, html=html_doc) 300 | 301 | # ensure we replace the bad domain with the right one 302 | parsed_url = parsed.get_discrete_url() 303 | self.assertEqual(parsed_url, rel_expected) 304 | 305 | # ensure support for the legacy behavior... 306 | parsed_url = parsed.get_discrete_url(require_public_global=False) 307 | self.assertEqual(parsed_url, rel_expected_legacy) 308 | 309 | 310 | class TestDocumentCanonicalsRelative(unittest.TestCase, _DocumentCanonicalsMixin): 311 | """ 312 | python -m unittest tests.url_parsing.TestDocumentCanonicalsRelative 313 | python -m unittest tests.url_parsing.TestDocumentCanonicalsRelative.test_upgrade_local_a 314 | python -m unittest tests.url_parsing.TestDocumentCanonicalsRelative.test_upgrade_local_b 315 | """ 316 | 317 | def test_upgrade_local_a(self): 318 | """""" 319 | url = "https://example.com/nested/A.html" 320 | rel_canonical = "/nested/B.html" 321 | rel_expected = "https://example.com/nested/B.html" 322 | html_doc = self._MakeOne(rel_canonical) 323 | parsed = metadata_parser.MetadataParser(url=url, html=html_doc) 324 | parsed_url = parsed.get_discrete_url() 325 | self.assertEqual(parsed_url, rel_expected) 326 | 327 | def test_upgrade_local_b(self): 328 | """""" 329 | url = "https://example.com/nested/A.html" 330 | rel_canonical = "B.html" 331 | rel_expected = "https://example.com/nested/B.html" 332 | html_doc = self._MakeOne(rel_canonical) 333 | parsed = metadata_parser.MetadataParser(url=url, html=html_doc) 334 | parsed_url = parsed.get_discrete_url() 335 | self.assertEqual(parsed_url, rel_expected) 336 | 337 | def test_upgrade_local_bb(self): 338 | """""" 339 | url = "https://example.com/nested/A.html" 340 | rel_canonical = "path/to/B.html" 341 | rel_expected = "https://example.com/nested/path/to/B.html" 342 | html_doc = self._MakeOne(rel_canonical) 343 | parsed = metadata_parser.MetadataParser(url=url, html=html_doc) 344 | parsed_url = parsed.get_discrete_url() 345 | self.assertEqual(parsed_url, rel_expected) 346 | 347 | def test_upgrade_local_c(self): 348 | """""" 349 | url = "https://example.com/nested/A.html" 350 | rel_canonical = "/B.html" 351 | rel_expected = "https://example.com/B.html" 352 | html_doc = self._MakeOne(rel_canonical) 353 | parsed = metadata_parser.MetadataParser(url=url, html=html_doc) 354 | parsed_url = parsed.get_discrete_url() 355 | self.assertEqual(parsed_url, rel_expected) 356 | 357 | def test_noupgrade_a(self): 358 | """ 359 | these tests currently require tldextract; otherwise they won't work right. 
360 | """ 361 | if not metadata_parser.USE_TLDEXTRACT: 362 | raise ValueError("these tests currently require tldextract") 363 | 364 | url = "https://example.com/nested/A.html" 365 | rel_canonical = "https://foo.local/B.html" 366 | rel_expected = None 367 | html_doc = self._MakeOne(rel_canonical) 368 | parsed = metadata_parser.MetadataParser(url=url, html=html_doc) 369 | 370 | parsed_url = parsed.get_url_canonical(require_public_global=True) 371 | self.assertEqual(parsed_url, rel_expected) 372 | 373 | parsed_url = parsed.get_url_opengraph(require_public_global=True) 374 | self.assertEqual(parsed_url, rel_expected) 375 | 376 | parsed_url = parsed.get_url_canonical( 377 | require_public_global=True, url_fallback=url 378 | ) 379 | self.assertEqual(parsed_url, rel_expected) 380 | 381 | parsed_url = parsed.get_url_opengraph( 382 | require_public_global=True, url_fallback=url 383 | ) 384 | self.assertEqual(parsed_url, rel_expected) 385 | 386 | 387 | class TestFixUnicodeUrls(unittest.TestCase): 388 | def test_fix_unicode_path(self): 389 | _test_pairs = ( 390 | ( 391 | "https://example.com/2017/12/abcdefgühijklmnop?a=%20foo", 392 | "https://example.com/2017/12/abcdefg%C3%BChijklmnop?a=%20foo", 393 | ), 394 | ) 395 | for raw, expected in _test_pairs: 396 | cleaned = metadata_parser.fix_unicode_url(raw) 397 | self.assertEqual(cleaned, expected) 398 | 399 | def test_fix_unicode_path_leave_unicode_kwargs(self): 400 | _test_pairs = ( 401 | ( 402 | "https://example.com/2017/12/abcdefgühijklmnop?a=%20foo&b=ü", 403 | "https://example.com/2017/12/abcdefg%C3%BChijklmnop?a=%20foo&b=ü", 404 | ), 405 | ) 406 | for raw, expected in _test_pairs: 407 | cleaned = metadata_parser.fix_unicode_url(raw) 408 | self.assertEqual(cleaned, expected) 409 | 410 | 411 | class TestArgsExceptions(unittest.TestCase, _DocumentCanonicalsMixin): 412 | """ 413 | python -m unittest tests.url_parsing.TestArgsExceptions 414 | """ 415 | 416 | def test_no_args__good(self): 417 | url = "https://example.com/nested/A.html" 418 | rel_canonical = "/B.html" 419 | rel_expected = "https://example.com/B.html" # noqa: F841 420 | html_doc = self._MakeOne(rel_canonical) 421 | parsed = metadata_parser.MetadataParser(url=url, html=html_doc) 422 | parsed_url = parsed.get_discrete_url() # noqa: F841 423 | 424 | def test_og_first__good(self): 425 | url = "https://example.com/nested/A.html" 426 | rel_canonical = "/B.html" 427 | rel_expected = "https://example.com/B.html" # noqa: F841 428 | html_doc = self._MakeOne(rel_canonical) 429 | parsed = metadata_parser.MetadataParser(url=url, html=html_doc) 430 | parsed_url = parsed.get_discrete_url(og_first=True) # noqa: F841 431 | 432 | def test_og_first_canonical_first__bad(self): 433 | url = "https://example.com/nested/A.html" 434 | rel_canonical = "/B.html" 435 | rel_expected = "https://example.com/B.html" # noqa: F841 436 | html_doc = self._MakeOne(rel_canonical) 437 | parsed = metadata_parser.MetadataParser(url=url, html=html_doc) 438 | self.assertRaises( 439 | ValueError, parsed.get_discrete_url, og_first=True, canonical_first=True 440 | ) 441 | 442 | def test_canonical_first__bad(self): 443 | url = "https://example.com/nested/A.html" 444 | rel_canonical = "/B.html" 445 | rel_expected = "https://example.com/B.html" # noqa: F841 446 | html_doc = self._MakeOne(rel_canonical) 447 | parsed = metadata_parser.MetadataParser(url=url, html=html_doc) 448 | self.assertRaises(ValueError, parsed.get_discrete_url, canonical_first=True) 449 | 450 | def test_canonical_first__good(self): 451 | url = 
"https://example.com/nested/A.html" 452 | rel_canonical = "/B.html" 453 | rel_expected = "https://example.com/B.html" # noqa: F841 454 | html_doc = self._MakeOne(rel_canonical) 455 | parsed = metadata_parser.MetadataParser(url=url, html=html_doc) 456 | parsed_url = parsed.get_discrete_url( # noqa: F841 457 | og_first=False, canonical_first=True 458 | ) 459 | 460 | 461 | class TestCommands(unittest.TestCase, _DocumentCanonicalsMixin): 462 | """ 463 | python -m unittest tests.url_parsing.TestCommands 464 | """ 465 | 466 | def test_is_parsed_valid_url__string(self): 467 | url = "https://example.com/A.html" 468 | parsed = urlparse(url) 469 | self.assertIsInstance(parsed, ParseResult) 470 | is_valid = metadata_parser.is_parsed_valid_url(parsed) 471 | self.assertTrue(is_valid) 472 | 473 | def test_is_parsed_valid_url__bytes(self): 474 | url = b"https://example.com/A.html" 475 | parsed = urlparse(url) 476 | self.assertIsInstance(parsed, ParseResultBytes) 477 | is_valid = metadata_parser.is_parsed_valid_url(parsed) 478 | self.assertTrue(is_valid) 479 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = 3 | lint, 4 | mypy, 5 | py37,py38,py39,py310,py311,py312,py313 6 | 7 | [testenv] 8 | commands = 9 | python --version 10 | pytest {posargs:} 11 | extras = 12 | testing 13 | -------------------------------------------------------------------------------- /types.txt: -------------------------------------------------------------------------------- 1 | 2 | types { 3 | text/html html htm shtml; 4 | text/css css; 5 | text/xml xml; 6 | image/gif gif; 7 | image/jpeg jpeg jpg; 8 | application/javascript js; 9 | application/atom+xml atom; 10 | application/rss+xml rss; 11 | 12 | text/mathml mml; 13 | text/plain txt; 14 | text/vnd.sun.j2me.app-descriptor jad; 15 | text/vnd.wap.wml wml; 16 | text/x-component htc; 17 | 18 | image/png png; 19 | image/tiff tif tiff; 20 | image/vnd.wap.wbmp wbmp; 21 | image/x-icon ico; 22 | image/x-jng jng; 23 | image/x-ms-bmp bmp; 24 | image/svg+xml svg svgz; 25 | image/webp webp; 26 | 27 | application/font-woff woff; 28 | application/java-archive jar war ear; 29 | application/json json; 30 | application/mac-binhex40 hqx; 31 | application/msword doc; 32 | application/pdf pdf; 33 | application/postscript ps eps ai; 34 | application/rtf rtf; 35 | application/vnd.ms-excel xls; 36 | application/vnd.ms-fontobject eot; 37 | application/vnd.ms-powerpoint ppt; 38 | application/vnd.wap.wmlc wmlc; 39 | application/vnd.google-earth.kml+xml kml; 40 | application/vnd.google-earth.kmz kmz; 41 | application/x-7z-compressed 7z; 42 | application/x-cocoa cco; 43 | application/x-java-archive-diff jardiff; 44 | application/x-java-jnlp-file jnlp; 45 | application/x-makeself run; 46 | application/x-perl pl pm; 47 | application/x-pilot prc pdb; 48 | application/x-rar-compressed rar; 49 | application/x-redhat-package-manager rpm; 50 | application/x-sea sea; 51 | application/x-shockwave-flash swf; 52 | application/x-stuffit sit; 53 | application/x-tcl tcl tk; 54 | application/x-x509-ca-cert der pem crt; 55 | application/x-xpinstall xpi; 56 | application/xhtml+xml xhtml; 57 | application/zip zip; 58 | 59 | application/octet-stream bin exe dll; 60 | application/octet-stream deb; 61 | application/octet-stream dmg; 62 | application/octet-stream iso img; 63 | application/octet-stream msi msp msm; 64 | 65 | 
application/vnd.openxmlformats-officedocument.wordprocessingml.document docx; 66 | application/vnd.openxmlformats-officedocument.spreadsheetml.sheet xlsx; 67 | application/vnd.openxmlformats-officedocument.presentationml.presentation pptx; 68 | 69 | audio/midi mid midi kar; 70 | audio/mpeg mp3; 71 | audio/ogg ogg; 72 | audio/x-m4a m4a; 73 | audio/x-realaudio ra; 74 | 75 | video/3gpp 3gpp 3gp; 76 | video/mp4 mp4; 77 | video/mpeg mpeg mpg; 78 | video/quicktime mov; 79 | video/webm webm; 80 | video/x-flv flv; 81 | video/x-m4v m4v; 82 | video/x-mng mng; 83 | video/x-ms-asf asx asf; 84 | video/x-ms-wmv wmv; 85 | video/x-msvideo avi; 86 | } 87 | --------------------------------------------------------------------------------