├── .github
│   └── workflows
│       └── python-package.yml
├── .gitignore
├── .pre-commit-config.yaml
├── CHANGELOG.txt
├── LICENSE.txt
├── MANIFEST.in
├── README.rst
├── TODO.txt
├── mypy.ini
├── pyproject.toml
├── pytest.ini
├── setup.cfg
├── setup.py
├── src
│   └── metadata_parser
│       ├── __init__.py
│       ├── config.py
│       ├── exceptions.py
│       ├── py.typed
│       ├── regex.py
│       ├── requests_extensions.py
│       ├── typing.py
│       └── utils.py
├── tests
│   ├── __init__.py
│   ├── html_scaffolds
│   │   ├── charset_a.html
│   │   ├── charset_b.html
│   │   ├── charset_c.html
│   │   ├── duplicates.html
│   │   └── simple.html
│   ├── test_document_parsing.py
│   ├── test_ip_tracking.py
│   ├── test_responses.py
│   ├── test_sessions.py
│   └── test_url_parsing.py
├── tox.ini
└── types.txt
/.github/workflows/python-package.yml:
--------------------------------------------------------------------------------
1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
3 |
4 | name: Python package
5 |
6 | on:
7 | push:
8 | branches:
9 | - main
10 | - "branch-*"
11 | pull_request:
12 | branches:
13 | - main
14 | - "branch-*"
15 |
16 | jobs:
17 | build:
18 | runs-on: ${{ matrix.os }}
19 | strategy:
20 | matrix:
21 | os:
22 | - "ubuntu-22.04"
23 | python-version:
24 | - "3.7"
25 | - "3.8"
26 | - "3.9"
27 | - "3.10"
28 | - "3.11"
29 | - "3.12"
30 | - "3.13"
31 | steps:
32 | - uses: actions/checkout@v3
33 | - name: Set up Python ${{ matrix.python-version }}
34 | uses: actions/setup-python@v4
35 | with:
36 | python-version: ${{ matrix.python-version }}
37 | - name: Install dependencies
38 | run: |
39 | python -m pip install --upgrade pip
40 | pip install --upgrade tox setuptools flake8 pytest
41 | pip list
42 | - name: Test with pytest
43 | run: |
44 | tox -e py -- ${{ matrix.pytest-args }}
45 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | tests/private/*
2 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | # See https://pre-commit.com for more information
2 | # See https://pre-commit.com/hooks.html for more hooks
3 | repos:
4 | - repo: https://github.com/psf/black
5 | rev: 24.8.0
6 | hooks:
7 | - id: black
8 | - repo: https://github.com/pycqa/flake8
9 | rev: 7.1.1
10 | hooks:
11 | - id: flake8
--------------------------------------------------------------------------------
/CHANGELOG.txt:
--------------------------------------------------------------------------------
1 | 1.0.0 (unreleased)
2 |
3 | IMPORTANT
4 |
5 | This release has many breaking changes.
6 |
7 | Deprecated legacy code was removed.
8 |
9 | Work has been done to make the API more consistent.
10 |
11 | Several long-standing bugs and inconsistencies were fixed.
12 |
13 |
14 | Backwards Incompatible Changes:
15 |
16 | Remove Deprecated Functions:
17 | ``MetadataParser.get_metadata``
18 | ``MetadataParser.get_metadatas``
19 | ``MetadataParser.is_opengraph_minimum``
20 | ``MetadataParser.metadata``
21 | ``MetadataParser.metadata_encoding``
22 | ``MetadataParser.metadata_version``
23 | ``MetadataParser.soup``
24 | ``ParsedResult.get_metadata``
25 |
26 | Remove Deprecated Functionality:
27 | ``MetadataParser.__init__::cached_urlparser``
28 | no longer accepts `int` to control `cached_urlparser_maxitems`
29 |
30 | Encoder changes
31 | affected functions:
32 | ``decode_html``
33 | ``encode_ascii``
34 | ``ParsedResult.default_encoder``
35 | ``ParsedResult.get_metadatas::encoder``
36 | ``MetadataParser.__init__::default_encoder``
37 | previously, encoders accepted one argument, which was documented to
38 | be a string. This caused issues with Dublin Core elements, as
39 | that storage uses a dict. The new behavior is to send a first
40 | raw input value that may be either a dict or a string, and a
41 | second value that is a string identifying the storage type.
42 | now they accept two arguments:
43 | Arg 1 is a string or dict
44 | Arg 2 is an optional string, identifying the strategy/store
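
        For illustration, a minimal encoder compliant with the new
        two-argument signature (a hypothetical sketch; the package's
        shipped encoders live in ``metadata_parser.utils``):

            def tidy_encoder(raw, strategy=None):
                # `raw` is a str, or a dict when the value came from a
                # dict-based store such as Dublin Core
                if isinstance(raw, dict):
                    return {k: v.strip() for k, v in raw.items()}
                return raw.strip()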
45 |
46 | API Changes
47 | The package was split into namespaces.
48 | ``MetadataParser.__init__`` now validates submitted `strategy` args
49 |
50 | ``MetadataParser.strategy`` now defaults to: `["meta", "page", "og", "dc", "twitter"]`
51 | previously this was: `["og", "dc", "meta", "page", "twitter"]`
52 |
53 | ``ParsedResult.get_metadatas`` will now return a dict or None.
54 | A bug was discovered in which it would return only the first matched
55 | element when there were multiple options
56 |
57 | An invalid strategy will now raise `InvalidStrategy`, a subclass of `ValueError`
58 |
59 | `InvalidDocument` no longer has a .message attribute
60 |
61 | Exceptions now invoke `super().__init__(args)`
62 |
63 | New Functionality
64 |
65 | ``ParsedResult.select_first_match(field, strategy)``
66 | will return the first match for the given, or default, strategy
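
        A usage sketch, based on the description above (the field and
        strategy values are illustrative; keyword use of `strategy` is
        assumed):

            import metadata_parser

            parser = metadata_parser.MetadataParser(
                url=None,
                html="<html><head><title>Example</title></head></html>",
            )
            # first match across the default strategy order:
            title = parser.parsed_result.select_first_match("title")
            # first match against an explicit strategy list:
            title = parser.parsed_result.select_first_match(
                "title", strategy=["page", "meta"]
            )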
67 |
68 |
69 |
70 | 0.13.1
71 | * guard against incorrect warnings; see Issue#52
72 | * add support for branches in github actions
73 |
74 | 0.13.0
75 | * drop py36; no testing options due to github's deprecation of ubuntu-20.04
76 | * `_coerce_validate_strategy` (invoked by `get_metadatas`) will now raise a
77 | ValueError if a string other than "all" is submitted. The only valid
78 | string is "all"; otherwise a list of strings - excluding "all" - must be
79 | submitted. Warnings of this have been emitted for several years.
80 | * __init__(`search_head_only`) now defaults to False
81 | * `UrlParserCacheable` has been extended to accept a `urlparser` argument.
82 | This defaults to `urlparse` and expects the same signature.
83 | * __init__(`cached_urlparser`) has new deprecations to standardize the API (see the sketch below)
84 | submitting an Int to set max_items is deprecated; instead:
85 | cached_urlparser=True
86 | cached_urlparser_maxitems=int
87 | submitting 0 is deprecated; instead:
88 | cached_urlparser=False
89 | or
90 | cached_urlparser_maxitems=0
91 | cached_urlparser=False
92 | * __init__(`cached_urlparser_maxitems`) has been added
93 | * the next release is likely to be 1.0
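
        An illustrative sketch of the non-deprecated spellings (the url,
        html, and cache size are placeholder values):

            import metadata_parser

            # cached url parsing with an explicit cache size:
            parser = metadata_parser.MetadataParser(
                url=None,
                html="<html></html>",
                cached_urlparser=True,
                cached_urlparser_maxitems=100,
            )

            # no url-parse caching at all:
            parser = metadata_parser.MetadataParser(
                url=None,
                html="<html></html>",
                cached_urlparser=False,
            )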
94 |
95 | 0.12.3
96 | * pin "BeautifulSoup4<4.15.0"
97 | * See `https://git.launchpad.net/beautifulsoup/tree/CHANGELOG`
98 | > 4.13.0 (20250202)
99 | > These things now give DeprecationWarnings when you try to use them,
100 | > and are scheduled to be removed in Beautiful Soup 4.15.0.
101 | * fixes #47
102 |
103 | 0.12.2
104 | * Support Python 3.13 via `legacy-cgi` package.
105 | Thank you, https://github.com/Dryusdan.
106 | See:
107 | https://github.com/jvanasco/metadata_parser/pull/44
108 | https://github.com/jvanasco/metadata_parser/issues/43
109 | * updated pre-commit-config
110 |
111 | 0.12.1
112 | * typing
113 | * added `METADATA_PARSER_FUTURE` environment variable
114 | `export METADATA_PARSER_FUTURE=1` to enable
115 | * is_parsed_valid_url can accept a ParseResultBytes object now
116 |
117 | 0.12.0
118 | * drop python 2.7
119 | * initial typing support
120 |
121 | 0.11.0 | UNRELEASED
122 |
123 | * BREAKING CHANGES
124 | Due to the following breaking changes, the version was bumped to 0.11.0
125 | * `MetadataParser.fetch_url` now returns a third item.
126 |
127 | * COMPATIBLE CHANGES
128 | The following changes are backwards compatible to the 0.10.x releases
129 | * a test-suite for an application leveraging `metadata_parser` experienced
130 | some issues due to changes in the Responses package used to mock tests.
131 | to better insulate against that, a change was made:
132 |
133 | MetadataParser now has 2 subclassable attributes for items that should
134 | or should not be parsed:
135 |
136 | + _content_types_parse = ("text/html",)
137 | + _content_types_noparse = ("application/json",)
138 |
139 | Previously, these values were hardcoded into the logic.
140 | * some error log messages were reformatted for clarity
141 | * some error log messages were incorrectly reformatted by black
142 | * added logging for NotParseable situations involving redirects
143 | * added a `.response` attribute to NotParsable errors to help debug
144 | redirects
145 | * added a new ResponseHistory class to track redirects
146 | * it is computed and returned during `MetadataParser.fetch_url`
147 | * `MetadataParser.parse()` optionally accepts it, and will stash
148 | it into ParsedResult
149 | * `ParsedResult`
150 | * ResponseHistory is not stashed in the metadata dict, but in a new namespace
151 | * `.response_history` will either be `ResponseHistory` or None
152 | * improving docstrings
153 | * added `decode_html` helper
154 | * extended MetadataParser to allow registration of a default_encoder for results
155 | * style cleanup
156 |
157 | 0.10.5
158 | packaging fixes
159 | migrated 'types.txt' out of distribution; it remains in github source
160 | updated some log lines with the url
161 | introduced some new log lines
162 | added `METADATA_PARSER__DISABLE_TLDEXTRACT` env
163 | merged, but reverted PR#34 which addresses Issue#32
164 |
165 |
166 | 0.10.4
167 | * black via pre-commit
168 | * upgraded black; 20.8b1
169 | * integrated with pre-commit
170 | * github actions and tox
171 | * several test files were not in git!
172 |
173 | 0.10.3
174 | updated docs on bad data
175 | black formatting
176 | added pyproject.toml
177 | moved BeautifulSoup generation into its own method, so anyone can subclass to customize
178 | :fixes: https://github.com/jvanasco/metadata_parser/issues/25
179 | some internal variable changes thanks to flake8
180 |
181 | 0.10.2
182 | added some docs on encoding
183 |
184 | 0.10.1
185 | clarifying some inline docs
186 | BREAKING CHANGE: `fetch_url` now returns a tuple of `(html, encoding)`
187 | now tracking in ParsedResult: encoding
188 | ParsedResult.metadata['_internal']['encoding'] = resp.encoding.lower() if resp.encoding else None
189 | `.parse` now accepts `html_encoding`
190 | refactored url fetching to use context managers
191 | refactored url fetching to only insert our hooks when needed
192 | adjusted test harness to close socket connections
193 |
194 | 0.10.0
195 | better Python3 support by using the six library
196 |
197 | 0.9.23
198 | added tests for url entities
199 | better grabbing of the charset
200 | better grabbing of some edge cases
201 |
202 | 0.9.22
203 | removed internal calls to the deprecated `get_metadata`, replacing them with `get_metadatas`.
204 | this will avoid emitting a deprecation warning, allowing users to migrate more easily
205 |
206 | 0.9.21
207 | * requests_toolbelt is now required
208 | ** this is to solve PR#16 / Issue#21
209 | ** the toolbelt and built-in versions of get_encodings_from_content required different workarounds
210 | * the output of urlparse is now cached onto the parser instance.
211 | ** perhaps this will be a global cache in the future
212 | * MetadataParser now accepts `cached_urlparser`
213 | ** default: True
214 | options: True: use an instance of UrlParserCacheable(maxitems=30)
215 | : INT: use an instance of UrlParserCacheable(maxitems=cached_urlparser)
216 | : None/False/0 - use native urlparse
217 | : other truthy values - use as a custom urlparse
218 |
219 | * addressing issue #17 (https://github.com/jvanasco/metadata_parser/issues/17) where `get_link_` logic does not handle schemeless urls.
220 | ** `MetadataParser.get_metadata_link` will now try to upgrade schemeless links (e.g. urls that start with "//")
221 | ** `MetadataParser.get_metadata_link` will now check values against `FIELDS_REQUIRE_HTTPS` in certain situations to see if the value is valid for http
222 | ** `MetadataParser.schemeless_fields_upgradeable` is a tuple of the fields which can be upgraded. this defaults to a package definition, but can be changed on a per-parser basis.
223 | The defaults are:
224 | 'image',
225 | 'og:image', 'og:image:url', 'og:audio', 'og:video',
226 | 'og:image:secure_url', 'og:audio:secure_url', 'og:video:secure_url',
227 | ** `MetadataParser.schemeless_fields_disallow` is a tuple of the fields which can not be upgraded. this defaults to a package definition, but can be changed on a per-parser basis.
228 | The defaults are:
229 | 'canonical',
230 | 'og:url',
231 | ** `MetadataParser.get_url_scheme()` is a new method to expose the scheme of the active url
232 | ** `MetadataParser.upgrade_schemeless_url()` is a new method to upgrade schemeless links (see the sketch below)
233 | it accepts two arguments: url and field (optional)
234 | if present, the field is checked against the package tuple FIELDS_REQUIRE_HTTPS to see if the value is valid for http. The defaults are:
235 | 'og:image:secure_url',
236 | 'og:audio:secure_url',
237 | 'og:video:secure_url',
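
        A sketch of the upgrade behavior described above (urls are
        illustrative; keyword use of `field` is assumed):

            import metadata_parser

            parser = metadata_parser.MetadataParser(
                url="https://example.com",
                html="<html></html>",
            )
            # a schemeless url in an upgradeable field should inherit the
            # scheme of the active url, e.g. "https://example.com/img.gif"
            upgraded = parser.upgrade_schemeless_url(
                "//example.com/img.gif", field="og:image"
            )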
238 |
239 | 0.9.20
240 | * support for deprecated `twitter:label` and `twitter:data` metatags, which use "value" instead of "content".
241 | * new param to `__init__` and `parse`: `support_malformed` (default `None`).
242 | if true, will support malformed parsing (such as consulting "value" instead of "content").
243 | functionality extended from PR #13 (https://github.com/jvanasco/metadata_parser/pull/13) from https://github.com/amensouissi
244 |
245 | 0.9.19
246 | * addressing https://github.com/jvanasco/metadata_parser/issues/12
247 | on pages with duplicate metadata keys, additional elements were ignored;
248 | when parsing the document, duplicate data was not kept.
249 | * `MetadataParser.get_metadata` will always return a single string (or None)
250 | * `MetadataParser.get_metadatas` has been introduced. this will always return an array.
251 | * the internal parsed_metadata store will now store data in a mix of arrays and strings, keeping it backwards compatible
252 | * This new version benches slightly slower because of the mixed format, but maintains a smaller footprint.
253 | * the parsed result now contains a version record for tracking the format `_v`.
254 | * standardized single/double quoting
255 | * cleaned up some lines
256 | * the library will try to coerce strategy= arguments into the right format
257 | * when getting dublin core data, the result could be either a string or a dict. there's no good way to handle this.
258 | * added tests for encoders
259 | * greatly expanded tests
260 |
261 | 0.9.18
262 | * removed a stray debug line
263 |
264 | 0.9.17
265 | * added `retry_dropped_without_headers` option
266 |
267 | 0.9.16
268 | * added `fix_unicode_url()`
269 | * Added `allow_unicode_url` (default True) to the following calls:
270 | `MetadataParser.get_url_canonical`
271 | `MetadataParser.get_url_opengraph`
272 | `MetadataParser.get_discrete_url`
273 | This functionality will try to recode canonical urls with unicode data into percent-encoded streams
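
        A sketch of the new kwarg (url illustrative; the exact recoding
        trigger follows the description above):

            import metadata_parser

            parser = metadata_parser.MetadataParser(
                url="http://example.com",
                html="<html></html>",
            )
            # `allow_unicode_url` controls whether canonical urls containing
            # unicode data are recoded into percent-encoded form:
            url = parser.get_url_canonical(allow_unicode_url=True)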
274 |
275 | 0.9.15
276 | * Python3 support returned
277 |
278 | 0.9.14
279 | * added some more tests to ensure encoding detected correctly
280 | * stash the soup sooner when parsing, to aid in debugging
281 |
282 | 0.9.13
283 | * doing some work to guess encoding...
284 | * internal: now using `resp` instead of `r`, it is easier for pdb debugging
285 | * the peername check was changed to be a hook, so it can be processed more immediately
286 | * the custom session redirect test was altered
287 | * changed the DummyResponse encoding fallback to `ENCODING_FALLBACK`, which is Latin-1 (not utf-8)
288 | this is somewhat backwards-incompatible for this library, but maintains compatibility with the underlying `requests` library
289 |
290 | 0.9.12
291 | * added more attributes to DummyResponse:
292 | ** `content`
293 | ** `headers`
294 |
295 | 0.9.11
296 | * some changes to how we handle upgrading bad canonicals
297 | upgrades will no longer happen IF they specify a bad domain.
298 | upgrades from localhost will still transfer over
299 |
300 | 0.9.10
301 | * slight reorder internally of TLD extract support
302 |
303 | 0.9.9
304 | * inspecting `requests` errors for a response and using it if possible
305 | * this will now try to validate urls if the `tldextract` library is present.
306 | this feature can be disabled with a global toggle
307 |
308 | import metadata_parser
309 | metadata_parser.USE_TLDEXTRACT = False
310 |
311 | 0.9.8
312 | * changed some internal variable names to better clarify difference between a hostname and netloc
313 |
314 | 0.9.7
315 | updated the following functions to test for RFC-valid characters in the url string
316 | some websites, even BIG PROFESSIONAL ONES, will put html in here.
317 | idiots? amateurs? lazy? doesn't matter, they're now our problem. well, not anymore.
318 | * get_url_canonical
319 | * get_url_opengraph
320 | * get_metadata_link
321 |
322 | 0.9.6
323 | this is being held for an update to the `requests` library
324 | * made the following arguments to `MetadataParser.fetch_url()` default to None - which will then default to the class setting. they are all passed-through to `requests.get`
325 | ** `ssl_verify`
326 | ** `allow_redirects`
327 | ** `requests_timeout`
328 | * removed `force_parse` kwarg from `MetadataParser.parse`
329 | * added 'metadata_parser.RedirectDetected' class. if allow_redirects is False, a detected redirect will raise this.
330 | * added 'metadata_parser.NotParsableRedirect' class. if allow_redirects is False, a detected redirect missing a Location header will raise this.
331 | * added `requests_session` argument to `MetadataParser`
332 | * starting to use httpbin for some tests
333 | * detecting JSON documents
334 | * extended NotParsable exceptions with the MetadataParser instance as `metadataParser`
335 | * added `only_parse_http_ok` which defaults to True (legacy). submitting False will allow non-http200 responses to be parsed.
336 | * shuffled `fetch_url` logic around. it will now process more data before a potential error.
337 | * working on support for custom request sessions that can better handle redirects (requires patch or future version of requests)
338 | * caching the peername onto the response object as `_mp_peername` [ _m(etadata)p(arser)_peername ]. this will allow it to be calculated in a redirect session hook. (see tests/sessions.py)
339 | * added `defer_fetch` argument to `MetadataParser.__init__`, default ``False``. If ``True``, this will overwrite the instance's `deferred_fetch` method to actually fetch the url. this strategy allows for the `page` to be defined and response history caught. Under this situation, a 301 redirecting to a 500 can be observed; in the previous versions only the 500 would be caught.
340 | * starting to encapsulate everything into a "parsed result" class
341 | * fixed opengraph minimum check
342 | * added `MetadataParser.is_redirect_unique`
343 | * added `DummyResponse.history`
344 |
345 | 0.9.5
346 | * failing to load a document into BeautifulSoup will now catch the BS error and raise NotParsable
347 |
348 | 0.9.4
349 | * created `MetadataParser.get_url_canonical`
350 | * created `MetadataParser.get_url_opengraph`
351 | * `MetadataParser.get_discrete_url` now calls `get_url_canonical` and `get_url_opengraph`
352 |
353 | 0.9.3
354 | * fixed packaging error. removed debug "print" statements
355 |
356 | 0.9.2
357 | * upgrade nested local canonical rels correctly
358 |
359 | 0.9.1
360 | * added a new `_internal` storage namespace to the `MetadataParser.metadata` payload.
361 | this simply stashes the `MetadataParser.url` and `MetadataParser.url_actual` attributes to make objects easier to encode for debugging
362 | * the twitter parsing was incorrectly looking for 'value' instead of 'content', which the current spec requires
363 | * tracking the shortlink on a page
364 |
365 | 0.9.0
366 | - This has a default behavior change regarding `get_discrete_url()` .
367 | - `is_parsed_valid_url()` did not correctly handle `require_public_netloc=True`, and would allow for `localhost` values to pass
368 | - new kwarg `allow_localhosts` added to the following (see the sketch below):
369 | * is_parsed_valid_url
370 | * is_url_valid
371 | * url_to_absolute_url
372 | * MetadataParser.__init__
373 | * MetadataParser.absolute_url
374 | * MetadataParser.get_discrete_url
375 | * MetadataParser.get_metadata_link
376 | - new method `get_fallback_url`
377 | - `url_to_absolute_url` will return `None` if not supplied with a fallback and test url. Previously an error in parsing would occur
378 | - `url_to_absolute_url` tries to do a better job at determining the intended url when given a malformed url.
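
        An illustrative sketch of the `allow_localhosts` kwarg described
        above (values are placeholders):

            import metadata_parser

            parser = metadata_parser.MetadataParser(
                url=None,
                html="<html></html>",
                allow_localhosts=True,
            )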
379 |
380 | 0.8.3
381 | - packaging fixes
382 |
383 | 0.8.2
384 | - incorporated fix in https://github.com/jvanasco/metadata_parser/pull/10 to handle windows support of socket objects
385 | - cleaned up some tests
386 | - added `encode_ascii` helper
387 | - added git-ignored `tests/private` directory for non-public tests
388 | - added an `encoder` argument to `get_metadata` for encoding values
389 |
390 | 0.8.1
391 | added 2 new properties to a computed MetadataParser object:
392 | is_redirect = None
393 | is_redirect_same_host = None
394 | in the case of redirects, we only have the peername available for the final URL (not the source)
395 | if a response is a redirect, it may not be for the same host -- and the peername would correspond to the destination URL -- not the origin
396 |
397 | 0.8.0
398 | this bump introduces 2 new arguments and some changed behavior (see the sketch below):
399 |
400 | - `search_head_only=None`. previously the meta/og/etc data was only searched in the document head (where expected as per HTML specs).
401 | after indexing millions of pages, many appeared to implement this incorrectly or have html that is so off specification that
402 | parsing libraries can't correctly read it (for example, Twitter.com).
403 | This is currently implemented to default from None to True, but future versions will default to `False`.
404 | This is marked for a future default of `search_head_only=False`
405 |
406 | - `raise_on_invalid`. default False. If True, this will raise a new exception: InvalidDocument if the response
407 | does not look like a proper html document
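
        An illustrative sketch of the two new arguments (values are
        placeholders):

            import metadata_parser

            parser = metadata_parser.MetadataParser(
                url=None,
                html="<html><head></head><body></body></html>",
                search_head_only=False,  # search the entire document
                raise_on_invalid=True,   # raise InvalidDocument on non-html
            )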
408 |
409 |
410 |
411 | 0.7.4
412 | - more aggressive attempts to get the peername.
413 |
414 | 0.7.3
415 | - this will now try to cache the `peername` of the request (ie, the remote server) onto the peername attribute
416 |
417 | 0.7.2
418 | - applying a `strip()` to the "title". bad authors/CMSes often leave stray whitespace.
419 |
420 | 0.7.1
421 | - added kwargs to docstrings
422 | - `get_metadata_link` behavior has been changed as follows:
423 | * if an encoded uri is present (starts with `data:image/`)
424 | ** this will return None by default
425 | ** if a kwarg of `allow_encoded_uri=True` is submitted, will return the encoded url (without a url prefix)
426 |
427 | 0.7.0
428 | - merged https://github.com/jvanasco/metadata_parser/pull/9 from xethorn
429 | - nested all calls to `log` under `__debug__` to avoid them in production when PYTHONOPTIMIZE is set
430 |
431 | 0.6.18
432 | - migrated version string into __init__.py
433 |
434 | 0.6.17
435 | - added a new `DummyResponse` class to mimic popular attributes of a `requests.response` object when parsing from HTML files
436 |
437 | 0.6.16
438 | - incorporated pull8 (https://github.com/jvanasco/metadata_parser/pull/8) which fixes issue5 (https://github.com/jvanasco/metadata_parser/issues/5) with comments
439 |
440 | 0.6.15
441 | - fixed README which used old api in the example
442 |
443 | 0.6.14
444 | - there was a typo and another bug in BeautifulSoup parsing that slipped past some tests; they have been fixed. todo- migrate tests to public repo
445 |
446 | 0.6.13
447 | - trying to integrate a "safe read"
448 |
449 | 0.6.12
450 | - now passing "stream=True" to requests.get. this will fetch the headers first, before looping through the response. we can avoid many issues with this approach
451 |
452 | 0.6.11
453 | - now correctly validating urls with ports. had to restructure a lot of the url validation
454 |
455 | 0.6.10
456 | - changed how some nodes are inspected. this should lead to fewer errors
457 |
458 | 0.6.9
459 | - added a new method `get_metadata_link()`, which applies link transformations to a metadata value in an attempt to ensure a valid link
460 |
461 | 0.6.8
462 | - added a kwarg `requests_timeout` to proxy a timeout value to `requests.get()`
463 |
464 | 0.6.7
465 | - added a lockdown to `is_parsed_valid_url` titled `http_only` -- requires http/https for the scheme
466 |
467 | 0.6.6
468 | - protecting against bad doctypes, like nasa.gov
469 | -- added `force_doctype` to __init__. this will change the doctype to get around bs4/lxml issues
470 | -- this is defaulted to False.
471 |
472 | 0.6.5
473 | - keeping the parsed BS4 document; a user may wish to perform further operations on it.
474 | -- `MetadataParser.soup` attribute holds BS4 document
475 |
476 | 0.6.4
477 | - flake8 fixes. purely cosmetic.
478 |
479 | 0.6.3
480 | - no changes. `sdist upload` was picking up a reference file that wasn't in github; that file killed the distribution install
481 |
482 | 0.6.2
483 | - formatting fixes via flake8
484 |
485 | 0.6.1
486 | - Lightweight, but functional, url validation
487 | -- new 'init' argument (defaults to True) : `require_public_netloc`
488 | -- this will ensure a url's hostname/netloc is either an IPV4 or "public DNS" name
489 | -- if the url is entirely numeric, requires it to be IPV4
490 | -- if the url is alphanumeric, requires a TLD + Domain ( exception is "localhost" )
491 | -- this is NOT RFC compliant, but designed for "Real Life" use cases.
492 |
493 | 0.6.0
494 | - Several fixes to improve support of canonical and absolute urls
495 | -- replaced REGEX parsing of urls with `urlparse` parsing and inspection; too many edge cases crept in
496 | -- refactored `MetadataParser.absolute_url`, now proxies a call to the new function `url_to_absolute_url`
497 | -- refactored `MetadataParser.get_discrete_url`, now cleaner and leaner.
498 | -- refactored how some tests run, so there is cleaner output
499 |
500 |
501 | 0.5.8
502 | - trying to fix some issues with distribution
503 |
504 | 0.5.7
505 | - trying to parse unparsable pages was creating an error
506 | -- `MetadataParser.__init__` now accepts `only_parse_file_extensions` -- a list of the only file extensions to parse
507 | -- `MetadataParser.__init__` now accepts `force_parse_invalid_content_type` -- forces parsing of invalid content
508 | -- `MetadataParser.fetch_url` will only parse "text/html" content by default
509 |
510 | 0.5.6
511 | - trying to ensure we return a valid url in get_discrete_url()
512 | - adding in some proper unit tests; migrating from the private demos slowly (the private demos hit a lot of internal files and public urls; it wouldn't be proper to make these public)
513 | - setting `self.url_actual = url` on __init__. this will get overridden on a `fetch`, but allows for a fallback on html docs passed through
514 |
515 |
516 | 0.5.5
517 | - Dropped BS3 support
518 | - test Python3 support ( support added by Paul Bonser [ https://github.com/pib ] )
519 |
520 |
521 | 0.5.4
522 | - Pull Request - https://github.com/jvanasco/metadata_parser/pull/1
523 | Credit to Paul Bonser [ https://github.com/pib ]
524 |
525 | 0.5.3
526 | - added a few `.strip()` calls to clean up metadata values
527 |
528 | 0.5.2
529 | - fixed an issue on html title parsing. the old method incorrectly regexed on a BS4 tag, not tag contents, creating character encoding issues.
530 |
531 | 0.5.1
532 | - missed the ssl_verify command
533 |
534 | 0.5.0
535 | - migrated to the requests library
536 |
537 | 0.4.13
538 | - trapping all errors in httplib and urllib2; raising them as NotParsable and sticking the original error into the `raised` attribute.
539 | this will allow for cleaner error handling
540 | - we *really* need to move to requests.py
541 |
542 | 0.4.12
543 | - created a workaround for sharethis hashbang urls, which urllib2 doesn't like
544 | - we need to move to requests.py
545 |
546 | 0.4.11
547 | - added more relaxed controls for parsing safe files
548 |
549 | 0.4.10
550 | - fixed force_parse arg on init
551 | - added support for more filetypes
552 |
553 | 0.4.9
554 | - support for gzip documents that pad with extra data ( spec allows, python module doesn't )
555 | - ensure proper document format
556 |
557 | 0.4.8
558 | - added support for twitter's own og style markup
559 | - cleaned up the beautifulsoup finds for og data
560 | - moved 'try' from encapsulating 'for' blocks to encapsulating the inner loop. this will pull more data out if an error occurs.
561 |
562 | 0.4.7
563 | - cleaned up some code
564 |
565 | 0.4.6
566 | - realized that some servers return gzip content even though this client did not advertise that it accepts gzip; fixed by using some ideas from mark pilgrim's feedparser. metadata_parser now advertises gzip and zlib support, and processes the content as needed
567 |
568 | 0.4.5
569 | - fixed a bug that prevented toplevel directories from being parsed
570 |
571 | 0.4.4
572 | - made redirect/masked/shortened links have better dereferenced url support
573 |
574 | 0.4.2
575 | - Wrapped title tag traversal with an AttributeError try block
576 | - Wrapped canonical tag lookup with a KeyError try block, defaulting to 'href' then 'content'
577 | - Added support for `url_actual` and `url_info`, which persist the data from the urllib2.urlopen object's `geturl()` and `info()`
578 | - `get_discrete_url` and `absolute_url` use the underlying url_actual data
579 | - added support for passing data and headers into urllib2 requests
580 |
581 | 0.4.1
582 | Initial Release
583 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | Copyright (c) 2012-2018, Jonathan Vanasco
148 |
149 |
--------------------------------------------------------------------------------
/tests/html_scaffolds/simple.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
37 |
38 |
--------------------------------------------------------------------------------
/tests/test_document_parsing.py:
--------------------------------------------------------------------------------
1 | # stdlib
2 | import os
3 | from typing import Callable
4 | from typing import Dict
5 | from typing import List
6 | from typing import Optional
7 | from typing import Tuple
8 | from typing import Union
9 | import unittest
10 |
11 | # local
12 | import metadata_parser
13 | from metadata_parser import MetadataParser
14 | from metadata_parser import urlparse
15 | from metadata_parser.exceptions import InvalidStrategy
16 |
17 |
18 | # ==============================================================================
19 |
20 |
21 | # this bit lets us run the tests directly during development
22 | _tests_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
23 | if _tests_dir.endswith("metadata_parser"):
24 | _tests_dir = os.path.join(_tests_dir, "tests")
25 | _examples_dir = os.path.join(_tests_dir, "html_scaffolds")
26 |
27 | # cache these lazily
28 | CACHED_FILESYSTEM_DOCUMENTS = {}
29 |
30 |
31 | doc_base = """<html><head>%(head)s</head><body></body></html>"""
32 |
33 | docs: Dict = {
34 | "good-canonical-absolute": {
35 | "url-real": """http://example.com""",
36 | "head": {
37 | "url-canonical": """http://example.com/canonical.html""",
38 | "url-og": None,
39 | },
40 | "expected": {"get_discrete_url()": "http://example.com/canonical.html"},
41 | },
42 | "good-og-absolute": {
43 | "url-real": """http://example.com""",
44 | "head": {"url-canonical": None, "url-og": """http://example.com/og.html"""},
45 | "expected": {"get_discrete_url()": "http://example.com/og.html"},
46 | },
47 | "good-canonical-noscheme-http": {
48 | "url-real": """http://example.com""",
49 | "head": {"url-canonical": """//example.com/canonical.html""", "url-og": None},
50 | "expected": {"get_discrete_url()": "http://example.com/canonical.html"},
51 | },
52 | "good-og-noscheme-http": {
53 | "url-real": """http://example.com""",
54 | "head": {"url-canonical": None, "url-og": """//example.com/og.html"""},
55 | "expected": {"get_discrete_url()": "http://example.com/og.html"},
56 | },
57 | "good-canonical-noscheme-https": {
58 | "url-real": """https://example.com""",
59 | "head": {"url-canonical": """//example.com/canonical.html""", "url-og": None},
60 | "expected": {"get_discrete_url()": "https://example.com/canonical.html"},
61 | },
62 | "good-og-noscheme-https": {
63 | "url-real": """https://example.com""",
64 | "head": {"url-canonical": None, "url-og": """//example.com/og.html"""},
65 | "expected": {"get_discrete_url()": "https://example.com/og.html"},
66 | },
67 | "good-canonical-relative": {
68 | "url-real": """http://example.com""",
69 | "head": {"url-canonical": """canonical.html""", "url-og": None},
70 | "expected": {"get_discrete_url()": "http://example.com/canonical.html"},
71 | },
72 | "good-canonical-relative_alt": {
73 | "url-real": """http://example.com""",
74 | "head": {"url-canonical": """/canonical.html""", "url-og": None},
75 | "expected": {"get_discrete_url()": "http://example.com/canonical.html"},
76 | },
77 | "good-og-relative_alt": {
78 | "url-real": """http://example.com""",
79 | "head": {"url-canonical": None, "url-og": """/og.html"""},
80 | "expected": {"get_discrete_url()": "http://example.com/og.html"},
81 | },
82 | "bad-canonical": {
83 | "url-real": """http://example.com/one-two-three.html""",
84 | "head": {"url-canonical": """...""", "url-og": None},
85 | "expected": {"get_discrete_url()": "http://example.com/one-two-three.html"},
86 | },
87 | "bad-canonical2": {
88 | "url-real": """http://example.com/one-two-three.html""",
89 | "head": {"url-canonical": """http://""", "url-og": None},
90 | "expected": {"get_discrete_url()": "http://example.com/one-two-three.html"},
91 | },
92 | "bad-canonical3": {
93 | "url-real": """http://example.com/one-two-three.html""",
94 | "head": {"url-canonical": """http://contentcreation""", "url-og": None},
95 | "expected": {"get_discrete_url()": "http://example.com/one-two-three.html"},
96 | },
97 | "bad-og": {
98 | "url-real": """http://example.com/one-two-three.html""",
99 | "head": {"url-canonical": None, "url-og": """..."""},
100 | "expected": {"get_discrete_url()": "http://example.com/one-two-three.html"},
101 | },
102 | "image-https": {
103 | "url-real": """https://example.com""",
104 | "head": {
105 | "url-canonical": """https://example.com/canonical.html""",
106 | "url-og": None,
107 | "url-og:image": """https://example.com/img.gif""",
108 | },
109 | "expected": {"og:image": """https://example.com/img.gif"""},
110 | },
111 | "image-https-noscheme": {
112 | "url-real": """https://example.com""",
113 | "head": {
114 | "url-canonical": """https://example.com/canonical.html""",
115 | "url-og": None,
116 | "url-og:image": """//example.com/img.gif""",
117 | },
118 | "expected": {"og:image": """https://example.com/img.gif"""},
119 | },
120 | "image-https-noscheme-secure": {
121 | "url-real": """https://example.com""",
122 | "head": {
123 | "url-canonical": """https://example.com/canonical.html""",
124 | "url-og": None,
125 | "url-og:image:secure_url": """//example.com/img.gif""",
126 | },
127 | "expected": {"og:image:secure_url": """https://example.com/img.gif"""},
128 | },
129 | "image-http": {
130 | "url-real": """http://example.com""",
131 | "head": {
132 | "url-canonical": """http://example.com/canonical.html""",
133 | "url-og": None,
134 | "url-og:image": """http://example.com/img.gif""",
135 | },
136 | "expected": {"og:image": """http://example.com/img.gif"""},
137 | },
138 | "image-http-noscheme": {
139 | "url-real": """http://example.com""",
140 | "head": {
141 | "url-canonical": """http://example.com/canonical.html""",
142 | "url-og": None,
143 | "url-og:image": """//example.com/img.gif""",
144 | },
145 | "expected": {"og:image": """http://example.com/img.gif"""},
146 | },
147 | "image-http-noscheme-secure": {
148 | "url-real": """http://example.com""",
149 | "head": {
150 | "url-canonical": """//example.com/canonical.html""",
151 | "url-og": None,
152 | "url-og:image:secure_url": """//example.com/img.gif""",
153 | },
154 | "expected": {"og:image:secure_url": None},
155 | },
156 | }
157 |
158 |
159 | def encoder_capitalizer(
160 | raw: Union[str, Dict], strategy: Optional[str] = None
161 | ) -> Union[str, dict]:
162 | # note, an api compliant encoder will only return str
163 | if isinstance(raw, dict):
164 | return {k.upper(): v.upper() for k, v in raw.items()}
165 | return raw.upper()
166 |
167 |
168 | def encoder_lowercaser(
169 | raw: Union[str, Dict], strategy: Optional[str] = None
170 | ) -> Union[str, dict]:
171 | # note, an api compliant encoder will only return str
172 | if isinstance(raw, dict):
173 | return {k.lower(): v.lower() for k, v in raw.items()}
174 | return raw.lower()
175 |
176 |
177 | # setup the test_docs with html bodies
178 | for test in list(docs.keys()):
179 | head = ""
180 | if "url-og" in docs[test]["head"]:
181 | if docs[test]["head"]["url-og"] is not None:
182 | head += (
183 | """"""
184 | % docs[test]["head"]["url-og"]
185 | )
186 | if "url-canonical" in docs[test]["head"]:
187 | if docs[test]["head"]["url-canonical"] is not None:
188 | head += (
189 | """"""
190 | % docs[test]["head"]["url-canonical"]
191 | )
192 | if "url-og:image" in docs[test]["head"]:
193 | if docs[test]["head"]["url-og:image"] is not None:
194 | head += (
195 | """"""
196 | % docs[test]["head"]["url-og:image"]
197 | )
198 | if "url-og:image:secure_url" in docs[test]["head"]:
199 | if docs[test]["head"]["url-og:image:secure_url"] is not None:
200 | head += (
201 | """"""
202 | % docs[test]["head"]["url-og:image:secure_url"]
203 | )
204 | custom_vars = {"head": head}
205 | docs[test]["doc"] = doc_base % custom_vars
206 |
207 |
208 | def _docs_test(test_names):
209 | errors = []
210 | for test in test_names:
211 | tests = []
212 | url = docs[test]["url-real"]
213 | parsed = metadata_parser.MetadataParser(url=url, html=docs[test]["doc"])
214 | if "get_discrete_url()" in docs[test]["expected"]:
215 | tests.append("get_discrete_url()")
216 | url_expected = docs[test]["expected"]["get_discrete_url()"]
217 | url_retrieved = parsed.get_discrete_url()
218 | if url_retrieved != url_expected:
219 | errors.append([test, "get_discrete_url()", url_expected, url_retrieved])
220 | if "og:image" in docs[test]["expected"]:
221 | tests.append("og:image")
222 | url_expected = docs[test]["expected"]["og:image"]
223 | url_retrieved = parsed.get_metadata_link("og:image")
224 | if url_retrieved != url_expected:
225 | errors.append([test, "og:image", url_expected, url_retrieved])
226 | if "og:image:secure_url" in docs[test]["expected"]:
227 | tests.append("og:image:secure_url")
228 | url_expected = docs[test]["expected"]["og:image:secure_url"]
229 | url_retrieved = parsed.get_metadata_link("og:image:secure_url")
230 | if url_retrieved != url_expected:
231 | errors.append(
232 | [test, "og:image:secure_url", url_expected, url_retrieved]
233 | )
234 | if not tests:
235 | raise ValueError("No tests!")
236 | return errors
237 |
238 |
239 | def _docs_test_parser(
240 | test_names, cached_urlparser, cached_urlparser_maxitems=None
241 | ) -> Tuple[metadata_parser.MetadataParser, List]:
242 | errors = []
243 | for test in test_names:
244 | tests = []
245 | url = docs[test]["url-real"]
246 | kwargs = {}
247 | if cached_urlparser != "*no-kwarg":
248 | kwargs["cached_urlparser"] = cached_urlparser
249 | if cached_urlparser_maxitems is not None:
250 | kwargs["cached_urlparser_maxitems"] = cached_urlparser_maxitems
251 | parsed = metadata_parser.MetadataParser(
252 | url=url, html=docs[test]["doc"], **kwargs
253 | )
254 | if "get_discrete_url()" in docs[test]["expected"]:
255 | tests.append("get_discrete_url()")
256 | url_expected = docs[test]["expected"]["get_discrete_url()"]
257 | url_retrieved = parsed.get_discrete_url()
258 | if url_retrieved != url_expected:
259 | errors.append([test, "get_discrete_url()", url_expected, url_retrieved])
260 | if not tests:
261 | raise ValueError("No tests!")
262 | return parsed, errors
263 |
264 |
265 | class TestHtmlDocument(unittest.TestCase):
266 | """
267 | python -m unittest tests.test_document_parsing.TestHtmlDocument.test_get_discrete_url__good_relative
268 | python -m unittest tests.test_document_parsing.TestHtmlDocument.test_get_discrete_url__good_absolute
269 | python -m unittest tests.test_document_parsing.TestHtmlDocument.test_get_discrete_url__bad
270 | """
271 |
272 | def test_get_discrete_url__good_relative(self):
273 | errors = _docs_test(
274 | [
275 | "good-canonical-relative",
276 | "good-canonical-relative_alt",
277 | "good-og-relative_alt",
278 | ]
279 | )
280 | if errors:
281 | raise ValueError(errors)
282 |
283 | def test_get_discrete_url__good_absolute(self):
284 | errors = _docs_test(["good-canonical-absolute", "good-og-absolute"])
285 | if errors:
286 | raise ValueError(errors)
287 |
288 | def test_get_discrete_url__good_noscheme(self):
289 | errors = _docs_test(
290 | [
291 | "good-canonical-noscheme-http",
292 | "good-og-noscheme-http",
293 | "good-canonical-noscheme-https",
294 | "good-og-noscheme-https",
295 | ]
296 | )
297 | if errors:
298 | raise ValueError(errors)
299 |
300 | def test_get_discrete_url__bad(self):
301 | errors = _docs_test(
302 | ["bad-canonical", "bad-canonical2", "bad-canonical3", "bad-og"]
303 | )
304 | if errors:
305 | raise ValueError(errors)
306 |
307 | def test_get_image(self):
308 | errors = _docs_test(
309 | [
310 | "image-http-noscheme-secure",
311 | "image-https-noscheme-secure",
312 | "image-http",
313 | "image-https",
314 | "image-http-noscheme",
315 | "image-https-noscheme",
316 | ]
317 | )
318 | if errors:
319 | raise ValueError(errors)
320 |
321 |
322 | class TestEncoders(unittest.TestCase):
323 | """
324 | python -m unittest tests.test_document_parsing.TestEncoders
325 | """
326 |
327 | _data = {
328 | "unicode_whitespace": {
329 | "raw": """Example line with\xa0unicode whitespace.""",
330 | "ascii": """Example line with unicode whitespace.""",
331 | },
332 | "unicode_chars": {
333 | "raw": """Example line with\xc2\xa0unicode chars.""",
334 | "ascii": """Example line withA unicode chars.""",
335 | },
336 | "decode_html_encoder": {
337 | "html": """""",
338 | "parsed": "Foo Bar, "Biz Bang Bash."",
339 | "decoded": 'Foo Bar, "Biz Bang Bash."',
340 | },
341 | }
342 |
343 | def _make_raw(self, data_option):
344 | # create a parsed result, and inject raw data.
345 | # data coming through beautifulsoup will be parsed differently
346 | parsed = metadata_parser.MetadataParser()
347 | parsed.parsed_result.metadata["meta"]["title"] = self._data[data_option]["raw"]
348 | return parsed
349 |
350 | def _make_html(self, data_option, default_encoder: Optional[Callable] = None):
351 | # data coming through beautifulsoup is parsed by that library
352 | parsed = metadata_parser.MetadataParser(
353 | html=self._data[data_option]["html"],
354 | force_doctype=True,
355 | default_encoder=default_encoder,
356 | )
357 | return parsed
358 |
359 | def test_unicode_whitespace(self):
360 | parsed = self._make_raw("unicode_whitespace")
361 | # title_raw = parsed.parsed_result.get_metadatas('title')
362 | _title_ascii = parsed.parsed_result.get_metadatas(
363 | "title", encoder=metadata_parser.utils.encode_ascii
364 | )
365 | title_ascii = _title_ascii["meta"]
366 | self.assertEqual(title_ascii[0], self._data["unicode_whitespace"]["ascii"])
367 |
368 | def test_unicode_chars(self):
369 | parsed = self._make_raw("unicode_chars")
370 | # title_raw = parsed.parsed_result.get_metadatas('title')
371 | _title_ascii = parsed.parsed_result.get_metadatas(
372 | "title", encoder=metadata_parser.utils.encode_ascii
373 | )
374 | title_ascii = _title_ascii["meta"]
375 | self.assertEqual(title_ascii[0], self._data["unicode_chars"]["ascii"])
376 |
377 | def test_decode_html_encoder(self):
378 | parsed = self._make_html("decode_html_encoder")
379 | _parsed_description = parsed.parsed_result.get_metadatas("description")
380 | parsed_description = _parsed_description["meta"]
381 |
382 | decoded_direct = metadata_parser.utils.decode_html(parsed_description[0])
383 | self.assertEqual(decoded_direct, self._data["decode_html_encoder"]["decoded"])
384 |
385 | _decoded_decoder = parsed.parsed_result.get_metadatas(
386 | "description", encoder=metadata_parser.utils.decode_html
387 | )
388 | decoded_decoder = _decoded_decoder["meta"]
389 | self.assertEqual(
390 | decoded_decoder[0], self._data["decode_html_encoder"]["decoded"]
391 | )
392 |
393 | def test_default_encoder(self):
394 | """
395 | ensure the default encoder is invoked
396 | """
397 | parsed_with_default = self._make_html(
398 | "decode_html_encoder", default_encoder=metadata_parser.utils.decode_html
399 | )
400 | parsed_no_default = self._make_html("decode_html_encoder")
401 |
402 | # does the default_encoder work?
403 | _decoded_default = parsed_with_default.parsed_result.get_metadatas(
404 | "description"
405 | )
406 | decoded_default = _decoded_default["meta"]
407 | self.assertEqual(
408 | decoded_default[0], self._data["decode_html_encoder"]["decoded"]
409 | )
410 |
411 | # does omitting the encoder work as expected?
412 | _not_decoded = parsed_no_default.parsed_result.get_metadatas("description")
413 | not_decoded = _not_decoded["meta"]
414 | self.assertEqual(not_decoded[0], self._data["decode_html_encoder"]["parsed"])
415 |
416 | # can we override the default_encoder to get RAW?
417 | _decoded_override = parsed_with_default.parsed_result.get_metadatas(
418 | "description", encoder="raw"
419 | )
420 | decoded_override = _decoded_override["meta"]
421 | self.assertEqual(
422 | decoded_override[0], self._data["decode_html_encoder"]["parsed"]
423 | )
424 |
425 | # can we override the default_encoder to get something else?
426 | # ensure these 2 aren't equal, otherwise the next bit doesn't really test!
427 | self.assertNotEqual(
428 | self._data["decode_html_encoder"]["parsed"],
429 | self._data["decode_html_encoder"]["parsed"].upper(),
430 | )
431 | _decoded_override = parsed_with_default.parsed_result.get_metadatas(
432 | "description", encoder=encoder_capitalizer
433 | )
434 | decoded_override = _decoded_override["meta"]
435 | self.assertEqual(
436 | decoded_override[0], self._data["decode_html_encoder"]["parsed"].upper()
437 | )
438 |
439 |
440 | class _TestDocumentParsingCore:
441 |
442 | def _MakeOne(self, filename):
443 | """lazy cache of files as needed"""
444 | global CACHED_FILESYSTEM_DOCUMENTS
445 | if filename not in CACHED_FILESYSTEM_DOCUMENTS:
446 | CACHED_FILESYSTEM_DOCUMENTS[filename] = open(
447 | os.path.join(_examples_dir, filename)
448 | ).read()
449 | return CACHED_FILESYSTEM_DOCUMENTS[filename]
450 |
451 | def _MakeOneParsed(self, **kwargs) -> metadata_parser.MetadataParser:
452 | html = self._MakeOne("duplicates.html")
453 |
454 | mp_kwargs = {}
455 | if "strategy" in kwargs:
456 | mp_kwargs["strategy"] = kwargs["strategy"]
457 |
458 | parsed = metadata_parser.MetadataParser(url=None, html=html, **mp_kwargs)
459 |
460 | # we should be tracking the version now
461 | self.assertIn("_v", parsed.parsed_result.metadata)
462 |
463 | # it should be the same version
464 | self.assertEqual(
465 | parsed.parsed_result.metadata_version,
466 | metadata_parser.ParsedResult._version,
467 | )
468 |
469 | # we should be tracking the version now
470 | self.assertIn("_v", parsed.parsed_result.metadata)
471 |
472 | # it should be the same version
473 | self.assertEqual(
474 | parsed.parsed_result.metadata_version, metadata_parser.ParsedResult._version
475 | )
476 | return parsed
477 |
478 |
479 | class TestDocumentParsing_Exceptions(unittest.TestCase, _TestDocumentParsingCore):
480 |
481 | def test__all_in_list(self):
482 | parsed = self._MakeOneParsed()
483 | # this should error!
484 | with self.assertRaises(InvalidStrategy) as cm:
485 | parsed.parsed_result.get_metadatas("canonical", strategy=["all"])
486 | self.assertEqual(
487 | cm.exception.args[0],
488 | 'Submit "all" as a `str`, not in a `list`.',
489 | )
490 |
491 | def test__known_as_str(self):
492 | parsed = self._MakeOneParsed()
493 | # this should error!
494 | with self.assertRaises(InvalidStrategy) as cm:
495 | parsed.parsed_result.get_metadatas("TestMixedCandidates1a", strategy="dc")
496 | self.assertEqual(
497 | cm.exception.args[0],
498 | 'If `strategy` is not a `list`, it must be "all".',
499 | )
500 |
501 | def test__unknown_in_list(self):
502 | parsed = self._MakeOneParsed()
503 | # this should error!
504 | with self.assertRaises(InvalidStrategy) as cm:
505 | parsed.parsed_result.get_metadatas("canonical", strategy=["unknown"])
506 | self.assertEqual(
507 | cm.exception.args[0],
508 | 'Invalid strategy: "unknown".',
509 | )
510 | with self.assertRaises(InvalidStrategy) as cm:
511 | parsed.parsed_result.get_metadatas(
512 | "canonical", strategy=["unknown", "unknown-too"]
513 | )
514 | self.assertEqual(
515 | cm.exception.args[0],
516 | 'Invalid strategy: "unknown", "unknown-too".',
517 | )
518 |
519 |
520 | class TestDocumentParsing(unittest.TestCase, _TestDocumentParsingCore):
521 | """
522 | python -m unittest tests.test_document_parsing.TestDocumentParsing
523 | python -m unittest tests.test_document_parsing.TestDocumentParsing.test_simple_html
524 | python -m unittest tests.test_document_parsing.TestDocumentParsing.test_html_urls
525 | python -m unittest tests.test_document_parsing.TestDocumentParsing.test_complex_html
526 | python -m unittest tests.test_document_parsing.TestDocumentParsing.test_charsets
527 | """
528 |
529 | def test_simple_html(self):
530 | """this tests simple.html to have certain fields"""
531 | html = self._MakeOne("simple.html")
532 | parsed = metadata_parser.MetadataParser(url=None, html=html)
533 | self.assertEqual(
534 | parsed.parsed_result.metadata["meta"]["article:publisher"],
535 | "https://www.example.com/meta/property=article:publisher",
536 | )
537 | self.assertEqual(parsed.parsed_result.metadata["meta"]["author"], "meta.author")
538 | self.assertEqual(
539 | parsed.parsed_result.metadata["meta"]["description"], "meta.description"
540 | )
541 | self.assertEqual(
542 | parsed.parsed_result.metadata["meta"]["keywords"], "meta.keywords"
543 | )
544 | self.assertEqual(
545 | parsed.parsed_result.metadata["meta"]["og:description"],
546 | "meta.property=og:description",
547 | )
548 | self.assertEqual(
549 | parsed.parsed_result.metadata["meta"]["og:image"],
550 | "https://www.example.com/meta/property=og:image",
551 | )
552 | self.assertEqual(
553 | parsed.parsed_result.metadata["meta"]["og:site_name"],
554 | "meta.property=og:site_name",
555 | )
556 | self.assertEqual(
557 | parsed.parsed_result.metadata["meta"]["og:title"], "meta.property=og:title"
558 | )
559 | self.assertEqual(
560 | parsed.parsed_result.metadata["meta"]["og:type"], "meta.property=og:type"
561 | )
562 | self.assertEqual(
563 | parsed.parsed_result.metadata["meta"]["og:url"],
564 | "https://www.example.com/meta/property=og:url",
565 | )
566 | self.assertEqual(
567 | parsed.parsed_result.metadata["meta"]["twitter:card"],
568 | "meta.name=twitter:card",
569 | )
570 | self.assertEqual(
571 | parsed.parsed_result.metadata["meta"]["twitter:description"],
572 | "meta.name=twitter:description",
573 | )
574 | self.assertEqual(
575 | parsed.parsed_result.metadata["meta"]["twitter:image:src"],
576 | "https://example.com/meta/name=twitter:image:src",
577 | )
578 | self.assertEqual(
579 | parsed.parsed_result.metadata["meta"]["twitter:site"],
580 | "meta.name=twitter:site",
581 | )
582 | self.assertEqual(
583 | parsed.parsed_result.metadata["meta"]["twitter:title"],
584 | "meta.name=twitter:title",
585 | )
586 | self.assertEqual(
587 | parsed.parsed_result.metadata["meta"]["twitter:url"],
588 | "https://example.com/meta/name=twitter:url",
589 | )
590 | self.assertEqual(
591 | parsed.parsed_result.metadata["og"]["description"],
592 | "meta.property=og:description",
593 | )
594 | self.assertEqual(
595 | parsed.parsed_result.metadata["og"]["image"],
596 | "https://www.example.com/meta/property=og:image",
597 | )
598 | self.assertEqual(
599 | parsed.parsed_result.metadata["og"]["site_name"],
600 | "meta.property=og:site_name",
601 | )
602 | self.assertEqual(
603 | parsed.parsed_result.metadata["og"]["title"], "meta.property=og:title"
604 | )
605 | self.assertEqual(
606 | parsed.parsed_result.metadata["og"]["type"], "meta.property=og:type"
607 | )
608 | self.assertEqual(
609 | parsed.parsed_result.metadata["og"]["url"],
610 | "https://www.example.com/meta/property=og:url",
611 | )
612 | self.assertEqual(
613 | parsed.parsed_result.metadata["page"]["canonical"],
614 | "http://example.com/meta/rel=canonical",
615 | )
616 | self.assertEqual(
617 | parsed.parsed_result.metadata["page"]["shortlink"],
618 | "http://example.com/meta/rel=shortlink",
619 | )
620 | self.assertEqual(parsed.parsed_result.metadata["page"]["title"], "title")
621 | self.assertEqual(
622 | parsed.parsed_result.metadata["twitter"]["card"], "meta.name=twitter:card"
623 | )
624 | self.assertEqual(
625 | parsed.parsed_result.metadata["twitter"]["description"],
626 | "meta.name=twitter:description",
627 | )
628 | self.assertEqual(
629 | parsed.parsed_result.metadata["twitter"]["image:src"],
630 | "https://example.com/meta/name=twitter:image:src",
631 | )
632 | self.assertEqual(
633 | parsed.parsed_result.metadata["twitter"]["site"], "meta.name=twitter:site"
634 | )
635 | self.assertEqual(
636 | parsed.parsed_result.metadata["twitter"]["title"], "meta.name=twitter:title"
637 | )
638 | self.assertEqual(
639 | parsed.parsed_result.metadata["twitter"]["url"],
640 | "https://example.com/meta/name=twitter:url",
641 | )
642 | self.assertEqual(
643 | parsed.parsed_result.metadata["twitter"]["data"],
644 | "meta.name=twitter:data||value",
645 | )
646 | self.assertNotIn("label", parsed.parsed_result.metadata["twitter"])
647 | self.assertEqual(parsed.parsed_result.is_opengraph_minimum(), True)
648 |
649 | def test_html_urls(self):
650 | """this tests simple.html to have certain fields"""
651 | html = self._MakeOne("simple.html")
652 | parsed = metadata_parser.MetadataParser(url=None, html=html)
653 | # by default we do og_first
654 | self.assertEqual(
655 | parsed.get_discrete_url(), "https://www.example.com/meta/property=og:url"
656 | )
657 | self.assertEqual(
658 | parsed.get_discrete_url(canonical_first=True, og_first=False),
659 | "http://example.com/meta/rel=canonical",
660 | )
661 | self.assertEqual(
662 | parsed.get_url_opengraph(), "https://www.example.com/meta/property=og:url"
663 | )
664 | self.assertEqual(
665 | parsed.get_url_canonical(), "http://example.com/meta/rel=canonical"
666 | )
667 |
668 | def test_encoding_fallback(self):
669 | """this tests simple.html to have certain fields"""
670 | html = """body"""
671 | parsed = metadata_parser.MetadataParser(url=None, html=html)
672 | # typing scope
673 | assert parsed.response is not None
674 | self.assertEqual(parsed.response.encoding, "ISO-8859-1")
675 |
676 | def test_encoding_declared(self):
677 | html = """<html><head><meta charset="UTF-8"/></head><body>body</body></html>"""
678 | parsed = metadata_parser.MetadataParser(url=None, html=html)
679 | # typing scope
680 | assert parsed.response is not None
681 | self.assertEqual(parsed.response.encoding, "UTF-8")
682 |
683 | def test_charsets(self):
684 | """
685 | python -m unittest tests.test_document_parsing.TestDocumentParsing.test_charsets
686 | """
687 | a_html = self._MakeOne("charset_a.html")
688 | a_parsed = metadata_parser.MetadataParser(url=None, html=a_html)
689 | self.assertEqual(
690 | a_parsed.parsed_result.metadata["meta"]["content-type"],
691 | "text/html; charset=UTF-8",
692 | )
693 |
694 | b_html = self._MakeOne("charset_b.html")
695 | b_parsed = metadata_parser.MetadataParser(url=None, html=b_html)
696 | self.assertEqual(b_parsed.parsed_result.metadata["meta"]["charset"], "UTF-8")
697 |
698 | c_html = self._MakeOne("charset_c.html")
699 | c_parsed = metadata_parser.MetadataParser(url=None, html=c_html)
700 | self.assertEqual(c_parsed.parsed_result.metadata["meta"]["charset"], "UTF-8")
701 |
702 | def test_malformed_twitter(self):
703 | """
704 | this tests simple.html to have certain fields
705 | python -m unittest tests.test_document_parsing.TestDocumentParsing.test_malformed_twitter
706 | """
707 | html = self._MakeOne("simple.html")
708 |
709 | # the default behavior is to not support malformed documents;
710 | # that means we consult 'value' (not 'content') for 'data' and 'label' tags
711 | # in `simple.html`, "label" (incorrectly) uses "content" and "data" uses "value"
712 | parsed = metadata_parser.MetadataParser(url=None, html=html)
713 | self.assertEqual(
714 | parsed.parsed_result.metadata["twitter"]["data"],
715 | "meta.name=twitter:data||value",
716 | )
717 | self.assertNotIn("label", parsed.parsed_result.metadata["twitter"])
718 | self.assertNotIn("invalid", parsed.parsed_result.metadata["twitter"])
719 |
720 | # now with `support_malformed` support we will load the label!
721 | parsed2 = metadata_parser.MetadataParser(
722 | url=None, html=html, support_malformed=True
723 | )
724 | self.assertEqual(
725 | parsed2.parsed_result.metadata["twitter"]["data"],
726 | "meta.name=twitter:data||value",
727 | )
728 | self.assertEqual(
729 | parsed2.parsed_result.metadata["twitter"]["label"],
730 | "meta.name=twitter:label||content",
731 | )
732 | self.assertEqual(
733 | parsed2.parsed_result.metadata["twitter"]["invalid"],
734 | "meta.name=twitter:invalid",
735 | )
736 |
737 | # try it with dupes...
738 | html_dupes = self._MakeOne("duplicates.html")
739 | parsed_dupe = metadata_parser.MetadataParser(url=None, html=html_dupes)
740 |         # two items for each of data/label, but the label values are empty strings, so "label" is not stored
741 | self.assertEqual(
742 | parsed_dupe.parsed_result.metadata["twitter"]["data"],
743 | ["meta.name=twitter:data||value,1", "meta.name=twitter:data||value,2"],
744 | )
745 |         self.assertNotIn("label", parsed_dupe.parsed_result.metadata["twitter"])
746 |
747 | # everyone is happy when metadata is malformed!
748 | parsed_dupe = metadata_parser.MetadataParser(
749 | url=None, html=html_dupes, support_malformed=True
750 | )
751 | self.assertEqual(
752 | parsed_dupe.parsed_result.metadata["twitter"]["data"],
753 | ["meta.name=twitter:data||value,1", "meta.name=twitter:data||value,2"],
754 | )
755 | self.assertEqual(
756 | parsed_dupe.parsed_result.metadata["twitter"]["label"],
757 | [
758 | "meta.name=twitter:label||content,1",
759 | "meta.name=twitter:label||content,2",
760 | ],
761 | )
762 |
763 |
764 | class TestDocumentParsing_Complex(unittest.TestCase, _TestDocumentParsingCore):
765 | """
766 | this tests duplicates.html to have certain fields under complex conditions
767 | """
768 |
769 | def test_og_image(self):
770 | parsed = self._MakeOneParsed()
771 |
772 | # we have 3 og:image entries in this file
773 | _computed_link = parsed.get_metadata_link("image", strategy=["og"])
774 | assert _computed_link == "https://www.example.com/meta/property=og:image"
775 | _all_og_images = parsed.parsed_result.get_metadatas("og:image")
776 | assert _all_og_images is not None
777 | assert isinstance(_all_og_images, dict)
778 | assert "meta" in _all_og_images
779 |
780 | all_og_images = _all_og_images["meta"]
781 |
782 | assert len(all_og_images) == 3
783 | assert "https://www.example.com/meta/property=og:image" in all_og_images
784 |         # bs4 parses the ampersand entity internally, but serializes it back as a bare `&` by default
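        # e.g. `&amp;duplicate=1` in the raw HTML is returned as `&duplicate=1`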
785 | assert (
786 | "https://www.example.com/meta?property=og:image&duplicate=1"
787 | in all_og_images
788 | )
789 | assert (
790 | "https://www.example.com/meta?property=og:image&duplicate=2"
791 | in all_og_images
792 | )
793 |
794 | def test__citation_authors(self):
795 | parsed = self._MakeOneParsed()
796 |
797 | # -----
798 | # this is a duplicate element and should be stored in the metadata dict as a list
799 | citation_authors = [
800 | "citation_author:1",
801 | "citation_author:2",
802 | "citation_author:3",
803 | ]
804 | # these should be lists
805 | self.assertEqual(
806 | parsed.parsed_result.metadata["meta"]["citation_author"], citation_authors
807 | )
808 |
809 | self.assertEqual(
810 | parsed.parsed_result.get_metadatas("citation_author", ["meta"])["meta"],
811 | citation_authors,
812 | )
813 |
814 | # this is a string
815 | self.assertEqual(
816 | parsed.parsed_result.get_metadatas("citation_author", ["meta"])["meta"][0],
817 | citation_authors[0],
818 | )
819 |
820 | def test__meta_authors(self):
821 | parsed = self._MakeOneParsed()
822 |
823 | meta_authors = ["meta.author:1", "meta.author:2"]
824 |
825 | # these should be lists
826 | self.assertEqual(parsed.parsed_result.metadata["meta"]["author"], meta_authors)
827 | self.assertEqual(
828 | parsed.parsed_result.get_metadatas("author", ["meta"])["meta"], meta_authors
829 | )
830 | # this is a string
831 | self.assertEqual(
832 | parsed.parsed_result.get_metadatas("author", ["meta"])["meta"][0],
833 | meta_authors[0],
834 | )
835 |
836 | def test__meta_keywords(self):
837 | parsed = self._MakeOneParsed()
838 |
839 | meta_kws = ["meta.keywords:1", "meta.keywords:2"]
840 | # these should be lists
841 | self.assertEqual(
842 | parsed.parsed_result.metadata["meta"]["keywords"],
843 | meta_kws,
844 | )
845 | self.assertEqual(
846 | parsed.parsed_result.get_metadatas("keywords", ["meta"])["meta"],
847 | meta_kws,
848 | )
849 | # this is a string
850 | self.assertEqual(
851 | parsed.parsed_result.get_metadatas("keywords", ["meta"])["meta"][0],
852 | meta_kws[0],
853 | )
854 |
855 | def test__meta_description(self):
856 | parsed = self._MakeOneParsed()
857 | # -----
858 | # this is a single element and should be stored in the metadata dict as a string
859 | description = "meta.description"
860 |
861 | # these should be lists
862 | self.assertEqual(
863 | parsed.parsed_result.get_metadatas("description", ["meta"])["meta"],
864 | [description],
865 | )
866 |
867 | # this is a string
868 | self.assertEqual(
869 | parsed.parsed_result.metadata["meta"]["description"],
870 | description,
871 | )
872 | self.assertEqual(
873 | parsed.parsed_result.get_metadatas("description", ["meta"])["meta"][0],
874 | description,
875 | )
876 |
877 | def test__dc__basic(self):
878 | parsed = self._MakeOneParsed()
879 | # -----
880 | # dc creator has a language variant
881 | # 'dc': {'Creator': [{'content': 'Plato'},
882 | # {'content': 'Platon', 'lang': 'fr'}],
883 |
884 | self.assertIn("Creator", parsed.parsed_result.metadata["dc"])
885 | dc_creator = parsed.parsed_result.metadata["dc"]["Creator"]
886 | # so this should be a list
887 | self.assertIs(type(dc_creator), list)
888 | # with a length of 2
889 | self.assertEqual(len(dc_creator), 2)
890 | self.assertIs(type(dc_creator[0]), dict)
891 | self.assertIs(type(dc_creator[1]), dict)
892 | self.assertIn("content", dc_creator[0])
893 | self.assertEqual(dc_creator[0]["content"], "Plato")
894 | self.assertIn("content", dc_creator[1])
895 | self.assertEqual(dc_creator[1]["content"], "Platon")
896 | self.assertIn("lang", dc_creator[1])
897 | self.assertEqual(dc_creator[1]["lang"], "fr")
898 |
899 | def test__dc__subject(self):
900 | parsed = self._MakeOneParsed()
901 | # -----
902 | # dc subject has a scheme variant
903 | # 'Subject': [{'content': 'heart attack'},
904 | # {'content': 'Myocardial Infarction; Pericardial Effusion',
905 | # 'scheme': 'MESH'},
906 | # {'content': 'vietnam war'},
907 | # {'content': 'Vietnamese Conflict, 1961-1975',
908 | # 'scheme': 'LCSH'},
909 | # {'content': 'Friendship'},
910 | # {'content': '158.25', 'scheme': 'ddc'}]},
911 | dcSubjectsExpected = [
912 | {"content": "heart attack"},
913 | {
914 | "content": "Myocardial Infarction; Pericardial Effusion",
915 | "scheme": "MESH",
916 | },
917 | {"content": "vietnam war"},
918 | {"content": "Vietnamese Conflict, 1961-1975", "scheme": "LCSH"},
919 | {"content": "Friendship"},
920 | {"content": "158.25", "scheme": "ddc"},
921 | ]
922 | self.assertIn("Subject", parsed.parsed_result.metadata["dc"])
923 | dc_subject = parsed.parsed_result.metadata["dc"]["Subject"]
924 | self.assertIs(type(dc_subject), list)
925 | self.assertEqual(len(dc_subject), len(dcSubjectsExpected))
926 | for idx, _expected in enumerate(dc_subject):
927 | self.assertIs(type(dc_subject[idx]), dict)
928 | self.assertEqual(
929 | len(dc_subject[idx].keys()), len(dcSubjectsExpected[idx].keys())
930 | )
931 | self.assertEqual(
932 | sorted(dc_subject[idx].keys()), sorted(dcSubjectsExpected[idx].keys())
933 | )
934 | for _key in dc_subject[idx].keys():
935 | self.assertEqual(dc_subject[idx][_key], dcSubjectsExpected[idx][_key])
936 |
937 | def test__dc__TestMixedCandidates1(self):
938 | parsed = self._MakeOneParsed()
939 | # -----
940 | # dc TestMixedCandidates1
941 | # handle the ordering of results
942 | # the raw info tested is the same as the above Subject test...
943 | dcTestMixedCandidates1aExpected = {"content": "Friendship"}
944 | self.assertIn(
945 | "TestMixedCandidates1a",
946 | parsed.parsed_result.metadata["dc"],
947 | )
948 | dc_mixed_candidates = parsed.parsed_result.metadata["dc"][
949 | "TestMixedCandidates1a"
950 | ]
951 | self.assertIs(type(dc_mixed_candidates), dict)
952 | self.assertEqual(
953 | len(dc_mixed_candidates.keys()), len(dcTestMixedCandidates1aExpected.keys())
954 | )
955 | self.assertEqual(
956 | sorted(dc_mixed_candidates.keys()),
957 | sorted(dcTestMixedCandidates1aExpected.keys()),
958 | )
959 | for _key in dc_mixed_candidates.keys():
960 | self.assertEqual(
961 | dc_mixed_candidates[_key],
962 | dcTestMixedCandidates1aExpected[_key],
963 | )
964 |
965 | # test get_metadatas
966 | with self.assertRaises(InvalidStrategy) as cm:
967 | parsed.parsed_result.get_metadatas("TestMixedCandidates1a", strategy="dc")
968 | self.assertEqual(
969 | cm.exception.args[0],
970 | 'If `strategy` is not a `list`, it must be "all".',
971 | )
972 |
973 | self.assertEqual(
974 | parsed.parsed_result.get_metadatas(
975 | "TestMixedCandidates1a", strategy=["dc"]
976 | )["dc"][0],
977 | {"content": "Friendship"},
978 | )
979 | self.assertEqual(
980 | parsed.parsed_result.get_metadatas(
981 | "TestMixedCandidates1a", strategy=["dc"]
982 | )["dc"],
983 | [dcTestMixedCandidates1aExpected],
984 | )
985 | self.assertEqual(
986 | parsed.parsed_result.get_metadatas(
987 | "TestMixedCandidates1a", strategy=["dc"], encoder=encoder_capitalizer
988 | )["dc"],
989 | [{"CONTENT": "FRIENDSHIP"}],
990 | )
991 |
992 | # 1b
993 | dcTestMixedCandidates1bExpected = {"content": "158.25", "scheme": "ddc"}
994 | self.assertIn("TestMixedCandidates1b", parsed.parsed_result.metadata["dc"])
995 | dc_mixed_candidates = parsed.parsed_result.metadata["dc"][
996 | "TestMixedCandidates1b"
997 | ]
998 | self.assertIs(type(dc_mixed_candidates), dict)
999 | self.assertEqual(
1000 | len(dc_mixed_candidates.keys()), len(dcTestMixedCandidates1bExpected.keys())
1001 | )
1002 | self.assertEqual(
1003 | sorted(dc_mixed_candidates.keys()),
1004 | sorted(dcTestMixedCandidates1bExpected.keys()),
1005 | )
1006 | for _key in dc_mixed_candidates.keys():
1007 | self.assertEqual(
1008 | dc_mixed_candidates[_key], dcTestMixedCandidates1bExpected[_key]
1009 | )
1010 |
1011 | # test get_metadatas
1012 | self.assertEqual(
1013 | parsed.parsed_result.get_metadatas(
1014 | "TestMixedCandidates1b", strategy=["dc"]
1015 | )["dc"][0],
1016 | {"content": "158.25", "scheme": "ddc"},
1017 | )
1018 | self.assertEqual(
1019 | parsed.parsed_result.get_metadatas(
1020 | "TestMixedCandidates1b", strategy=["dc"]
1021 | )["dc"],
1022 | [dcTestMixedCandidates1bExpected],
1023 | )
1024 | self.assertEqual(
1025 | parsed.parsed_result.get_metadatas(
1026 | "TestMixedCandidates1b", strategy=["dc"], encoder=encoder_capitalizer
1027 | )["dc"],
1028 | [{"CONTENT": "158.25", "SCHEME": "DDC"}],
1029 | )
1030 |
1031 | def test__dc__TestMixedCandidates2(self):
1032 | parsed = self._MakeOneParsed()
1033 | # -----
1034 | # dc TestMixedCandidates2
1035 | # handle the ordering of results
1036 | # the raw info tested is the same as the above Subject test...
1037 | dcTestMixedCandidates2aExpected = [
1038 | {"content": "158.25", "scheme": "ddc"},
1039 | {"content": "Friendship"},
1040 | ]
1041 | self.assertIn(
1042 | "TestMixedCandidates2a",
1043 | parsed.parsed_result.metadata["dc"],
1044 | )
1045 | dc_mixed_candidates = parsed.parsed_result.metadata["dc"][
1046 | "TestMixedCandidates2a"
1047 | ]
1048 | self.assertIs(type(dc_mixed_candidates), list)
1049 | self.assertEqual(len(dc_mixed_candidates), len(dcTestMixedCandidates2aExpected))
1050 | for idx, _expected in enumerate(dc_mixed_candidates):
1051 | self.assertIs(type(dc_mixed_candidates[idx]), dict)
1052 | self.assertEqual(
1053 | len(dc_mixed_candidates[idx].keys()),
1054 | len(dcTestMixedCandidates2aExpected[idx].keys()),
1055 | )
1056 | self.assertEqual(
1057 | sorted(dc_mixed_candidates[idx].keys()),
1058 | sorted(dcTestMixedCandidates2aExpected[idx].keys()),
1059 | )
1060 | for _key in dc_mixed_candidates[idx].keys():
1061 | self.assertEqual(
1062 | dc_mixed_candidates[idx][_key],
1063 | dcTestMixedCandidates2aExpected[idx][_key],
1064 | )
1065 |
1066 | # test get_metadatas
1067 |
1068 | self.assertEqual(
1069 | parsed.parsed_result.get_metadatas(
1070 | "TestMixedCandidates2a", strategy=["dc"]
1071 | )["dc"][0],
1072 | {"content": "158.25", "scheme": "ddc"},
1073 | )
1074 | self.assertEqual(
1075 | parsed.parsed_result.get_metadatas(
1076 | "TestMixedCandidates2a", strategy=["dc"]
1077 | )["dc"],
1078 | dcTestMixedCandidates2aExpected,
1079 | )
1080 |         self.assertEqual(
1081 |             parsed.parsed_result.get_metadatas(
1082 |                 "TestMixedCandidates2a", strategy=["dc"], encoder=encoder_capitalizer
1083 |             )["dc"][0],
1084 |             {"CONTENT": "158.25", "SCHEME": "DDC"},
1085 |             # the full list, including {"CONTENT": "FRIENDSHIP"}, is checked next
1086 |         )
1087 | self.assertEqual(
1088 | parsed.parsed_result.get_metadatas(
1089 | "TestMixedCandidates2a", strategy=["dc"], encoder=encoder_capitalizer
1090 | )["dc"],
1091 | [{"CONTENT": "158.25", "SCHEME": "DDC"}, {"CONTENT": "FRIENDSHIP"}],
1092 | )
1093 |
1094 | # 2b
1095 | dcTestMixedCandidates2bExpected = [
1096 | {"content": "Friendship"},
1097 | {"content": "158.25", "scheme": "ddc"},
1098 | ]
1099 | self.assertIn(
1100 | "TestMixedCandidates2b",
1101 | parsed.parsed_result.metadata["dc"],
1102 | )
1103 | dc_mixed_candidates = parsed.parsed_result.metadata["dc"][
1104 | "TestMixedCandidates2b"
1105 | ]
1106 | self.assertIs(type(dc_mixed_candidates), list)
1107 | self.assertEqual(len(dc_mixed_candidates), len(dcTestMixedCandidates2bExpected))
1108 | for idx, _expected in enumerate(dc_mixed_candidates):
1109 | self.assertIs(type(dc_mixed_candidates[idx]), dict)
1110 | self.assertEqual(
1111 | len(dc_mixed_candidates[idx].keys()),
1112 | len(dcTestMixedCandidates2bExpected[idx].keys()),
1113 | )
1114 | self.assertEqual(
1115 | sorted(dc_mixed_candidates[idx].keys()),
1116 | sorted(dcTestMixedCandidates2bExpected[idx].keys()),
1117 | )
1118 | for _key in dc_mixed_candidates[idx].keys():
1119 | self.assertEqual(
1120 | dc_mixed_candidates[idx][_key],
1121 | dcTestMixedCandidates2bExpected[idx][_key],
1122 | )
1123 |
1124 | # test get_metadatas
1125 | self.assertEqual(
1126 | parsed.parsed_result.get_metadatas(
1127 | "TestMixedCandidates2b", strategy=["dc"]
1128 | )["dc"][0],
1129 | {"content": "Friendship"},
1130 | )
1131 | self.assertEqual(
1132 | parsed.parsed_result.get_metadatas(
1133 | "TestMixedCandidates2b", strategy=["dc"]
1134 | )["dc"],
1135 | dcTestMixedCandidates2bExpected,
1136 | )
1137 | self.assertEqual(
1138 | parsed.parsed_result.get_metadatas(
1139 | "TestMixedCandidates2b", strategy=["dc"], encoder=encoder_capitalizer
1140 | )["dc"][0],
1141 | {"CONTENT": "FRIENDSHIP"},
1142 | )
1143 | self.assertEqual(
1144 | parsed.parsed_result.get_metadatas(
1145 | "TestMixedCandidates2b", strategy=["dc"], encoder=encoder_capitalizer
1146 | )["dc"],
1147 | [{"CONTENT": "FRIENDSHIP"}, {"CONTENT": "158.25", "SCHEME": "DDC"}],
1148 | )
1149 |
1150 | def test__TestMixedField0(self):
1151 | parsed = self._MakeOneParsed()
1152 | # ok, mixedfield tests:
1153 | # TestMixedField0
1154 | self.assertEqual(
1155 | parsed.parsed_result.get_metadatas("TestMixedField0", strategy=["dc"]),
1156 | None,
1157 | )
1158 | self.assertEqual(
1159 | parsed.parsed_result.get_metadatas("TestMixedField0", strategy=["meta"])[
1160 | "meta"
1161 | ][0],
1162 | "meta:TestMixedField0",
1163 | )
1164 | self.assertEqual(
1165 | parsed.parsed_result.get_metadatas("TestMixedField0", strategy="all"),
1166 | {"meta": ["meta:TestMixedField0"]},
1167 | )
1168 | self.assertEqual(
1169 | parsed.parsed_result.get_metadatas(
1170 | "TestMixedField0", strategy=["dc"], encoder=encoder_capitalizer
1171 | ),
1172 | None,
1173 | )
1174 | self.assertEqual(
1175 | parsed.parsed_result.get_metadatas(
1176 | "TestMixedField0", strategy=["meta"], encoder=encoder_capitalizer
1177 | )["meta"][0],
1178 | "META:TESTMIXEDFIELD0",
1179 | )
1180 | self.assertEqual(
1181 | parsed.parsed_result.get_metadatas(
1182 | "TestMixedField0", strategy="all", encoder=encoder_capitalizer
1183 | ),
1184 | {"meta": ["META:TESTMIXEDFIELD0"]},
1185 | )
1186 | self.assertEqual(
1187 | parsed.parsed_result.get_metadatas("TestMixedField0", strategy=["dc"]),
1188 | None,
1189 | )
1190 | self.assertEqual(
1191 | parsed.parsed_result.get_metadatas("TestMixedField0", strategy=["meta"])[
1192 | "meta"
1193 | ],
1194 | ["meta:TestMixedField0"],
1195 | )
1196 | self.assertEqual(
1197 | parsed.parsed_result.get_metadatas("TestMixedField0", strategy="all"),
1198 | {"meta": ["meta:TestMixedField0"]},
1199 | )
1200 | self.assertEqual(
1201 | parsed.parsed_result.get_metadatas(
1202 | "TestMixedField0", strategy=["dc"], encoder=encoder_capitalizer
1203 | ),
1204 | None,
1205 | )
1206 | self.assertEqual(
1207 | parsed.parsed_result.get_metadatas(
1208 | "TestMixedField0", strategy=["meta"], encoder=encoder_capitalizer
1209 | )["meta"],
1210 | ["META:TESTMIXEDFIELD0"],
1211 | )
1212 | self.assertEqual(
1213 | parsed.parsed_result.get_metadatas(
1214 | "TestMixedField0", strategy="all", encoder=encoder_capitalizer
1215 | ),
1216 | {"meta": ["META:TESTMIXEDFIELD0"]},
1217 | )
1218 |
1219 | def test__TestMixedField1(self):
1220 | parsed = self._MakeOneParsed()
1221 | # TestMixedField1
1222 | self.assertEqual(
1223 | parsed.parsed_result.get_metadatas("TestMixedField1", strategy=["dc"])[
1224 | "dc"
1225 | ][0],
1226 | {"content": "dc:TestMixedField1"},
1227 | )
1228 | self.assertEqual(
1229 | parsed.parsed_result.get_metadatas("TestMixedField1", strategy=["meta"])[
1230 | "meta"
1231 | ][0],
1232 | "meta:TestMixedField1",
1233 | )
1234 | self.assertEqual(
1235 | parsed.parsed_result.get_metadatas("TestMixedField1", strategy="all"),
1236 | {
1237 | "dc": [{"content": "dc:TestMixedField1"}],
1238 | "meta": ["meta:TestMixedField1"],
1239 | },
1240 | )
1241 | self.assertEqual(
1242 | parsed.parsed_result.get_metadatas(
1243 | "TestMixedField1", strategy=["dc"], encoder=encoder_capitalizer
1244 | )["dc"][0],
1245 | {"CONTENT": "DC:TESTMIXEDFIELD1"},
1246 | )
1247 | self.assertEqual(
1248 | parsed.parsed_result.get_metadatas(
1249 | "TestMixedField1", strategy=["meta"], encoder=encoder_capitalizer
1250 | )["meta"][0],
1251 | "META:TESTMIXEDFIELD1",
1252 | )
1253 | self.assertEqual(
1254 | parsed.parsed_result.get_metadatas(
1255 | "TestMixedField1", strategy="all", encoder=encoder_capitalizer
1256 | ),
1257 | {
1258 | "dc": [{"CONTENT": "DC:TESTMIXEDFIELD1"}],
1259 | "meta": ["META:TESTMIXEDFIELD1"],
1260 | },
1261 | )
1262 | self.assertEqual(
1263 | parsed.parsed_result.get_metadatas("TestMixedField1", strategy=["dc"])[
1264 | "dc"
1265 | ],
1266 | [{"content": "dc:TestMixedField1"}],
1267 | )
1268 | self.assertEqual(
1269 | parsed.parsed_result.get_metadatas("TestMixedField1", strategy=["meta"])[
1270 | "meta"
1271 | ],
1272 | ["meta:TestMixedField1"],
1273 | )
1274 | self.assertEqual(
1275 | parsed.parsed_result.get_metadatas("TestMixedField1", strategy="all"),
1276 | {
1277 | "meta": ["meta:TestMixedField1"],
1278 | "dc": [{"content": "dc:TestMixedField1"}],
1279 | },
1280 | )
1281 | self.assertEqual(
1282 | parsed.parsed_result.get_metadatas(
1283 | "TestMixedField1", strategy=["dc"], encoder=encoder_capitalizer
1284 | )["dc"],
1285 | [{"CONTENT": "DC:TESTMIXEDFIELD1"}],
1286 | )
1287 | self.assertEqual(
1288 | parsed.parsed_result.get_metadatas(
1289 | "TestMixedField1", strategy=["meta"], encoder=encoder_capitalizer
1290 | )["meta"],
1291 | ["META:TESTMIXEDFIELD1"],
1292 | )
1293 | self.assertEqual(
1294 | parsed.parsed_result.get_metadatas(
1295 | "TestMixedField1", strategy="all", encoder=encoder_capitalizer
1296 | ),
1297 | {
1298 | "meta": ["META:TESTMIXEDFIELD1"],
1299 | "dc": [{"CONTENT": "DC:TESTMIXEDFIELD1"}],
1300 | },
1301 | )
1302 |
1303 | def test__TestMixedField2(self):
1304 | parsed = self._MakeOneParsed()
1305 | # TestMixedField2
1306 | self.assertEqual(
1307 | parsed.parsed_result.get_metadatas("TestMixedField2", strategy=["dc"])[
1308 | "dc"
1309 | ][0],
1310 | {"content": "dc:TestMixedField2"},
1311 | # {"con[45 chars]dc"},
1312 | )
1313 | self.assertEqual(
1314 | parsed.parsed_result.get_metadatas("TestMixedField2", strategy=["meta"])[
1315 | "meta"
1316 | ][0],
1317 | "meta:TestMixedField2",
1318 | )
1319 | self.assertEqual(
1320 | parsed.parsed_result.get_metadatas("TestMixedField2", strategy="all"),
1321 | {
1322 | "dc": [
1323 | {"content": "dc:TestMixedField2"},
1324 | {"content": "dc:TestMixedField2.ddc", "scheme": "ddc"},
1325 | ],
1326 | "meta": ["meta:TestMixedField2"],
1327 | },
1328 | )
1329 | self.assertEqual(
1330 | parsed.parsed_result.get_metadatas(
1331 | "TestMixedField2", strategy=["dc"], encoder=encoder_capitalizer
1332 | )["dc"][0],
1333 | {"CONTENT": "DC:TESTMIXEDFIELD2"},
1334 | )
1335 | self.assertEqual(
1336 | parsed.parsed_result.get_metadatas(
1337 | "TestMixedField2", strategy=["meta"], encoder=encoder_capitalizer
1338 | )["meta"][0],
1339 | "META:TESTMIXEDFIELD2",
1340 | )
1341 | self.assertEqual(
1342 | parsed.parsed_result.get_metadatas(
1343 | "TestMixedField2", strategy="all", encoder=encoder_capitalizer
1344 | ),
1345 | {
1346 | "dc": [
1347 | {"CONTENT": "DC:TESTMIXEDFIELD2"},
1348 | {"CONTENT": "DC:TESTMIXEDFIELD2.DDC", "SCHEME": "DDC"},
1349 | ],
1350 | "meta": ["META:TESTMIXEDFIELD2"],
1351 | },
1352 | )
1353 | self.assertEqual(
1354 | parsed.parsed_result.get_metadatas("TestMixedField2", strategy=["dc"])[
1355 | "dc"
1356 | ],
1357 | [
1358 | {"content": "dc:TestMixedField2"},
1359 | {"content": "dc:TestMixedField2.ddc", "scheme": "ddc"},
1360 | ],
1361 | )
1362 | self.assertEqual(
1363 | parsed.parsed_result.get_metadatas("TestMixedField2", strategy=["meta"])[
1364 | "meta"
1365 | ],
1366 | ["meta:TestMixedField2"],
1367 | )
1368 | self.assertEqual(
1369 | parsed.parsed_result.get_metadatas("TestMixedField2", strategy="all"),
1370 | {
1371 | "meta": ["meta:TestMixedField2"],
1372 | "dc": [
1373 | {"content": "dc:TestMixedField2"},
1374 | {"content": "dc:TestMixedField2.ddc", "scheme": "ddc"},
1375 | ],
1376 | },
1377 | )
1378 | self.assertEqual(
1379 | parsed.parsed_result.get_metadatas(
1380 | "TestMixedField2", strategy=["dc"], encoder=encoder_capitalizer
1381 | )["dc"],
1382 | [
1383 | {"CONTENT": "DC:TESTMIXEDFIELD2"},
1384 | {"CONTENT": "DC:TESTMIXEDFIELD2.DDC", "SCHEME": "DDC"},
1385 | ],
1386 | )
1387 | self.assertEqual(
1388 | parsed.parsed_result.get_metadatas(
1389 | "TestMixedField2", strategy=["meta"], encoder=encoder_capitalizer
1390 | )["meta"],
1391 | ["META:TESTMIXEDFIELD2"],
1392 | )
1393 | self.assertEqual(
1394 | parsed.parsed_result.get_metadatas(
1395 | "TestMixedField2", strategy="all", encoder=encoder_capitalizer
1396 | ),
1397 | {
1398 | "meta": ["META:TESTMIXEDFIELD2"],
1399 | "dc": [
1400 | {"CONTENT": "DC:TESTMIXEDFIELD2"},
1401 | {"CONTENT": "DC:TESTMIXEDFIELD2.DDC", "SCHEME": "DDC"},
1402 | ],
1403 | },
1404 | )
1405 |
1406 | def test__TestMixedField3(self):
1407 | parsed = self._MakeOneParsed()
1408 | # TestMixedField3
1409 | self.assertEqual(
1410 | parsed.parsed_result.get_metadatas("TestMixedField3", strategy=["dc"])[
1411 | "dc"
1412 | ][0],
1413 | {"content": "dc:TestMixedField3"},
1414 | )
1415 | self.assertEqual(
1416 | parsed.parsed_result.get_metadatas("TestMixedField3", strategy=["meta"])[
1417 | "meta"
1418 | ][0],
1419 | "meta:TestMixedField3",
1420 | )
1421 | self.assertEqual(
1422 | parsed.parsed_result.get_metadatas("TestMixedField3", strategy="all"),
1423 | {
1424 | "dc": [{"content": "dc:TestMixedField3"}],
1425 | "meta": ["meta:TestMixedField3"],
1426 | },
1427 | )
1428 | self.assertEqual(
1429 | parsed.parsed_result.get_metadatas(
1430 | "TestMixedField3", strategy=["dc"], encoder=encoder_capitalizer
1431 | )["dc"][0],
1432 | {"CONTENT": "DC:TESTMIXEDFIELD3"},
1433 | )
1434 | self.assertEqual(
1435 | parsed.parsed_result.get_metadatas(
1436 | "TestMixedField3", strategy=["meta"], encoder=encoder_capitalizer
1437 | )["meta"][0],
1438 | "META:TESTMIXEDFIELD3",
1439 | )
1440 | self.assertEqual(
1441 | parsed.parsed_result.get_metadatas(
1442 | "TestMixedField3", strategy="all", encoder=encoder_capitalizer
1443 | ),
1444 | {
1445 | "dc": [{"CONTENT": "DC:TESTMIXEDFIELD3"}],
1446 | "meta": ["META:TESTMIXEDFIELD3"],
1447 | },
1448 | )
1449 | self.assertEqual(
1450 | parsed.parsed_result.get_metadatas("TestMixedField3", strategy=["dc"])[
1451 | "dc"
1452 | ],
1453 | [{"content": "dc:TestMixedField3"}],
1454 | )
1455 | self.assertEqual(
1456 | parsed.parsed_result.get_metadatas("TestMixedField3", strategy=["meta"])[
1457 | "meta"
1458 | ],
1459 | ["meta:TestMixedField3"],
1460 | )
1461 | self.assertEqual(
1462 | parsed.parsed_result.get_metadatas("TestMixedField3", strategy="all"),
1463 | {
1464 | "meta": ["meta:TestMixedField3"],
1465 | "dc": [{"content": "dc:TestMixedField3"}],
1466 | },
1467 | )
1468 | self.assertEqual(
1469 | parsed.parsed_result.get_metadatas(
1470 | "TestMixedField3", strategy=["dc"], encoder=encoder_capitalizer
1471 | )["dc"],
1472 | [{"CONTENT": "DC:TESTMIXEDFIELD3"}],
1473 | )
1474 | self.assertEqual(
1475 | parsed.parsed_result.get_metadatas(
1476 | "TestMixedField3", strategy=["meta"], encoder=encoder_capitalizer
1477 | )["meta"],
1478 | ["META:TESTMIXEDFIELD3"],
1479 | )
1480 | self.assertEqual(
1481 | parsed.parsed_result.get_metadatas(
1482 | "TestMixedField3", strategy="all", encoder=encoder_capitalizer
1483 | ),
1484 | {
1485 | "meta": ["META:TESTMIXEDFIELD3"],
1486 | "dc": [{"CONTENT": "DC:TESTMIXEDFIELD3"}],
1487 | },
1488 | )
1489 |
1490 | self.assertEqual(
1491 | parsed.parsed_result.get_metadatas("news_keywords", strategy=["meta"])[
1492 | "meta"
1493 | ][0],
1494 | "",
1495 | )
1496 | self.assertEqual(
1497 | parsed.parsed_result.get_metadatas("auto-publish", strategy=["meta"])[
1498 | "meta"
1499 | ][0],
1500 | "timely",
1501 | )
1502 | self.assertEqual(
1503 | parsed.parsed_result.get_metadatas(
1504 | "article:modified_time", strategy=["meta"]
1505 | )["meta"][0],
1506 | "2017-10-11 01:01:01",
1507 | )
1508 | self.assertEqual(
1509 | parsed.parsed_result.get_metadatas(
1510 | "msapplication-tap-highlight", strategy=["meta"]
1511 | )["meta"][0],
1512 | "no",
1513 | )
1514 | self.assertEqual(
1515 | parsed.parsed_result.get_metadatas(
1516 | "google-site-verification", strategy=["meta"]
1517 | )["meta"][0],
1518 | "123123123",
1519 | )
1520 | self.assertEqual(
1521 | parsed.parsed_result.get_metadatas("twitter:data1", strategy=["meta"])[
1522 | "meta"
1523 | ][0],
1524 | "8 min read",
1525 | )
1526 | self.assertEqual(
1527 | parsed.parsed_result.get_metadatas("google", strategy=["meta"])["meta"][0],
1528 | "notranslate",
1529 | )
1530 | self.assertEqual(
1531 | parsed.parsed_result.get_metadatas("news_keywords", strategy=["meta"])[
1532 | "meta"
1533 | ][0],
1534 | "",
1535 | )
1536 | self.assertEqual(
1537 | parsed.parsed_result.get_metadatas("viewport", strategy=["meta"])["meta"],
1538 | [
1539 | "width=device-width,initial-scale=1,maximum-scale=1,user-scalable=no",
1540 | "width=device-width, initial-scale=1, maximum-scale=1",
1541 | ],
1542 | )
1543 | self.assertEqual(
1544 | parsed.parsed_result.get_metadatas("thumbnail", strategy=["meta"])["meta"][
1545 | 0
1546 | ],
1547 | "https://example.com/path/to/image.jpg",
1548 | )
1549 | self.assertEqual(
1550 | parsed.get_metadata_link("thumbnail", strategy=["meta"]),
1551 | "https://example.com/path/to/image.jpg",
1552 | )
1553 | self.assertEqual(
1554 | parsed.parsed_result.get_metadatas("thumbnail-2", strategy=["meta"])[
1555 | "meta"
1556 | ][0],
1557 | "//example.com/path/to/image.jpg",
1558 | )
1559 | self.assertEqual(
1560 | parsed.get_metadata_link("thumbnail-2", strategy=["meta"]), None
1561 | )
1562 | self.assertEqual(
1563 | parsed.parsed_result.get_metadatas("thumbnail-3", strategy=["meta"])[
1564 | "meta"
1565 | ][0],
1566 | "/path/to/image.jpg",
1567 | )
1568 | self.assertEqual(
1569 | parsed.get_metadata_link("thumbnail-3", strategy=["meta"]), None
1570 | )
1571 |
1572 | def test__canonical(self):
1573 | parsed = self._MakeOneParsed()
1574 | # this should error!
1575 | with self.assertRaises(InvalidStrategy) as cm:
1576 | parsed.parsed_result.get_metadatas("canonical", strategy=["all"])
1577 | self.assertEqual(
1578 | cm.exception.args[0],
1579 | 'Submit "all" as a `str`, not in a `list`.',
1580 | )
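        # taken together with the InvalidStrategy test above: `strategy` accepts
        # either the literal string "all" or a list of individual section names
        # ("meta", "page", "og", "dc", "twitter"); "all" inside a list is rejected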
1581 |
1582 | # ok, now test the return types
1583 |         # some behavior was changed in the 0.7 release
1584 |
1585 | # get_metadatas - single section
1586 | self.assertEqual(
1587 | parsed.parsed_result.get_metadatas("canonical", strategy=["page"])["page"][
1588 | 0
1589 | ],
1590 | "http://example.com/meta/rel=canonical",
1591 | )
1592 | self.assertEqual(
1593 | parsed.parsed_result.get_metadatas("canonical", strategy=["meta"]),
1594 | None,
1595 | )
1596 | self.assertEqual(
1597 | parsed.parsed_result.get_metadatas("canonical", strategy="all"),
1598 | {"page": ["http://example.com/meta/rel=canonical"]},
1599 | )
1600 |
1601 | # get_metadatas - single section
1602 | self.assertEqual(
1603 | parsed.parsed_result.get_metadatas("canonical", strategy=["page"])["page"],
1604 | ["http://example.com/meta/rel=canonical"],
1605 | )
1606 | self.assertEqual(
1607 | parsed.parsed_result.get_metadatas("canonical", strategy=["meta"]),
1608 | None,
1609 | )
1610 | self.assertEqual(
1611 | parsed.parsed_result.get_metadatas("canonical", strategy="all"),
1612 | {"page": ["http://example.com/meta/rel=canonical"]},
1613 | )
1614 |
1615 | def test__description(self):
1616 | parsed = self._MakeOneParsed()
1617 | # get_metadatas - multiple section
1618 | self.assertEqual(
1619 | parsed.parsed_result.get_metadatas("description", strategy=["meta"])[
1620 | "meta"
1621 | ][0],
1622 | "meta.description",
1623 | )
1624 | self.assertEqual(
1625 | parsed.parsed_result.get_metadatas("description", strategy="all"),
1626 | {
1627 | "og": ["meta.property=og:description"],
1628 | "meta": ["meta.description"],
1629 | "twitter": ["meta.name=twitter:description"],
1630 | },
1631 | )
1632 | # get_metadatas - multiple section
1633 | self.assertEqual(
1634 | parsed.parsed_result.get_metadatas("description", strategy=["meta"])[
1635 | "meta"
1636 | ],
1637 | ["meta.description"],
1638 | )
1639 | self.assertEqual(
1640 | parsed.parsed_result.get_metadatas("description", strategy="all"),
1641 | {
1642 | "og": ["meta.property=og:description"],
1643 | "meta": ["meta.description"],
1644 | "twitter": ["meta.name=twitter:description"],
1645 | },
1646 | )
1647 |
1648 | def test__keywords(self):
1649 | parsed = self._MakeOneParsed()
1650 | # multiple candidates!
1651 | self.assertEqual(
1652 | parsed.parsed_result.get_metadatas("keywords", strategy=["meta"])["meta"][
1653 | 0
1654 | ],
1655 | "meta.keywords:1",
1656 | )
1657 | self.assertEqual(
1658 | parsed.parsed_result.get_metadatas("keywords", strategy=["meta"])["meta"],
1659 | ["meta.keywords:1", "meta.keywords:2"],
1660 | )
1661 |
1662 | def test_complex_html__encoder(self):
1663 | """
1664 |         pytest tests/test_document_parsing.py::TestDocumentParsing_Complex::test_complex_html__encoder
1665 | """
1666 | html = self._MakeOne("duplicates.html")
1667 | parsed = metadata_parser.MetadataParser(url=None, html=html)
1668 |
1669 | # Test a few things with and without encoding
1670 |
1671 | # Test A1
1672 | self.assertEqual(
1673 | parsed.parsed_result.get_metadatas("TestMixedField3", strategy=["meta"]),
1674 | {"meta": ["meta:TestMixedField3"]},
1675 | )
1676 | self.assertEqual(
1677 | parsed.parsed_result.get_metadatas(
1678 | "TestMixedField3",
1679 | strategy=["meta"],
1680 | encoder=metadata_parser.utils.encode_ascii,
1681 | ),
1682 | {"meta": ["meta:TestMixedField3"]},
1683 | )
1684 |
1685 | # Test A2 - dc only
1686 | # without an encoder, DC generates a dict
1687 | # with the encoder, DC generates a str
1688 | self.assertEqual(
1689 | parsed.parsed_result.get_metadatas("TestMixedField3", strategy=["dc"]),
1690 | {"dc": [{"content": "dc:TestMixedField3"}]},
1691 | )
1692 | self.assertEqual(
1693 | parsed.parsed_result.get_metadatas(
1694 | "TestMixedField3",
1695 | strategy=["dc"],
1696 | encoder=metadata_parser.utils.encode_ascii,
1697 | ),
1698 | {"dc": ["dc:TestMixedField3"]},
1699 | )
1700 |
1701 | # Test A3 - dc within all
1702 | # without an encoder, DC generates a dict
1703 | # with the encoder, DC generates a str
1704 | self.assertEqual(
1705 | parsed.parsed_result.get_metadatas(
1706 | "TestMixedField3",
1707 | strategy="all",
1708 | encoder=metadata_parser.utils.encode_ascii,
1709 | ),
1710 | {
1711 | "meta": ["meta:TestMixedField3"],
1712 | "dc": ["dc:TestMixedField3"],
1713 | },
1714 | )
1715 |
1716 | # Test A3 - dc + meta
1717 | # without an encoder, DC generates a dict
1718 | # with the encoder, DC generates a str
1719 | self.assertEqual(
1720 | parsed.parsed_result.get_metadatas(
1721 | "TestMixedField3",
1722 | strategy=["dc", "meta"],
1723 | encoder=metadata_parser.utils.encode_ascii,
1724 | ),
1725 | {
1726 | "meta": ["meta:TestMixedField3"],
1727 | "dc": ["dc:TestMixedField3"],
1728 | },
1729 | )
1730 |
1731 |
1732 | class TestDocumentParsing_SelectFirstMatch(unittest.TestCase, _TestDocumentParsingCore):
1733 |
1734 | def _test__shared(self, parsed: MetadataParser):
1735 | # but the data is still there...
1736 | self.assertEqual(
1737 | parsed.parsed_result.get_metadatas("keywords.order", strategy="all"),
1738 | {
1739 | "dc": [
1740 | {"content": "dc:keywords.order::1"},
1741 | {"content": "dc:keywords.order::2"},
1742 | ],
1743 | "meta": [
1744 | "meta.keywords.order::1",
1745 | "meta.keywords.order::2",
1746 | ],
1747 | "og": [
1748 | "meta.property=og:keywords.order::1",
1749 | "meta.property=og:keywords.order::2",
1750 | ],
1751 | "twitter": [
1752 | "meta.name=twitter:keywords.order::1",
1753 | "meta.name=twitter:keywords.order::2",
1754 | ],
1755 | },
1756 | )
1757 |
1758 | # all gets meta first
1759 | self.assertEqual(
1760 | parsed.parsed_result.select_first_match("keywords.order", strategy="all"),
1761 | "meta.keywords.order::1",
1762 | )
1763 |
1764 | # only look in: meta
1765 | self.assertEqual(
1766 | parsed.parsed_result.select_first_match(
1767 | "keywords.order", strategy=["meta"]
1768 | ),
1769 | "meta.keywords.order::1",
1770 | )
1771 | # only look in: page
1772 | self.assertEqual(
1773 | parsed.parsed_result.select_first_match(
1774 | "keywords.order", strategy=["page"]
1775 | ),
1776 | None,
1777 | )
1778 | # only look in: dc
1779 | self.assertEqual(
1780 | parsed.parsed_result.select_first_match("keywords.order", strategy=["dc"]),
1781 | "dc:keywords.order::1",
1782 | )
1783 | # only look in: og
1784 | self.assertEqual(
1785 | parsed.parsed_result.select_first_match("keywords.order", strategy=["og"]),
1786 | "meta.property=og:keywords.order::1",
1787 | )
1788 | # only look in: twitter
1789 | self.assertEqual(
1790 | parsed.parsed_result.select_first_match(
1791 | "keywords.order", strategy=["twitter"]
1792 | ),
1793 | "meta.name=twitter:keywords.order::1",
1794 | )
1795 |
1796 | def test__basic(self):
1797 | parsed = self._MakeOneParsed()
1798 | self._test__shared(parsed)
1799 |
1800 | # multiple candidates!
1801 | self.assertEqual(
1802 | parsed.parsed_result.get_metadatas("keywords.order"),
1803 | {
1804 | "dc": [
1805 | {"content": "dc:keywords.order::1"},
1806 | {"content": "dc:keywords.order::2"},
1807 | ],
1808 | "meta": [
1809 | "meta.keywords.order::1",
1810 | "meta.keywords.order::2",
1811 | ],
1812 | "og": [
1813 | "meta.property=og:keywords.order::1",
1814 | "meta.property=og:keywords.order::2",
1815 | ],
1816 | "twitter": [
1817 | "meta.name=twitter:keywords.order::1",
1818 | "meta.name=twitter:keywords.order::2",
1819 | ],
1820 | },
1821 | )
1822 |
1823 | # default gets meta first
1824 | self.assertEqual(
1825 | parsed.parsed_result.select_first_match("keywords.order"),
1826 | "meta.keywords.order::1",
1827 | )
1828 |
1829 | def test__all(self):
1830 | parsed = self._MakeOneParsed(strategy="all")
1831 | self._test__shared(parsed)
1832 |
1833 | # multiple candidates!
1834 | self.assertEqual(
1835 | parsed.parsed_result.get_metadatas("keywords.order"),
1836 | {
1837 | "dc": [
1838 | {"content": "dc:keywords.order::1"},
1839 | {"content": "dc:keywords.order::2"},
1840 | ],
1841 | "meta": [
1842 | "meta.keywords.order::1",
1843 | "meta.keywords.order::2",
1844 | ],
1845 | "og": [
1846 | "meta.property=og:keywords.order::1",
1847 | "meta.property=og:keywords.order::2",
1848 | ],
1849 | "twitter": [
1850 | "meta.name=twitter:keywords.order::1",
1851 | "meta.name=twitter:keywords.order::2",
1852 | ],
1853 | },
1854 | )
1855 |
1856 | # default gets meta first
1857 | self.assertEqual(
1858 | parsed.parsed_result.select_first_match("keywords.order"),
1859 | "meta.keywords.order::1",
1860 | )
1861 |
1862 | def test__meta(self):
1863 | parsed = self._MakeOneParsed(strategy=["meta"])
1864 | self._test__shared(parsed)
1865 |
1866 | # multiple candidates!
1867 | # only shows the meta, because of the init
1868 | self.assertEqual(
1869 | parsed.parsed_result.get_metadatas("keywords.order"),
1870 | {"meta": ["meta.keywords.order::1", "meta.keywords.order::2"]},
1871 | )
1872 |
1873 | # default gets meta first
1874 | self.assertEqual(
1875 | parsed.parsed_result.select_first_match("keywords.order"),
1876 | "meta.keywords.order::1",
1877 | )
1878 |
1879 | def test__reversed(self):
1880 | parsed = self._MakeOneParsed(strategy=["twitter", "dc", "og", "page", "meta"])
1881 |
1882 | self._test__shared(parsed)
1883 |
1884 | # default gets TWITTER first
1885 | self.assertEqual(
1886 | parsed.parsed_result.select_first_match("keywords.order"),
1887 | "meta.name=twitter:keywords.order::1",
1888 | )
1889 |
1890 |
1891 | class Test_UrlParserCacheable(unittest.TestCase):
1892 | """
1893 |     python -m unittest tests.test_document_parsing.Test_UrlParserCacheable
1894 | """
1895 |
1896 | def test__default(self):
1897 | """MetadataParser()"""
1898 | parsed, errors = _docs_test_parser(
1899 | [
1900 | "good-canonical-relative",
1901 | "good-canonical-relative_alt",
1902 | "good-og-relative_alt",
1903 | ],
1904 | "*no-kwarg",
1905 | )
1906 | if errors:
1907 | raise ValueError(errors)
1908 |
1909 | def test__True(self):
1910 | """MetadataParser(cached_urlparser=True)"""
1911 | parsed, errors = _docs_test_parser(
1912 | [
1913 | "good-canonical-relative",
1914 | "good-canonical-relative_alt",
1915 | "good-og-relative_alt",
1916 | ],
1917 | True,
1918 | )
1919 | if errors:
1920 | raise ValueError(errors)
1921 |
1922 | def test__Int_1(self):
1923 | """MetadataParser(cached_urlparser=1)"""
1924 | # this should fail
1925 | with self.assertRaises(ValueError) as cm:
1926 | parsed, errors = _docs_test_parser(
1927 | [
1928 | "good-canonical-relative",
1929 | "good-canonical-relative_alt",
1930 | "good-og-relative_alt",
1931 | ],
1932 | 1,
1933 | )
1934 | if errors:
1935 | raise ValueError(errors)
1936 | assert isinstance(cm.exception, ValueError)
1937 | assert cm.exception.args[0] == "`cached_urlparser` must be a callable"
1938 |
1939 | def test__Int_0(self):
1940 | """MetadataParser(cached_urlparser=0)"""
1941 | parsed, errors = _docs_test_parser(
1942 | [
1943 | "good-canonical-relative",
1944 | "good-canonical-relative_alt",
1945 | "good-og-relative_alt",
1946 | ],
1947 | 0,
1948 | )
1949 | if errors:
1950 | raise ValueError(errors)
1951 | # equivalent to `cached_urlparser=False`
1952 | assert parsed.urlparse is urlparse
1953 |
1954 | def test__None(self):
1955 | parsed, errors = _docs_test_parser(
1956 | [
1957 | "good-canonical-relative",
1958 | "good-canonical-relative_alt",
1959 | "good-og-relative_alt",
1960 | ],
1961 | None,
1962 | )
1963 | if errors:
1964 | raise ValueError(errors)
1965 |
1966 | def test__False(self):
1967 | parsed, errors = _docs_test_parser(
1968 | [
1969 | "good-canonical-relative",
1970 | "good-canonical-relative_alt",
1971 | "good-og-relative_alt",
1972 | ],
1973 | False,
1974 | )
1975 | if errors:
1976 | raise ValueError(errors)
1977 |
1978 | def test__CustomParser(self):
1979 | custom_parser_obj = metadata_parser.UrlParserCacheable()
1980 | custom_parser = custom_parser_obj.urlparse
1981 | parsed, errors = _docs_test_parser(
1982 | [
1983 | "good-canonical-relative",
1984 | "good-canonical-relative_alt",
1985 | "good-og-relative_alt",
1986 | ],
1987 | custom_parser,
1988 | )
1989 | if errors:
1990 | raise ValueError(errors)
1991 |
1992 |
1993 | class Test_UrlParserCacheable_MaxItems(unittest.TestCase):
1994 |
1995 | def test__default(self):
1996 | """MetadataParser()"""
1997 | parsed, errors = _docs_test_parser(
1998 | [
1999 | "good-canonical-relative",
2000 | "good-canonical-relative_alt",
2001 | "good-og-relative_alt",
2002 | ],
2003 | "*no-kwarg",
2004 | cached_urlparser_maxitems=1,
2005 | )
2006 | if errors:
2007 | raise ValueError(errors)
2008 |
2009 | def test__True(self):
2010 |         # this should work: `cached_urlparser=True` is compatible with `cached_urlparser_maxitems`
2011 | parsed, errors = _docs_test_parser(
2012 | [
2013 | "good-canonical-relative",
2014 | "good-canonical-relative_alt",
2015 | "good-og-relative_alt",
2016 | ],
2017 | True,
2018 | cached_urlparser_maxitems=1,
2019 | )
2020 | if errors:
2021 | raise ValueError(errors)
2022 |
2023 | def test__False(self):
2024 | # this should fail
2025 | with self.assertRaises(ValueError) as cm:
2026 | parsed, errors = _docs_test_parser(
2027 | [
2028 | "good-canonical-relative",
2029 | "good-canonical-relative_alt",
2030 | "good-og-relative_alt",
2031 | ],
2032 | False,
2033 | cached_urlparser_maxitems=1,
2034 | )
2035 | if errors:
2036 | raise ValueError(errors)
2037 | assert isinstance(cm.exception, ValueError)
2038 | assert (
2039 | cm.exception.args[0]
2040 | == "`cached_urlparser_maxitems` requires `cached_urlparser=True`"
2041 | )
2042 |
2043 | def test__Int_1(self):
2044 | # this should fail
2045 | with self.assertRaises(ValueError) as cm:
2046 | parsed, errors = _docs_test_parser(
2047 | [
2048 | "good-canonical-relative",
2049 | "good-canonical-relative_alt",
2050 | "good-og-relative_alt",
2051 | ],
2052 | 1,
2053 | cached_urlparser_maxitems=1,
2054 | )
2055 | if errors:
2056 | raise ValueError(errors)
2057 | assert isinstance(cm.exception, ValueError)
2058 | assert (
2059 | cm.exception.args[0]
2060 | == "`cached_urlparser_maxitems` requires `cached_urlparser=True`"
2061 | )
2062 |
2063 | def test__Int_0(self):
2064 | """MetadataParser(cached_urlparser=0)"""
2065 | # this should fail
2066 | with self.assertRaises(ValueError) as cm:
2067 | parsed, errors = _docs_test_parser(
2068 | [
2069 | "good-canonical-relative",
2070 | "good-canonical-relative_alt",
2071 | "good-og-relative_alt",
2072 | ],
2073 | 0,
2074 | cached_urlparser_maxitems=1,
2075 | )
2076 | if errors:
2077 | raise ValueError(errors)
2078 | assert isinstance(cm.exception, ValueError)
2079 | assert (
2080 | cm.exception.args[0]
2081 | == "`cached_urlparser_maxitems` requires `cached_urlparser=True`"
2082 | )
2083 |
2084 | def test__None(self):
2085 | # this should fail
2086 | with self.assertRaises(ValueError) as cm:
2087 | parsed, errors = _docs_test_parser(
2088 | [
2089 | "good-canonical-relative",
2090 | "good-canonical-relative_alt",
2091 | "good-og-relative_alt",
2092 | ],
2093 | None,
2094 | cached_urlparser_maxitems=1,
2095 | )
2096 | if errors:
2097 | raise ValueError(errors)
2098 | assert isinstance(cm.exception, ValueError)
2099 | assert (
2100 | cm.exception.args[0]
2101 | == "`cached_urlparser_maxitems` requires `cached_urlparser=True`"
2102 | )
2103 |
2104 | def test__CustomParser(self):
2105 | # this should fail
2106 | custom_parser_obj = metadata_parser.UrlParserCacheable()
2107 | custom_parser = custom_parser_obj.urlparse
2108 | with self.assertRaises(ValueError) as cm:
2109 | parsed, errors = _docs_test_parser(
2110 | [
2111 | "good-canonical-relative",
2112 | "good-canonical-relative_alt",
2113 | "good-og-relative_alt",
2114 | ],
2115 | custom_parser,
2116 | cached_urlparser_maxitems=1,
2117 | )
2118 | if errors:
2119 | raise ValueError(errors)
2120 | assert isinstance(cm.exception, ValueError)
2121 | assert (
2122 | cm.exception.args[0]
2123 | == "`cached_urlparser_maxitems` requires `cached_urlparser=True`"
2124 | )
2125 |
--------------------------------------------------------------------------------
/tests/test_ip_tracking.py:
--------------------------------------------------------------------------------
1 | # stdlib
2 | import unittest
3 |
4 | # local
5 | import metadata_parser
6 |
7 | # ==============================================================================
8 |
9 |
10 | class TestIpLookups(unittest.TestCase):
11 | """"""
12 |
13 | def test_ip_lookup(self):
14 | """
15 | this is using the live internet
16 |
17 | todo: use httpbin
18 | """
19 | url = "https://example.com/"
20 | page = metadata_parser.MetadataParser(url=url)
21 | self.assertTrue(page.peername)
22 |
--------------------------------------------------------------------------------
/tests/test_responses.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # stdlib
4 | import unittest
5 |
6 | # pypi
7 | import requests
8 | import responses
9 |
10 | # local
11 | from metadata_parser import derive_encoding__hook
12 |
13 | # ==============================================================================
14 |
15 |
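# each dict below maps a URL to a 3-tuple:
#   (the charset declared in the header or meta tag, or None,
#    the encoding `requests` is expected to derive,
#    a non-ASCII character to round-trip through the body)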
16 | URLS_HEADER = {
17 | "https://example.com/header=none": (None, "ISO-8859-1", "♥"),
18 | "https://example.com/header=ISO-8859-1": ("ISO-8859-1", "ISO-8859-1", "♥"),
19 | "https://example.com/header=utf-8": ("utf-8", "utf-8", "♥"),
20 | "https://example.com/header=UTF-8": ("UTF-8", "UTF-8", "♥"),
21 | }
22 | URLS_META = {
23 | "https://example.com/content_type=none": (None, "ISO-8859-1", "♥"),
24 | "https://example.com/content_type=ISO-8859-1": (
25 | "ISO-8859-1",
26 | "ISO-8859-1",
27 | "♥",
28 | ),
29 | "https://example.com/content_type=utf-8": ("utf-8", "utf-8", "♥"),
30 | "https://example.com/content_type=UTF-8": ("UTF-8", "UTF-8", "♥"),
31 | }
32 |
33 |
34 | class TestMockedResponse(unittest.TestCase):
35 | def test_simple_encoding_found(self):
36 | """these tests just check to see we derive the right content with `derive_encoding__hook`"""
37 |
38 | requests_session = requests.Session()
39 | requests_session.hooks["response"].append(derive_encoding__hook)
40 |
41 | with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
42 | # track results to this
43 | to_test = {}
44 |
45 | # set up the header tests
46 | for url in URLS_HEADER.keys():
47 | (_header, _expected, _body_char) = URLS_HEADER[url]
48 | _content_type = "text/html"
49 | if _header:
50 | _content_type = "text/html; charset=%s" % _header
51 | _body = "%s" % _body_char
52 | rsps.add(
53 | responses.GET,
54 | url,
55 | body=_body,
56 | status=200,
57 | content_type=_content_type,
58 | )
59 | to_test[url] = (_expected, _body)
60 |
61 | # set up the meta tests
62 | for url in URLS_META.keys():
63 | (_header, _expected, _body_char) = URLS_META[url]
64 | _body = "%s" % _body_char
65 | if _header:
66 |                 _body = (
67 |                     '<html><head><meta http-equiv="Content-Type" content="text/html; charset=%s"/></head><body>%s</body></html>'
68 |                     % (_header, _body_char)
69 |                 )
70 | rsps.add(
71 | responses.GET, url, body=_body, status=200, content_type="text/html"
72 | )
73 | to_test[url] = (_expected, _body)
74 |
75 | for url in to_test:
76 | (_expected, _body) = to_test[url]
77 | r = requests_session.get(url)
78 | self.assertEqual(r.status_code, 200)
79 | self.assertEqual(r.encoding, _expected)
80 | self.assertEqual(r.text, _body)
81 |
--------------------------------------------------------------------------------
/tests/test_sessions.py:
--------------------------------------------------------------------------------
1 | # stdlib
2 | from typing import Optional
3 | import unittest
4 |
5 | # pypi
6 | from httpbin import app as httpbin_app
7 | import pytest_httpbin.serve
8 | import requests
9 |
10 | # local
11 | import metadata_parser
12 |
13 | # ==============================================================================
14 |
15 |
16 | class SessionRedirect(requests.Session):
17 | num_checked = None
18 |
19 | def get_redirect_target(self, resp):
20 | # previous versions cached this for later use, but now we use a hook
21 | # cached_peername = metadata_parser.get_response_peername(resp)
22 | def _get():
23 | if self.num_checked is None:
24 | self.num_checked = 0
25 | self.num_checked += 1
26 | if resp.is_redirect:
27 | return resp.headers["location"]
28 | if resp.status_code == 200:
29 | # some servers will do a 200 but put a redirect header in there. WTF
30 | dumb_redirect = resp.headers.get("location")
31 | if dumb_redirect:
32 | return dumb_redirect
33 | return None
34 |
35 | # --
36 | if not hasattr(resp, "_redirect_target"):
37 | resp._redirect_target = _get()
38 | return resp._redirect_target
39 |
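# a minimal sketch of the hook-based approach referenced in the comment above,
# assuming `metadata_parser.response_peername__hook` is the public hook name
# (MetadataParser attaches it automatically; shown here only for illustration):
#
#   sess = requests.Session()
#   sess.hooks["response"].append(metadata_parser.response_peername__hook)
#   resp = sess.get("https://example.com/")
#   peername = metadata_parser.get_response_peername(resp)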
40 |
41 | class TestSessionsHttpBin(unittest.TestCase):
42 | def setUp(self):
43 | self.httpbin_server = pytest_httpbin.serve.Server(application=httpbin_app)
44 | self.httpbin_server.start()
45 |
46 | def tearDown(self):
47 | self.httpbin_server.stop()
48 | try:
49 | # we're not invoking `pytest_httpbin.serve.Server` in the standard way
50 | # our implementation was copied off another project
51 | # the `_server` is a wsgiref server, and in Py3 simply calling
52 |             # `stop()` will shut down the server, but it will not `close()` any
53 | # lingering sockets. this explicitly does that.
54 | self.httpbin_server._server.socket.close()
55 | except Exception as exc: # noqa: F841
56 | pass
57 |
58 | def test_no_session(self):
59 | """just checking for args"""
60 | url = self.httpbin_server.url + "/html"
61 | page = metadata_parser.MetadataParser(url=url)
62 | assert page
63 | assert page.url == url
64 |
65 | def test_simple_session(self):
66 | """just checking for args"""
67 | url = self.httpbin_server.url + "/html"
68 | with requests.Session() as s:
69 | page = metadata_parser.MetadataParser(url=url, requests_session=s)
70 | assert page
71 | assert page.url == url
72 |
73 | def test_custom_session(self):
74 | """just checking for a custom session"""
75 | num_redirects = 4
76 | url = self.httpbin_server.url + "/redirect/%s" % num_redirects
77 | with SessionRedirect() as s:
78 | page: Optional[metadata_parser.MetadataParser]
79 | try:
80 | page = metadata_parser.MetadataParser(url=url, requests_session=s)
81 | except metadata_parser.NotParsableJson as e:
82 | page = e.metadataParser
83 | # typing scope
84 | assert page is not None
85 | assert page.response is not None
86 | # we end on get
87 | self.assertEqual(page.response.url, self.httpbin_server.url + "/get")
88 | # the session should have checked the following responses: redirects + final
89 | self.assertEqual(num_redirects + 1, s.num_checked)
90 | self.assertEqual(num_redirects, len(page.response.history))
91 |
92 | # make sure that we tracked the peername. httpbin will encode
93 | self.assertTrue(metadata_parser.get_response_peername(page.response))
94 | for h in page.response.history:
95 | self.assertTrue(metadata_parser.get_response_peername(h))
96 |
--------------------------------------------------------------------------------
/tests/test_url_parsing.py:
--------------------------------------------------------------------------------
1 | # -*- coding=utf-8 -*-
2 |
3 | # stdlib
4 | import unittest
5 | from urllib.parse import ParseResult
6 | from urllib.parse import ParseResultBytes
7 | from urllib.parse import urlparse
8 |
9 | # local
10 | import metadata_parser
11 |
12 | # ==============================================================================
13 |
14 |
15 | URLS_VALID = [
16 | "http://example.com",
17 | "http://example.com/",
18 | "http://example.com/one",
19 | "http://example.com/one/two.html",
20 | "http://foo.example.com",
21 | "http://example.com:80",
22 | "http://example.com:80/",
23 | "http://example.com:80/one",
24 | "http://example.com:80/one/two.html",
25 | "http://192.168.1.1",
26 | "http://192.168.1.1/",
27 | "http://192.168.1.1:80",
28 | "http://192.168.1.1:8080",
29 | "http://192.168.1.1:80/",
30 | "http://192.168.1.1:8080/",
31 | "http://192.168.1.1:80/a.html",
32 | "http://192.168.1.1:8080/a.html",
33 | "https://example.com",
34 | "https://example.com/",
35 | "https://example.com/one",
36 | "https://example.com/one/two.html",
37 | "https://foo.example.com",
38 | "https://example.com:80",
39 | "https://example.com:80/",
40 | "https://example.com:80/one",
41 | "https://example.com:80/one/two.html",
42 | "https://192.168.1.1",
43 | "https://192.168.1.1/",
44 | "https://192.168.1.1:80",
45 | "https://192.168.1.1:8080",
46 | "https://192.168.1.1:80/",
47 | "https://192.168.1.1:8080/",
48 | "https://192.168.1.1:80/a.html",
49 | "https://192.168.1.1:8080/a.html",
50 | ]
51 |
52 | URLS_VALID_CONDITIONAL = [
53 | "http://localhost",
54 | "http://localhost:80",
55 | "http://localhost:8000",
56 | "http://localhost/foo",
57 | "http://localhost:80/foo",
58 | "http://localhost:8000/foo",
59 | "https://localhost",
60 | "https://localhost:80",
61 | "https://localhost:8000",
62 | "https://localhost/foo",
63 | "https://localhost:80/foo",
64 | "https://localhost:8000/foo",
65 | "http://127.0.0.1",
66 | "http://127.0.0.1:80",
67 | "http://127.0.0.1:8000",
68 | "http://127.0.0.1/foo",
69 | "http://127.0.0.1:80/foo",
70 | "http://127.0.0.1:8000/foo",
71 | "https://127.0.0.1",
72 | "https://127.0.0.1:80",
73 | "https://127.0.0.1:8000",
74 | "https://127.0.0.1/foo",
75 | "https://127.0.0.1:80/foo",
76 | "https://127.0.0.1:8000/foo",
77 | "http://0.0.0.0",
78 | "http://0.0.0.0:80",
79 | "http://0.0.0.0:8000",
80 | "http://0.0.0.0/foo",
81 | "http://0.0.0.0:80/foo",
82 | "http://0.0.0.0:8000/foo",
83 | "https://0.0.0.0",
84 | "https://0.0.0.0:80",
85 | "https://0.0.0.0:8000",
86 | "https://0.0.0.0/foo",
87 | "https://0.0.0.0:80/foo",
88 | "https://0.0.0.0:8000/foo",
89 | ]
90 |
91 | URLS_INVALID = [
92 | "http://example_com",
93 | "http://example_com/",
94 | "http://example_com/one",
95 | "http://999.999.999.999/",
96 | "http://999.999.999.999.999/",
97 | "http://999.999.999.999.999:8080:8080",
98 | "https://example_com",
99 | "https://example_com/",
100 | "https://example_com/one",
101 | "https://999.999.999.999/",
102 | "https://999.999.999.999.999/",
103 | "https://999.999.999.999.999:8080:8080",
104 | ]
105 |
106 |
107 | RFC_REGEX_VALID = [
108 | """http://user:password@one.example.com/foo/bar;one=two&three=four?foo=bar&biz=bash#foo"""
109 | ]
110 |
111 | RFC_REGEX_INVALID = ["""
Then l""", """ccurl" style="display:none;" """] 112 | 113 | 114 | class TestUrlRfcValid(unittest.TestCase): 115 | """ 116 | python -m unittest tests.url_parsing.TestUrlRfcValid 117 | 118 | Ensures URLs contain rfc valid components 119 | """ 120 | 121 | def test_urls_valid(self): 122 | for i in RFC_REGEX_VALID: 123 | matched = metadata_parser.RE_rfc3986_valid_characters.match(i) 124 | self.assertTrue(matched) 125 | 126 | def test_urls_invalid(self): 127 | for i in RFC_REGEX_INVALID: 128 | matched = metadata_parser.RE_rfc3986_valid_characters.match(i) 129 | self.assertTrue(matched is None) 130 | 131 | 132 | class TestUrlParsing(unittest.TestCase): 133 | """ 134 | python -m unittest tests.url_parsing.TestUrls 135 | 136 | Ensures URLs are parsed correctly as valid/invalid 137 | """ 138 | 139 | def test_urls_valid(self): 140 | for i in URLS_VALID: 141 | parsed = urlparse(i) 142 | self.assertTrue(metadata_parser.is_parsed_valid_url(parsed)) 143 | 144 | def test_urls_invalid(self): 145 | for i in URLS_INVALID: 146 | parsed = urlparse(i) 147 | self.assertFalse(metadata_parser.is_parsed_valid_url(parsed)) 148 | 149 | def test_urls_valid_conditional(self): 150 | for i in URLS_VALID_CONDITIONAL: 151 | parsed = urlparse(i) 152 | self.assertFalse( 153 | metadata_parser.is_parsed_valid_url( 154 | parsed, require_public_netloc=True, allow_localhosts=False 155 | ) 156 | ) 157 | self.assertTrue( 158 | metadata_parser.is_parsed_valid_url( 159 | parsed, require_public_netloc=False, allow_localhosts=True 160 | ) 161 | ) 162 | 163 | 164 | class TestAbsoluteUpgrades(unittest.TestCase): 165 | """ 166 | python -m unittest tests.url_parsing.TestAbsoluteUpgrades 167 | 168 | Ensures URLs are parsed correctly as valid/invalid 169 | """ 170 | 171 | def test_none_returns_none(self): 172 | absolute = metadata_parser.url_to_absolute_url(None, url_fallback=None) 173 | self.assertEqual(absolute, None) 174 | 175 | def test_nothing(self): 176 | absolute = metadata_parser.url_to_absolute_url( 177 | "http://example.com", url_fallback="http://example.com" 178 | ) 179 | self.assertEqual(absolute, "http://example.com") 180 | 181 | def test_upgrade(self): 182 | absolute = metadata_parser.url_to_absolute_url( 183 | "a.html", url_fallback="http://example.com" 184 | ) 185 | self.assertEqual(absolute, "http://example.com/a.html") 186 | 187 | def test_fallback(self): 188 | absolute = metadata_parser.url_to_absolute_url( 189 | None, url_fallback="http://example.com" 190 | ) 191 | self.assertEqual(absolute, "http://example.com") 192 | 193 | 194 | class _DocumentCanonicalsMixin(object): 195 | def _MakeOne(self, url): 196 | """generates a canonical document""" 197 | doc_base = """
%(head)s""" 198 | canonical_base = """""" 199 | _canonical_html = canonical_base % {"canonical": url} 200 | _doc_html = doc_base % {"head": _canonical_html} 201 | return _doc_html 202 | 203 | 204 | class TestDocumentCanonicals(unittest.TestCase, _DocumentCanonicalsMixin): 205 | """ 206 | python -m unittest tests.url_parsing.TestDocumentCanonicals 207 | """ 208 | 209 | def test_canonical_simple(self): 210 | """someone did their job""" 211 | url = None 212 | rel_canonical = "https://example.com/canonical" 213 | rel_expected = "https://example.com/canonical" 214 | html_doc = self._MakeOne(rel_canonical) 215 | parsed = metadata_parser.MetadataParser(url=url, html=html_doc) 216 | parsed_url = parsed.get_discrete_url() 217 | self.assertEqual(parsed_url, rel_expected) 218 | 219 | def test_canonical_upgrade(self): 220 | """someone else did their job. not as good, but did their job""" 221 | url = "https://example.com" 222 | rel_canonical = "/canonical" 223 | rel_expected = "https://example.com/canonical" 224 | html_doc = self._MakeOne(rel_canonical) 225 | parsed = metadata_parser.MetadataParser(url=url, html=html_doc) 226 | parsed_url = parsed.get_discrete_url() 227 | self.assertEqual(parsed_url, rel_expected) 228 | 229 | def test_upgrade_invalid_root(self): 230 | """ 231 | you had one job... 232 | """ 233 | url = "https://example.com" 234 | rel_canonical = "http://localhost:8080" 235 | rel_expected = "https://example.com" 236 | html_doc = self._MakeOne(rel_canonical) 237 | parsed = metadata_parser.MetadataParser(url=url, html=html_doc) 238 | parsed_url = parsed.get_discrete_url() 239 | self.assertEqual(parsed_url, rel_expected) 240 | 241 | def test_upgrade_utf8_path(self): 242 | """ 243 | you had one job... but you didn't read the RFC you shitty third rate enterprise cms 244 | """ 245 | url = "https://example.com" 246 | rel_canonical = r"https://example.com/canonical-ü" 247 | rel_expected = r"https://example.com/canonical-%C3%BC" 248 | html_doc = self._MakeOne(rel_canonical) 249 | parsed = metadata_parser.MetadataParser( 250 | url=url, 251 | html=html_doc, 252 | derive_encoding=False, 253 | default_encoding="utf-8", 254 | html_encoding="utf-8", 255 | ) 256 | parsed_url = parsed.get_discrete_url() 257 | self.assertEqual(parsed_url, rel_expected) 258 | 259 | def test_upgrade_invalid_file(self): 260 | """ 261 | you had one job... 262 | if someone lists the canonical as an invalid domain, remount the right domain 263 | 264 | python -m unittest tests.url_parsing.TestDocumentCanonicals.test_upgrade_invalid_file 265 | """ 266 | url = "https://example.com/a" 267 | rel_canonical = "http://localhost:8080" 268 | rel_expected = "https://example.com" 269 | html_doc = self._MakeOne(rel_canonical) 270 | parsed = metadata_parser.MetadataParser(url=url, html=html_doc) 271 | parsed_url = parsed.get_discrete_url() 272 | self.assertEqual(parsed_url, rel_expected) 273 | 274 | def test_upgrade_invalid_file_b(self): 275 | """ 276 | you had one job... 277 | if someone lists the canonical as a different file on an invalid domain, remount the right domain 278 | """ 279 | url = "https://example.com/a" 280 | rel_canonical = "http://localhost:8080/b" 281 | rel_expected = "https://example.com/b" 282 | html_doc = self._MakeOne(rel_canonical) 283 | parsed = metadata_parser.MetadataParser(url=url, html=html_doc) 284 | parsed_url = parsed.get_discrete_url() 285 | self.assertEqual(parsed_url, rel_expected) 286 | 287 | def test_readme_scenario(self): 288 | """ 289 | you had one job... 
290 |         if someone lists the canonical as an invalid LOCAL domain, remount the right domain
291 | 
292 |         python -m unittest tests.test_url_parsing.TestDocumentCanonicals.test_readme_scenario
293 |         """
294 |         url = "https://example.com/a"
295 |         rel_canonical = "http://localhost:8000/alt-path/to/foo"
296 |         rel_expected = "https://example.com/alt-path/to/foo"
297 |         rel_expected_legacy = rel_canonical
298 |         html_doc = self._MakeOne(rel_canonical)
299 |         parsed = metadata_parser.MetadataParser(url=url, html=html_doc)
300 | 
301 |         # ensure we replace the bad domain with the right one
302 |         parsed_url = parsed.get_discrete_url()
303 |         self.assertEqual(parsed_url, rel_expected)
304 | 
305 |         # ensure support for the legacy behavior...
306 |         parsed_url = parsed.get_discrete_url(require_public_global=False)
307 |         self.assertEqual(parsed_url, rel_expected_legacy)
308 | 
309 | 
310 | class TestDocumentCanonicalsRelative(unittest.TestCase, _DocumentCanonicalsMixin):
311 |     """
312 |     python -m unittest tests.test_url_parsing.TestDocumentCanonicalsRelative
313 |     python -m unittest tests.test_url_parsing.TestDocumentCanonicalsRelative.test_upgrade_local_a
314 |     python -m unittest tests.test_url_parsing.TestDocumentCanonicalsRelative.test_upgrade_local_b
315 |     """
316 | 
317 |     def test_upgrade_local_a(self):
318 |         """an absolute path resolves against the source domain"""
319 |         url = "https://example.com/nested/A.html"
320 |         rel_canonical = "/nested/B.html"
321 |         rel_expected = "https://example.com/nested/B.html"
322 |         html_doc = self._MakeOne(rel_canonical)
323 |         parsed = metadata_parser.MetadataParser(url=url, html=html_doc)
324 |         parsed_url = parsed.get_discrete_url()
325 |         self.assertEqual(parsed_url, rel_expected)
326 | 
327 |     def test_upgrade_local_b(self):
328 |         """a bare filename resolves within the source directory"""
329 |         url = "https://example.com/nested/A.html"
330 |         rel_canonical = "B.html"
331 |         rel_expected = "https://example.com/nested/B.html"
332 |         html_doc = self._MakeOne(rel_canonical)
333 |         parsed = metadata_parser.MetadataParser(url=url, html=html_doc)
334 |         parsed_url = parsed.get_discrete_url()
335 |         self.assertEqual(parsed_url, rel_expected)
336 | 
337 |     def test_upgrade_local_bb(self):
338 |         """a relative path resolves within the source directory"""
339 |         url = "https://example.com/nested/A.html"
340 |         rel_canonical = "path/to/B.html"
341 |         rel_expected = "https://example.com/nested/path/to/B.html"
342 |         html_doc = self._MakeOne(rel_canonical)
343 |         parsed = metadata_parser.MetadataParser(url=url, html=html_doc)
344 |         parsed_url = parsed.get_discrete_url()
345 |         self.assertEqual(parsed_url, rel_expected)
346 | 
347 |     def test_upgrade_local_c(self):
348 |         """a root-relative path resolves against the domain root"""
349 |         url = "https://example.com/nested/A.html"
350 |         rel_canonical = "/B.html"
351 |         rel_expected = "https://example.com/B.html"
352 |         html_doc = self._MakeOne(rel_canonical)
353 |         parsed = metadata_parser.MetadataParser(url=url, html=html_doc)
354 |         parsed_url = parsed.get_discrete_url()
355 |         self.assertEqual(parsed_url, rel_expected)
356 | 
357 |     def test_noupgrade_a(self):
358 |         """
359 |         these tests currently require tldextract; otherwise they won't work right.
360 |         """
361 |         if not metadata_parser.USE_TLDEXTRACT:
362 |             self.skipTest("these tests currently require tldextract")
363 | 
364 |         url = "https://example.com/nested/A.html"
365 |         rel_canonical = "https://foo.local/B.html"
366 |         rel_expected = None
367 |         html_doc = self._MakeOne(rel_canonical)
368 |         parsed = metadata_parser.MetadataParser(url=url, html=html_doc)
369 | 
370 |         parsed_url = parsed.get_url_canonical(require_public_global=True)
371 |         self.assertEqual(parsed_url, rel_expected)
372 | 
373 |         parsed_url = parsed.get_url_opengraph(require_public_global=True)
374 |         self.assertEqual(parsed_url, rel_expected)
375 | 
376 |         parsed_url = parsed.get_url_canonical(
377 |             require_public_global=True, url_fallback=url
378 |         )
379 |         self.assertEqual(parsed_url, rel_expected)
380 | 
381 |         parsed_url = parsed.get_url_opengraph(
382 |             require_public_global=True, url_fallback=url
383 |         )
384 |         self.assertEqual(parsed_url, rel_expected)
385 | 
386 | 
387 | class TestFixUnicodeUrls(unittest.TestCase):
388 |     def test_fix_unicode_path(self):
389 |         _test_pairs = (
390 |             (
391 |                 "https://example.com/2017/12/abcdefgühijklmnop?a=%20foo",
392 |                 "https://example.com/2017/12/abcdefg%C3%BChijklmnop?a=%20foo",
393 |             ),
394 |         )
395 |         for raw, expected in _test_pairs:
396 |             cleaned = metadata_parser.fix_unicode_url(raw)
397 |             self.assertEqual(cleaned, expected)
398 | 
399 |     def test_fix_unicode_path_leave_unicode_kwargs(self):
400 |         _test_pairs = (
401 |             (
402 |                 "https://example.com/2017/12/abcdefgühijklmnop?a=%20foo&b=ü",
403 |                 "https://example.com/2017/12/abcdefg%C3%BChijklmnop?a=%20foo&b=ü",
404 |             ),
405 |         )
406 |         for raw, expected in _test_pairs:
407 |             cleaned = metadata_parser.fix_unicode_url(raw)
408 |             self.assertEqual(cleaned, expected)
409 | 
410 | 
411 | class TestArgsExceptions(unittest.TestCase, _DocumentCanonicalsMixin):
412 |     """
413 |     python -m unittest tests.test_url_parsing.TestArgsExceptions
414 |     """
415 | 
416 |     def test_no_args__good(self):
417 |         url = "https://example.com/nested/A.html"
418 |         rel_canonical = "/B.html"
419 |         rel_expected = "https://example.com/B.html"  # noqa: F841
420 |         html_doc = self._MakeOne(rel_canonical)
421 |         parsed = metadata_parser.MetadataParser(url=url, html=html_doc)
422 |         parsed_url = parsed.get_discrete_url()  # noqa: F841
423 | 
424 |     def test_og_first__good(self):
425 |         url = "https://example.com/nested/A.html"
426 |         rel_canonical = "/B.html"
427 |         rel_expected = "https://example.com/B.html"  # noqa: F841
428 |         html_doc = self._MakeOne(rel_canonical)
429 |         parsed = metadata_parser.MetadataParser(url=url, html=html_doc)
430 |         parsed_url = parsed.get_discrete_url(og_first=True)  # noqa: F841
431 | 
432 |     def test_og_first_canonical_first__bad(self):
433 |         url = "https://example.com/nested/A.html"
434 |         rel_canonical = "/B.html"
435 |         rel_expected = "https://example.com/B.html"  # noqa: F841
436 |         html_doc = self._MakeOne(rel_canonical)
437 |         parsed = metadata_parser.MetadataParser(url=url, html=html_doc)
438 |         self.assertRaises(
439 |             ValueError, parsed.get_discrete_url, og_first=True, canonical_first=True
440 |         )
441 | 
442 |     def test_canonical_first__bad(self):
443 |         url = "https://example.com/nested/A.html"
444 |         rel_canonical = "/B.html"
445 |         rel_expected = "https://example.com/B.html"  # noqa: F841
446 |         html_doc = self._MakeOne(rel_canonical)
447 |         parsed = metadata_parser.MetadataParser(url=url, html=html_doc)
448 |         self.assertRaises(ValueError, parsed.get_discrete_url, canonical_first=True)
449 | 
450 |     def test_canonical_first__good(self):
451 |         url = "https://example.com/nested/A.html"
"https://example.com/nested/A.html" 452 | rel_canonical = "/B.html" 453 | rel_expected = "https://example.com/B.html" # noqa: F841 454 | html_doc = self._MakeOne(rel_canonical) 455 | parsed = metadata_parser.MetadataParser(url=url, html=html_doc) 456 | parsed_url = parsed.get_discrete_url( # noqa: F841 457 | og_first=False, canonical_first=True 458 | ) 459 | 460 | 461 | class TestCommands(unittest.TestCase, _DocumentCanonicalsMixin): 462 | """ 463 | python -m unittest tests.url_parsing.TestCommands 464 | """ 465 | 466 | def test_is_parsed_valid_url__string(self): 467 | url = "https://example.com/A.html" 468 | parsed = urlparse(url) 469 | self.assertIsInstance(parsed, ParseResult) 470 | is_valid = metadata_parser.is_parsed_valid_url(parsed) 471 | self.assertTrue(is_valid) 472 | 473 | def test_is_parsed_valid_url__bytes(self): 474 | url = b"https://example.com/A.html" 475 | parsed = urlparse(url) 476 | self.assertIsInstance(parsed, ParseResultBytes) 477 | is_valid = metadata_parser.is_parsed_valid_url(parsed) 478 | self.assertTrue(is_valid) 479 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = 3 | lint, 4 | mypy, 5 | py37,py38,py39,py310,py311,py312,py313 6 | 7 | [testenv] 8 | commands = 9 | python --version 10 | pytest {posargs:} 11 | extras = 12 | testing 13 | -------------------------------------------------------------------------------- /types.txt: -------------------------------------------------------------------------------- 1 | 2 | types { 3 | text/html html htm shtml; 4 | text/css css; 5 | text/xml xml; 6 | image/gif gif; 7 | image/jpeg jpeg jpg; 8 | application/javascript js; 9 | application/atom+xml atom; 10 | application/rss+xml rss; 11 | 12 | text/mathml mml; 13 | text/plain txt; 14 | text/vnd.sun.j2me.app-descriptor jad; 15 | text/vnd.wap.wml wml; 16 | text/x-component htc; 17 | 18 | image/png png; 19 | image/tiff tif tiff; 20 | image/vnd.wap.wbmp wbmp; 21 | image/x-icon ico; 22 | image/x-jng jng; 23 | image/x-ms-bmp bmp; 24 | image/svg+xml svg svgz; 25 | image/webp webp; 26 | 27 | application/font-woff woff; 28 | application/java-archive jar war ear; 29 | application/json json; 30 | application/mac-binhex40 hqx; 31 | application/msword doc; 32 | application/pdf pdf; 33 | application/postscript ps eps ai; 34 | application/rtf rtf; 35 | application/vnd.ms-excel xls; 36 | application/vnd.ms-fontobject eot; 37 | application/vnd.ms-powerpoint ppt; 38 | application/vnd.wap.wmlc wmlc; 39 | application/vnd.google-earth.kml+xml kml; 40 | application/vnd.google-earth.kmz kmz; 41 | application/x-7z-compressed 7z; 42 | application/x-cocoa cco; 43 | application/x-java-archive-diff jardiff; 44 | application/x-java-jnlp-file jnlp; 45 | application/x-makeself run; 46 | application/x-perl pl pm; 47 | application/x-pilot prc pdb; 48 | application/x-rar-compressed rar; 49 | application/x-redhat-package-manager rpm; 50 | application/x-sea sea; 51 | application/x-shockwave-flash swf; 52 | application/x-stuffit sit; 53 | application/x-tcl tcl tk; 54 | application/x-x509-ca-cert der pem crt; 55 | application/x-xpinstall xpi; 56 | application/xhtml+xml xhtml; 57 | application/zip zip; 58 | 59 | application/octet-stream bin exe dll; 60 | application/octet-stream deb; 61 | application/octet-stream dmg; 62 | application/octet-stream iso img; 63 | application/octet-stream msi msp msm; 64 | 65 | 
65 |     application/vnd.openxmlformats-officedocument.wordprocessingml.document docx;
66 |     application/vnd.openxmlformats-officedocument.spreadsheetml.sheet xlsx;
67 |     application/vnd.openxmlformats-officedocument.presentationml.presentation pptx;
68 | 
69 |     audio/midi mid midi kar;
70 |     audio/mpeg mp3;
71 |     audio/ogg ogg;
72 |     audio/x-m4a m4a;
73 |     audio/x-realaudio ra;
74 | 
75 |     video/3gpp 3gpp 3gp;
76 |     video/mp4 mp4;
77 |     video/mpeg mpeg mpg;
78 |     video/quicktime mov;
79 |     video/webm webm;
80 |     video/x-flv flv;
81 |     video/x-m4v m4v;
82 |     video/x-mng mng;
83 |     video/x-ms-asf asx asf;
84 |     video/x-ms-wmv wmv;
85 |     video/x-msvideo avi;
86 | }
--------------------------------------------------------------------------------
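
The canonical-URL tests in tests/test_url_parsing.py reduce to a small usage pattern. The sketch below mirrors the calls made in TestDocumentCanonicals.test_readme_scenario; the inline HTML stands in for what the _MakeOne helper generates, and the "expected" comments restate that test's assertions rather than separately verified output.

# Sketch of the canonical-URL upgrade behavior exercised by
# TestDocumentCanonicals.test_readme_scenario. Uses only calls that
# appear in the tests: MetadataParser(url=, html=) and get_discrete_url().
import metadata_parser

html = (
    '<html><head>'
    '<link rel="canonical" href="http://localhost:8000/alt-path/to/foo" />'
    '</head><body></body></html>'
)
parsed = metadata_parser.MetadataParser(url="https://example.com/a", html=html)

# Default behavior: a canonical pointing at a non-public host is
# remounted onto the document's actual domain.
print(parsed.get_discrete_url())
# expected: https://example.com/alt-path/to/foo

# Legacy behavior: trust the canonical as written.
print(parsed.get_discrete_url(require_public_global=False))
# expected: http://localhost:8000/alt-path/to/foo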
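
The URL-helper tests pin down two small functions; this second sketch restates their documented behavior. The fix_unicode_url input comes directly from TestFixUnicodeUrls; the localhost URL is an assumption standing in for the unseen URLS_VALID_CONDITIONAL entries, chosen because the keyword toggles in test_urls_valid_conditional are named for exactly that case.

# Sketch of the helpers covered by TestFixUnicodeUrls and TestUrlParsing.
from urllib.parse import urlparse

import metadata_parser

# fix_unicode_url percent-encodes unicode in the path but leaves
# querystring values alone (per TestFixUnicodeUrls):
print(metadata_parser.fix_unicode_url(
    "https://example.com/2017/12/abcdefgühijklmnop?a=%20foo&b=ü"
))
# expected: https://example.com/2017/12/abcdefg%C3%BChijklmnop?a=%20foo&b=ü

# is_parsed_valid_url takes a urlparse() result; whether a localhost-style
# netloc is accepted depends on the toggles (per test_urls_valid_conditional;
# the sample URL here is hypothetical):
parsed = urlparse("http://localhost:8080")
print(metadata_parser.is_parsed_valid_url(
    parsed, require_public_netloc=False, allow_localhosts=True
))  # expected: truthy
print(metadata_parser.is_parsed_valid_url(
    parsed, require_public_netloc=True, allow_localhosts=False
))  # expected: falsy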