├── .github └── workflows │ └── python-package.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CHANGELOG.txt ├── LICENSE.txt ├── MANIFEST.in ├── README.rst ├── TODO.txt ├── mypy.ini ├── pyproject.toml ├── pytest.ini ├── setup.cfg ├── setup.py ├── src └── metadata_parser │ ├── __init__.py │ ├── config.py │ ├── exceptions.py │ ├── py.typed │ ├── regex.py │ ├── requests_extensions.py │ ├── typing.py │ └── utils.py ├── tests ├── __init__.py ├── html_scaffolds │ ├── charset_a.html │ ├── charset_b.html │ ├── charset_c.html │ ├── duplicates.html │ └── simple.html ├── test_document_parsing.py ├── test_ip_tracking.py ├── test_responses.py ├── test_sessions.py └── test_url_parsing.py ├── tox.ini └── types.txt /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Python package 5 | 6 | on: 7 | push: 8 | branches: 9 | - main 10 | - "branch-*" 11 | pull_request: 12 | branches: 13 | - main 14 | - "branch-*" 15 | 16 | jobs: 17 | build: 18 | runs-on: ${{ matrix.os }} 19 | strategy: 20 | matrix: 21 | os: 22 | - "ubuntu-22.04" 23 | python-version: 24 | - "3.7" 25 | - "3.8" 26 | - "3.9" 27 | - "3.10" 28 | - "3.11" 29 | - "3.12" 30 | - "3.13" 31 | steps: 32 | - uses: actions/checkout@v3 33 | - name: Set up Python ${{ matrix.python-version }} 34 | uses: actions/setup-python@v4 35 | with: 36 | python-version: ${{ matrix.python-version }} 37 | - name: Install dependencies 38 | run: | 39 | python -m pip install --upgrade pip 40 | pip install --upgrade tox setuptools flake8 pytest 41 | pip list 42 | - name: Test with pytest 43 | run: | 44 | tox -e py -- ${{ matrix.pytest-args }} 45 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | tests/private/* 2 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # See https://pre-commit.com for more information 2 | # See https://pre-commit.com/hooks.html for more hooks 3 | repos: 4 | - repo: https://github.com/psf/black 5 | rev: 24.8.0 6 | hooks: 7 | - id: black 8 | - repo: https://github.com/pycqa/flake8 9 | rev: 7.1.1 10 | hooks: 11 | - id: flake8 -------------------------------------------------------------------------------- /CHANGELOG.txt: -------------------------------------------------------------------------------- 1 | 1.0.0 (unreleased) 2 | 3 | IMPORTANT 4 | 5 | This release has many breaking changes. 6 | 7 | Deprecated legacy code was removed. 8 | 9 | Work has been done to make the API more consistent. 10 | 11 | Several long-standing bugs and inconsistencies were fixed. 
12 |
13 |
14 | Backwards Incompatible Changes:
15 |
16 |     Remove Deprecated Functions:
17 |         ``MetadataParser.get_metadata``
18 |         ``MetadataParser.get_metadatas``
19 |         ``MetadataParser.is_opengraph_minimum``
20 |         ``MetadataParser.metadata``
21 |         ``MetadataParser.metadata_encoding``
22 |         ``MetadataParser.metadata_version``
23 |         ``MetadataParser.soup``
24 |         ``ParsedResult.get_metadata``
25 |
26 |     Remove Deprecated Functionality:
27 |         ``MetadataParser.__init__::cached_urlparser``
28 |             no longer accepts `int` to control `cached_urlparser_maxitems`
29 |
30 |     Encoder changes
31 |         affected functions:
32 |             ``decode_html``
33 |             ``encode_ascii``
34 |             ``ParsedResult.default_encoder``
35 |             ``ParsedResult.get_metadatas::encoder``
36 |             ``MetadataParser.__init__::default_encoder``
37 |         Previously, encoders accepted one argument, which was documented to
38 |         be a string. This would cause issues if the elements were DC
39 |         (Dublin Core), as that storage uses a dict.
40 |         Encoders now accept two arguments:
41 |             Arg 1 is the raw input value, which may be a string or a dict
42 |             Arg 2 is an optional string identifying the strategy/store
43 |                 that the value came from
44 |
45 |
46 | API Changes
47 |     The package was split into namespaces.
48 |     ``MetadataParser.__init__`` now validates submitted `strategy` args.
49 |
50 |     ``MetadataParser.strategy`` now defaults to: `["meta", "page", "og", "dc", "twitter"]`
51 |     previously this was: `["og", "dc", "meta", "page", "twitter"]`
52 |
53 |     ``ParsedResult.get_metadatas`` will now return a dict or None.
54 |         A bug was discovered in which it would return only the first matched
55 |         element when there were multiple matches.
56 |
57 |     An invalid strategy will now raise `InvalidStrategy`, a subclass of `ValueError`.
58 |
59 |     `InvalidDocument` no longer has a .message attribute.
60 |
61 |     Exceptions now invoke `super().__init__(args)`.
62 |
63 | New Functionality
64 |
65 |     ``ParsedResult.select_first_match(field, strategy)``
66 |         will return the first match for the given strategy, or for the default strategy if none is given
67 |
68 |
69 |
70 | 0.13.1
71 |     * guard against incorrect warnings; see Issue#52
72 |     * add support for branches in github actions
73 |
74 | 0.13.0
75 |     * drop py36; no test options due to github's deprecation of ubuntu-20.04
76 |     * `_coerce_validate_strategy` (invoked by `get_metadatas`) will now raise a
77 |       ValueError if a string other than "all" is submitted. The only valid
78 |       string is "all"; otherwise a list of strings - excluding "all" - must be
79 |       submitted. Warnings of this have been emitted for several years.
80 |     * __init__(`search_head_only`) now defaults to False
81 |     * `UrlParserCacheable` has been extended to accept a `urlparser` argument.
82 |       This defaults to `urlparse` and expects the same signature.
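      For example (a sketch, not taken from the package docs; this assumes
      `UrlParserCacheable` is importable from the package root and that a
      custom callable mirrors `urllib.parse.urlparse`'s signature and
      return value):

          from urllib.parse import urlparse

          from metadata_parser import UrlParserCacheable

          def logging_urlparse(url):
              # hypothetical wrapper; same signature/return as urlparse
              print("parsing:", url)
              return urlparse(url)

          cacheable = UrlParserCacheable(maxitems=30, urlparser=logging_urlparse)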
83 | * __init__(`cached_urlparser`) has new deprecations to standardize the API 84 | submitting an Int to set max_items is deprecated; instead: 85 | cached_urlparser=True 86 | cached_urlparser_maxitems=int 87 | submitting 0 is deprecated; instead: 88 | cached_urlparser=False 89 | or 90 | cached_urlparser_maxitems=0 91 | cached_urlparser=False 92 | * __init__(`cached_urlparser_maxitems`) has been added 93 | * the next release is likely to be 1.0 94 | 95 | 0.12.3 96 | * pin "BeautifulSoup4<4.15.0" 97 | * See `https://git.launchpad.net/beautifulsoup/tree/CHANGELOG` 98 | > 4.13.0 (20250202) 99 | > These things now give DeprecationWarnings when you try to use them, 100 | and are scheduled to be removed in Beautiful Soup 4.15.0. 101 | * fixes #47 102 | 103 | 0.12.2 104 | * Support Python 3.13 via `legacy-cgi` package. 105 | Thank you, https://github.com/Dryusdan. 106 | See: 107 | https://github.com/jvanasco/metadata_parser/pull/44 108 | https://github.com/jvanasco/metadata_parser/issues/43 109 | * updated pre-commit-config 110 | 111 | 0.12.1 112 | * typing 113 | * added `METADATA_PARSER_FUTURE` environment variable 114 | `export METADATA_PARSER_FUTURE=1` to enable 115 | * is_parsed_valid_url can accept a ParseResultBytes object now 116 | 117 | 0.12.0 118 | * drop python 2.7 119 | * initial typing support 120 | 121 | 0.11.0 | UNRELEASED 122 | 123 | * BREAKING CHANGES 124 | Due to the following breaking changes, the version was bumped to 0.11.0 125 | * `MetadataParser.fetch_url` now returns a third item. 126 | 127 | * COMPATIBLE CHANGES 128 | The following changes are backwards compatible to the 0.10.x releases 129 | * a test-suite for an application leveraging `metadata_parser` experienced 130 | some issues due to changes in the Responses package used to mock tests. 131 | to better faciliate against that, a new change were made: 132 | 133 | MetadataParser now has 2 subclassable attributes for items that should 134 | or should not be parsed: 135 | 136 | + _content_types_parse = ("text/html",) 137 | + _content_types_noparse = ("application/json",) 138 | 139 | Previously, these values were hardcoded into the logic. 140 | * some error log messages were reformatted for clarity 141 | * some error log messages were incorrectly reformatted by black 142 | * added logging for NotParseable situations involving redirects 143 | * added a `.response` attribute to NotParsable errors to help debug 144 | redirects 145 | * added a new ResponseHistory class to track redirects 146 | * it is computed and returned during `MetadataParser.fetch_url` 147 | * `MetadataParser.parse(` optionally accepts it, and will stash 148 | it into ParsedResult 149 | * `ParsedResult` 150 | * ResponseHistory is not stashed in the metadata stash, but a new namespace 151 | * `.response_history` will either be `ResponseHistory` or None 152 | * improving docstrings 153 | * added `decode_html` helper 154 | * extended MetadataParser to allow registration of a defcault_encoder for results 155 | * style cleanup 156 | 157 | 0.10.5 158 | packaging fixes 159 | migrated 'types.txt' out of distribution; it remains in github source 160 | updated some log lines with the url 161 | introduced some new log lines 162 | added `METADATA_PARSER__DISABLE_TLDEXTRACT` env 163 | merged, but reverted PR#34 which addresses Issue#32 164 | 165 | 166 | 0.10.4 167 | * black via pre-commit 168 | * upgraded black; 20.8b1 169 | * integrated with pre-commit 170 | * github actions and tox 171 | * several test files were not in git! 
172 | 173 | 0.10.3 174 | updated docs on bad data 175 | black formatting 176 | added pyproject.toml 177 | moved BeautifulSoup generation into it's own method, so anyone can subclass to customize 178 | :fixes: https://github.com/jvanasco/metadata_parser/issues/25 179 | some internal variable changes thanks to flake8 180 | 181 | 0.10.2 182 | added some docs on encoding 183 | 184 | 0.10.1 185 | clarifying some inline docs 186 | BREAKING CHANGE: `fetch_url` now returns a tuple of `(html, encoding) 187 | now tracking in ParsedResult: encoding 188 | ParsedResult.metadata['_internal']['encoding'] = resp.encoding.lower() if resp.encoding else None 189 | `.parse` now accepts `html_encoding` 190 | refactored url fetching to use context managers 191 | refactored url fetching to only insert our hooks when needed 192 | adjusted test harness to close socket connections 193 | 194 | 0.10.0 195 | better Python3 support by using the six library 196 | 197 | 0.9.23 198 | added tests for url entities 199 | better grabbing of the charset 200 | better grabbing of some edge cases 201 | 202 | 0.9.22 203 | removed internal calls to the deprecated `get_metadata`, replacing them with `get_metadatas`. 204 | this will avoid emitting a deprecation warning, allowing users to migrate more easily 205 | 206 | 0.9.21 207 | * requests_toolbelt is now required 208 | ** this is to solve PR#16 / Issue#21 209 | ** the toolbelt and built-in versions of get_encodings_from_content required different workarounds 210 | * the output of urlparse is now cached onto the parser instance. 211 | ** perhaps this will be global cache in the future 212 | * MetadataParser now accepts `cached_urlparser` 213 | ** default: True 214 | options: True: use a instance of UrlParserCacheable(maxitems=30) 215 | : INT: use a instance of UrlParserCacheable(maxitems=cached_urlparser) 216 | : None/False/0 - use native urlparse 217 | : other truthy values - use as a custom urlparse 218 | 219 | * addressing issue #17 (https://github.com/jvanasco/metadata_parser/issues/17) where `get_link_` logic does not handle schemeless urls. 220 | ** `MetadataParser.get_metadata_link` will now try to upgrade schemeless links (e.g. urls that start with "//") 221 | ** `MetadataParser.get_metadata_link` will now check values against `FIELDS_REQUIRE_HTTPS` in certain situations to see if the value is valid for http 222 | ** `MetadataParser.schemeless_fields_upgradeable` is a tuple of the fields which can be upgradeable. this defaults to a package definition, but can be changed on a per-parser bases. 223 | The defaults are: 224 | 'image', 225 | 'og:image', 'og:image:url', 'og:audio', 'og:video', 226 | 'og:image:secure_url', 'og:audio:secure_url', 'og:video:secure_url', 227 | ** `MetadataParser.schemeless_fields_disallow` is a tuple of the fields which can not be upgradeable. this defaults to a package definition, but can be changed on a per-parser bases. 
228 | The defaults are: 229 | 'canonical', 230 | 'og:url', 231 | ** `MetadataParser.get_url_scheme()` is a new method to expose the scheme of the active url 232 | ** `MetadataParser.upgrade_schemeless_url()` is a new method to upgrade schemeless links 233 | it accepts two arguments: url and field(optional) 234 | if present, the field is checked against the package tuple FIELDS_REQUIRE_HTTPS to see if the value is valid for http 235 | 'og:image:secure_url', 236 | 'og:audio:secure_url', 237 | 'og:video:secure_url', 238 | 239 | 0.9.20 240 | * support for deprecated `twitter:label` and `twitter:data` metatags, which use "value" instead of "content". 241 | * new param to `__init__` and `parse`: `support_malformed` (default `None`). 242 | if true, will support malformed parsing (such as consulting "value" instead of "content". 243 | functionality extended from PR #13 (https://github.com/jvanasco/metadata_parser/pull/13) from https://github.com/amensouissi 244 | 245 | 0.9.19 246 | * addressing https://github.com/jvanasco/metadata_parser/issues/12 247 | on pages with duplicate metadata keys, additional elements are ignored 248 | when parsing the document, duplicate data was not kept. 249 | * `MetadataParser.get_metadata` will always return a single string (or none) 250 | * `MetadataParser.get_metadatas` has been introduced. this will always return an array. 251 | * the internal parsed_metadata store will now store data in a mix of arrays and strings, keeping it backwards compatible 252 | * This new version benches slightly slower because of the mixed format but preserves a smaller footprint. 253 | * the parsed result now contains a version record for tracking the format `_v`. 254 | * standardized single/double quoting 255 | * cleaned up some line 256 | * the library will try to coerce strategy= arguments into the right format 257 | * when getting dublin core data, the result could either be a string of a dict. there's no good way to handle this. 258 | * added tests for encoders 259 | * greatly expanded tests 260 | 261 | 0.9.18 262 | * removed a stray debug line 263 | 264 | 0.9.17 265 | * added `retry_dropped_without_headers` option 266 | 267 | 0.9.16 268 | * added `fix_unicode_url()` 269 | * Added `allow_unicode_url` (default True) to the following calls: 270 | `MetadataParser.get_url_canonical` 271 | `MetadataParser.get_url_opengraph` 272 | `MetadataParser.get_discrete_url` 273 | This functionality will try to recode canonical urls with unicode data into percent-encoded streams 274 | 275 | 0.9.15 276 | * Python3 support returned 277 | 278 | 0.9.14 279 | * added some more tests to ensure encoding detected correctly 280 | * stash the soup sooner when parsing, to aid in debugging 281 | 282 | 0.9.13 283 | * doing some work to guess encoding... 284 | * internal: now using `resp` instead of `r`, it is easier for pdb debugging 285 | * the peername check was changed to be a hook, so it can be processed more immediately 286 | * the custom session redirect test was altered 287 | * changed the DummyResponse encoding fallback to `ENCODING_FALLBACK` which is Latin (not utf8) 288 | this is somewhat backwards incompatible with this library, but maintains compatibility with the underlying `requests` library 289 | 290 | 0.9.12 291 | * added more attributes to DummyResponse: 292 | ** `content` 293 | ** `headers` 294 | 295 | 0.9.11 296 | * some changes to how we handle upgrading bad canonicals 297 | upgrades will no longer happen IF they specify a bad domain. 
298 | upgrades from localhost will still transfer over 299 | 300 | 0.9.10 301 | * slight reorder internally of TLD extract support 302 | 303 | 0.9.9 304 | * inspecting `requests` errors for a response and using it if possible 305 | * this will now try to validate urls if the `tldextract` library is present. 306 | this feature can be disabled with a global toggle 307 | 308 | import metadata_parser 309 | metadata_parser.USE_TLDEXTRACT = False 310 | 311 | 0.9.7 312 | * changed some internal variable names to better clarify difference between a hostname and netloc 313 | 314 | 0.9.7 315 | updated the following functions to test for RFC valid characters in the url string 316 | some websites, even BIG PROFESSIONAL ONES, will put html in here. 317 | idiots? amateurs? lazy? doesn't matter, they're now our problem. well, not anymore. 318 | * get_url_canonical 319 | * get_url_opengraph 320 | * get_metadata_link 321 | 322 | 0.9.6 323 | this is being held for an update to the `requests` library 324 | * made the following arguments to `MetadataParser.fetch_url()` default to None - which will then default to the class setting. they are all passed-through to `requests.get` 325 | ** `ssl_verify` 326 | ** `allow_redirects` 327 | ** `requests_timeout` 328 | * removed `force_parse` kwarg from `MetadataParser.parser` 329 | * added 'metadata_parser.RedirectDetected' class. if allow_redirects is False, a detected redirect will raise this. 330 | * added 'metadata_parser.NotParsableRedirect' class. if allow_redirects is False, a detected redirect will raise this if missing a Location. 331 | * added `requests_session` argument to `MetadataParser` 332 | * starting to use httpbin for some tests 333 | * detecting JSON documents 334 | * extended NotParseable exceptions with the MetadataParser instance as `metadataParser` 335 | * added `only_parse_http_ok` which defaults to True (legacy). submitting False will allow non-http200 responses to be parsed. 336 | * shuffled `fetch_url` logic around. it will now process more data before a potential error. 337 | * working on support for custom request sessions that can better handle redirects (requires patch or future version of requests) 338 | * caching the peername onto the response object as `_mp_peername` [ _m(etadata)p(arser)_peername ]. this will allow it to be calculated in a redirect session hook. (see tests/sessions.py) 339 | * added `defer_fetch` argument to `MetadataParser.__init__`, default ``False``. If ``True``, this will overwrite the instance's `deferred_fetch` method to actually fetch the url. this strategy allows for the `page` to be defined and response history caught. Under this situation, a 301 redirecting to a 500 can be observed; in the previous versions only the 500 would be caught. 340 | * starting to encapsulate everything into a "parsed result" class 341 | * fixed opengraph minimum check 342 | * added `MetadataParser.is_redirect_unique` 343 | * added `DummyResponse.history` 344 | 345 | 0.9.5 346 | * failing to load a document into BeautifulSoup will now catch the BS error and raise NotParsable 347 | 348 | 0.9.4 349 | * created `MetadataParser.get_url_canonical` 350 | * created `MetadataParser.get_url_opengraph` 351 | * `MetadataParser.get_discrete_url` now calls `get_url_canonical` and `get_url_opengraph` 352 | 353 | 0.9.3 354 | * fixed packaging error. 
removed debug "print" statements 355 | 356 | 0.9.2 357 | * upgrade nested local canonical rels correctly 358 | 359 | 0.9.1 360 | * added a new `_internal` storage namespace to the `MetadataParser.metadata` payload. 361 | this simply stashes the `MetadataParser.url` and `MetadataParser.url_actual` attributes to makes objects easier to encode for debugging 362 | * the twitter parsing was incorrectly looking for 'value' not 'content' as in the current spec 363 | * tracking the shortlink on a page 364 | 365 | 0.9.0 366 | - This has a default behavior change regarding `get_discrete_url()` . 367 | - `is_parsed_valid_url()` did not correctly handle `require_public_netloc=True`, and would allow for `localhost` values to pass 368 | - new kwarg `allow_localhosts` added to 369 | * is_parsed_valid_url 370 | * is_url_valid 371 | * url_to_absolute_url 372 | * MetadataParser.__init__ 373 | * MetadataParser.absolute_url 374 | * MetadataParser.get_discrete_url 375 | * MetadataParser.get_metadata_link 376 | - new method `get_fallback_url` 377 | - `url_to_absolute_url` will return `None` if not supplied with a fallback and test url. Previously an error in parsing would occur 378 | - `url_to_absolute_url` tries to do a better job at determining the intended url when given a malformed url. 379 | 380 | 0.8.3 381 | - packaging fixes 382 | 383 | 0.8.2 384 | - incorporated fix in https://github.com/jvanasco/metadata_parser/pull/10 to handle windows support of socket objects 385 | - cleaned up some tests 386 | - added `encode_ascii` helper 387 | - added git-ignored `tests/private` directory for non-public tests 388 | - added an `encoder` argument to `get_metadata` for encoding values 389 | 390 | 0.8.1 391 | added 2 new properties to a computed MetadataParser object: 392 | is_redirect = None 393 | is_redirect_same_host = None 394 | in the case of redirects, we only have the peername available for the final URL (not the source) 395 | if a response is a redirect, it may not be for the same host -- and the peername would correspond to the destination URL -- not the origin 396 | 397 | 0.8.0 398 | this bump introduces 2 new arguments and some changed behavior: 399 | 400 | - `search_head_only=None`. previously the meta/og/etc data was only searched in the document head (where expected as per HTML specs). 401 | after indexing millions of pages, many appeared to implement this incorrectly of have html that is so off specification that 402 | parsing libraries can't correctly read it (for example, Twitter.com). 403 | This is currently implemented to default from None to True, but future versions will default to `False`. 404 | This is marked for a future default of `search_head_only=False` 405 | 406 | - `raise_on_invalid`. default False. If True, this will raise a new exception: InvalidDocument if the response 407 | does not look like a proper html document 408 | 409 | 410 | 411 | 0.7.4 412 | - more aggressive attempts to get the peername. 413 | 414 | 0.7.3 415 | - this will now try to cache the `peername` of the request (ie, the remote server) onto the peername attribute 416 | 417 | 0.7.2 418 | - applying a `strip()` to the "title". bad authors/cms often have whitespace. 
419 | 420 | 0.7.1 421 | - added kwargs to docstrings 422 | - `get_metadata_link` behavior has been changed as follows: 423 | * if an encoded uri is present (starts with `data:image/`) 424 | ** this will return None by default 425 | ** if a kwarg of `allow_encoded_uri=True` is submitted, will return the encoded url (without a url prefix) 426 | 427 | 0.7.0 428 | - merged https://github.com/jvanasco/metadata_parser/pull/9 from xethorn 429 | - nested all commands to `log` under `__debug__` to avoid calls on production when PYTHONOPTIMIZE is set 430 | 431 | 0.6.18 432 | - migrated version string into __init__.py 433 | 434 | 0.6.17 435 | - added a new `DummyResponse` class to mimic popular attributes of a `requests.response` object when parsing from HTML files 436 | 437 | 0.6.16 438 | - incorporated pull8 (https://github.com/jvanasco/metadata_parser/pull/8) which fixes issue5 (https://github.com/jvanasco/metadata_parser/issues/5) with comments 439 | 440 | 0.6.15 441 | - fixed README which used old api in the example 442 | 443 | 0.6.14 444 | - there was a typo and another bug that passed some tests on BeautifulSoup parsing. they have been fixed. todo- migrate tests to public repo 445 | 446 | 0.6.13 447 | - trying to integrate a "safe read" 448 | 449 | 0.6.12 450 | - now passing "stream=True" to requests.get. this will fetch the headers first, before looping through the response. we can avoid many issues with this approach 451 | 452 | 0.6.11 453 | - now correctly validating urls with ports. had to restructure a lot of the url validation 454 | 455 | 0.6.10 456 | - changed how some nodes are inspected. this should lead to fewer errors 457 | 458 | 0.6.9 459 | - added a new method `get_metadata_link()`, which applies link transformations to a metadata in an attempt to ensure a valid link 460 | 461 | 0.6.8 462 | - added a kwarg `requests_timeout` to proxy a timeout value to `requests.get()` 463 | 464 | 0.6.7 465 | - added a lockdown to `is_parsed_valid_url` titled `http_only` -- requires http/https for the scheme 466 | 467 | 0.6.6 468 | - protecting against bad doctypes, like nasa.gov 469 | -- added `force_doctype` to __init__. defaults to False. this will change the doctype to get around to bs4/lxml issues 470 | -- this is defaulted to False. 471 | 472 | 0.6.5 473 | - keeping the parsed BS4 document; a user may wish to perform further operations on it. 474 | -- `MetadataParser.soup` attribute holds BS4 document 475 | 476 | 0.6.4 477 | - flake8 fixes. purely cosmetic. 478 | 479 | 0.6.3 480 | - no changes. `sdist upload` was picking up a reference file that wasn't in github; that file killed the distribution install 481 | 482 | 0.6.2 483 | - formatting fixes via flake8 484 | 485 | 0.6.1 486 | - Lightweight, but functional, url validation 487 | -- new 'init' argument (defaults to True) : `require_public_netloc` 488 | -- this will ensure a url's hostname/netloc is either an IPV4 or "public DNS" name 489 | -- if the url is entirely numeric, requires it to be IPV4 490 | -- if the url is alphanumeric, requires a TLD + Domain ( exception is "localhost" ) 491 | -- this is NOT RFC compliant, but designed for "Real Life" use cases. 492 | 493 | 0.6.0 494 | - Several fixes to improve support of canonical and absolute urls 495 | -- replaced REGEX parsing of urls with `urlparse` parsing and inspection; too many edge cases got in 496 | -- refactored `MediaParser.absolute_url` , now proxies a call to new function `url_to_absolute_url` 497 | -- refactored `MediaParser.get_discrete_url` , now cleaner and leaner. 
498 | -- refactored how some tests run, so there is cleaner output 499 | 500 | 501 | 0.5.8 502 | - trying to fix some issues with distribution 503 | 504 | 0.5.7 505 | - trying to parse unparsable pages was creating an error 506 | -- `MetadataParser.init` now accepts `only_parse_file_extensions` -- list of the only file extensions to parse 507 | -- `MetadataParser.init` now accepts `force_parse_invalid_content_type` -- forces to parse invalid content 508 | -- `MetadataParser.fetch_url` will only parse "text/html" content by default 509 | 510 | 0.5.6 511 | - trying to ensure we return a valid url in get_discrete_url() 512 | - adding in some proper unit tests; migrating from the private demo's slowly ( the private demo's hit a lot of internal files and public urls ; wouldn't be proper to make these public ) 513 | - setting `self.url_actual = url` on __init__. this will get overridden on a `fetch`, but allows for a fallback on html docs passed through 514 | 515 | 516 | 0.5.5 517 | - Dropped BS3 support 518 | - test Python3 support ( support added by Paul Bonser [ https://github.com/pib ] ) 519 | 520 | 521 | 0.5.4 522 | - Pull Request - https://github.com/jvanasco/metadata_parser/pull/1 523 | Credit to Paul Bonser [ https://github.com/pib ] 524 | 525 | 0.5.3 526 | - added a few `.strip()` calls to clean up metadata values 527 | 528 | 0.5.2 529 | - fixed an issue on html title parsing. the old method incorrectly regexed on a BS4 tag, not tag contents, creating character encoding issues. 530 | 531 | 0.5.1 532 | - missed the ssl_verify command 533 | 534 | 0.5.0 535 | - migrated to the requests library 536 | 537 | 0.4.13 538 | - trapping all errors in httplib and urrlib2 ; raising as an NotParsable and sticking the original error into the `raised` attribute. 539 | this will allow for cleaner error handling 540 | - we *really* need to move to requests.py 541 | 542 | 0.4.12 543 | - created a workaround for sharethis hashbang urls, which urllib2 doesn't like 544 | - we need to move to requests.py 545 | 546 | 0.4.11 547 | - added more relaxed controls for parsing safe files 548 | 549 | 0.4.10 550 | - fixed force_parse arg on init 551 | - added support for more filetypes 552 | 553 | 0.4.9 554 | - support for gzip documents that pad with extra data ( spec allows, python module doesn't ) 555 | - ensure proper document format 556 | 557 | 0.4.8 558 | - added support for twitter's own og style markup 559 | - cleaned up the beautifulsoup finds for og data 560 | - moved 'try' from encapsulating 'for' blocks to encapsulating the inner loop. this will pull more data out if an error occurs. 561 | 562 | 0.4.7 563 | - cleaned up some code 564 | 565 | 0.4.6 566 | - realized that some servers return gzip content, despite not advertising that this client accepts that content ; fixed by using some ideas from mark pilgrim's feedparser. 
metadata_parser now advertises gzip and zlib, and processes it as needed 567 | 568 | 0.4.5 569 | - fixed a bug that prevented toplevel directories from being parsed 570 | 571 | 0.4.4 572 | - made redirect/masked/shortened links have better dereferenced url support 573 | 574 | 0.4.2 575 | - Wrapped title tag traversal with an AttributeException try block 576 | - Wrapped canonical tag lookup with a KeyError try block, defaulting to 'href' then 'content' 577 | - Added support for `url_actual` and `url_info` , which persist the data from the urllib2.urlopen object's `geturl()` and `info()` 578 | - `get_discrete_url` and `absolute_url` use the underlying url_actual data 579 | - added support for passing data and headers into urllib2 requests 580 | 581 | 0.4.1 582 | Initial Release 583 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2012-2018, Jonathan Vanasco 2 | 3 | MIT License -- http://www.opensource.org/licenses/mit-license 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- 24 | 25 | Portions of this Library include code from other projects that are available 26 | under similarly permissive licenses. More information and the license is 27 | available alongside the code. 28 | 29 | This includes: 30 | 31 | * code from the Requests project, copyright Kenneth Reitz, which is available 32 | under the Apache2 license. 33 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | graft src 2 | graft tests 3 | prune tests/private 4 | 5 | include setup.cfg pyproject.toml 6 | include tox.ini 7 | include CHANGELOG.txt 8 | include LICENSE.txt 9 | include README.rst 10 | 11 | recursive-exclude * __pycache__ *.py[cod] .DS_Store 12 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | MetadataParser 2 | ============== 3 | 4 | .. |build_status| image:: https://github.com/jvanasco/metadata_parser/workflows/Python%20package/badge.svg 5 | 6 | Build Status: |build_status| 7 | 8 | MetadataParser is a Python module for pulling metadata out of web documents. 
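A minimal example (a sketch; fetching a remote page requires network access)::

    >>> import metadata_parser
    >>> page = metadata_parser.MetadataParser(url="http://www.example.com")
    >>> page.get_metadatas('title')

More examples appear in the "Usage" section below.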
9 | 10 | `BeautifulSoup` is required for parsing. 11 | `Requests` is required for fetching remote documents. 12 | `tldextract` is utilized to parse domains, but can be disabled by setting an 13 | environment variable. 14 | 15 | This project has been used in production for many years, and has successfully 16 | parsed billions of documents. 17 | 18 | 19 | Versioning, Pinning, and Support 20 | ================================ 21 | 22 | This project is using a Semantic Versioning release schedule, 23 | with a {MAJOR}.{MINOR}.{PATCH} format. 24 | 25 | Users are advised to pin their installations to "metadata_parser<{MINOR +1}" 26 | 27 | For example: 28 | 29 | * if the current release is: `0.10.6` 30 | * the advised pin is: `metadata_parser<0.11` 31 | 32 | PATCH releases will usually be bug fixes and new features that support backwards 33 | compatibility with Public Methods. Private Methods are not guaranteed to be 34 | backwards compatible. 35 | 36 | MINOR releases are triggered when there is a breaking change to Public Methods. 37 | Once a new MINOR release is triggered, first-party support for the previous MINOR 38 | release is EOL (end of life). PRs for previous releases are welcome, but giving 39 | them proper attention is not guaranteed. 40 | 41 | Future deprecations will raise warnings. 42 | 43 | By populating the following environment variable, future deprecations will raise exceptions: 44 | 45 | export METADATA_PARSER_FUTURE=1 46 | 47 | Installation 48 | ============= 49 | 50 | pip install metadata_parser 51 | 52 | 53 | Installation Recommendation 54 | =========================== 55 | 56 | The ``requests`` library version 2.4.3 or newer is strongly recommended. 57 | 58 | This is not required, but it is better. On earlier versions it is possible to 59 | have an uncaught DecodeError exception when there is an underlying redirect/404. 60 | Recent fixes to ``requests`` improve redirect handling, urllib3 and urllib3 61 | errors. 62 | 63 | 64 | Features 65 | ======== 66 | 67 | * ``metadata_parser`` pulls as much metadata out of a document as possible 68 | * Developers can set a 'strategy' for finding metadata (i.e. only accept 69 | opengraph or page attributes) 70 | * Lightweight but functional(!) url validation 71 | * Verbose logging 72 | 73 | Logging 74 | ======= 75 | 76 | This file utilizes extensive logging to help developers pinpoint problems. 77 | 78 | * ``log.debug`` 79 | This log level is mostly used to handle library maintenance and 80 | troubleshooting, aka "Library Debugging". Library Debugging is verbose, but 81 | is nested under ``if __debug__:`` statements, so it is compiled away when 82 | PYTHONOPTIMIZE is set. 83 | Several sections of logic useful to developers will also emit logging 84 | statements at the ``debug`` level, regardless of PYTHONOPTIMIZE. 85 | 86 | * ``log.info`` 87 | Currently unused 88 | 89 | * ``log.warning`` 90 | Currently unused 91 | 92 | * ``log.error`` 93 | This log level is mostly used to alert developers of errors that were 94 | encountered during url fetching and document parsing, and often emits a log 95 | statement just before an Exception is raised. The log statements will contain 96 | at least the exception type, and may contain the active URL and additional 97 | debugging information, if any of that information is available. 98 | 99 | * ``log.critical`` 100 | Currently unused 101 | 102 | 103 | It is STRONGLY recommended to keep Python's logging at ``debug``. 
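For example, a minimal way to surface this logging during development is a
plain stdlib configuration (a sketch; adjust the level and handlers to taste)::

    >>> import logging
    >>> logging.basicConfig(level=logging.DEBUG)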
104 |
105 |
106 | Optional Integrations
107 | =====================
108 |
109 | * ``tldextract``
110 |   This package will attempt to use the package ``tldextract`` for advanced domain
111 |   and hostname analysis. If ``tldextract`` is not wanted, it can be disabled
112 |   with an environment variable.
113 |
114 |
115 | Environment Variables
116 | =====================
117 |
118 | * ``METADATA_PARSER__DISABLE_TLDEXTRACT``
119 |   Default: "0".
120 |   If set to "1", the package will not attempt to load ``tldextract``.
121 |
122 | * ``METADATA_PARSER__ENCODING_FALLBACK``
123 |   Default: "ISO-8859-1"
124 |   Used as the fallback when trying to decode a response.
125 |
126 | * ``METADATA_PARSER__DUMMY_URL``
127 |   Used as the fallback URL when calculating url data.
128 |
129 |
130 | Notes
131 | =====
132 |
133 | 1. This package requires BeautifulSoup 4.
134 | 2. For speed, it will instantiate a BeautifulSoup parser with lxml, and
135 |    fall back to 'None' (the internal pure Python parser) if it cannot load lxml.
136 | 3. URL Validation is not RFC compliant, but tries to be "Real World" compliant.
137 |
138 | It is HIGHLY recommended that you install lxml for usage.
139 | lxml is considerably faster.
140 | Considerably faster.
141 |
142 | Developers should also use a very recent version of lxml.
143 | Segfaults have been reported on lxml versions < 2.3.x;
144 | using at least the most recent 3.x versions is strongly recommended.
145 |
146 | The default 'strategy' is to look in this order::
147 |
148 |     meta, page, og, dc, twitter
149 |
150 | Which stands for the following::
151 |
152 |     og = OpenGraph
153 |     dc = DublinCore
154 |     meta = metadata
155 |     page = page elements
156 |
157 | Developers can specify a strategy as a comma-separated list of the above.
158 |
159 | The only 2 page elements currently supported are::
160 |
161 |     <title>VALUE</title> -> metadata['page']['title']
162 |     <link rel="canonical" href="VALUE"> -> metadata['page']['link']
163 |
164 | 'metadata' elements are supported by ``name`` and ``property``.
165 |
166 | The MetadataParser object also wraps some convenience functions, which can
167 | also be used standalone, that are designed to turn alleged urls into well-formed urls.
168 |
169 | For example, you may pull a page::
170 |
171 |     http://www.example.com/path/to/file.html
172 |
173 | and that file indicates a canonical url which is simply "/file.html".
174 |
175 | This package will try to 'remount' the canonical url to the absolute url of
176 | "http://www.example.com/file.html".
177 | It will return None if the end result is not a valid url.
178 |
179 | This all happens under-the-hood, and is honestly really useful when dealing
180 | with indexers and spiders.
181 |
182 |
183 | URL Validation
184 | ==============
185 |
186 | "Real World" URL validation is enabled by default. This is not RFC compliant.
187 |
188 | There are a few gaps in the RFCs that allow for "odd behavior".
189 | Just about any use-case for this package will desire/expect rules that parse
190 | URLs "in the wild", not theoretical ones.
191 |
192 | The differences:
193 |
194 | * If an entirely numeric ip address is encountered, it is assumed to be a
195 |   dot-notation IPV4 and it is checked to have the right number of valid octets.
196 |
197 |   The default behavior is to invalidate these hosts::
198 |
199 |     http://256.256.256.256
200 |     http://999.999.999.999.999
201 |
202 |   According to RFCs those are valid hostnames that would fail as "IP Addresses"
203 |   but pass as "Domain Names". However in the real world, one would never
204 |   encounter domain names like those.
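  For example, the behavior can be spot-checked with the module-level helper
  (a sketch; ``is_url_valid`` is assumed to be importable from the package root
  and to return a falsy value for rejected hosts)::

      >>> import metadata_parser
      >>> metadata_parser.is_url_valid("http://256.256.256.256")
      False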
205 |
206 | * The only non-domain hostname that is allowed is "localhost"
207 |
208 |   The default behavior is to invalidate these hosts::
209 |
210 |     http://example
211 |     http://examplecom
212 |
213 |   Those are considered to be valid hosts, and might exist on a local network or
214 |   custom hosts file. However, they are not part of the public internet.
215 |
216 | Although this behavior breaks RFCs, it greatly reduces the number of
217 | "False Positives" generated when analyzing internet pages. If you want to
218 | include bad data, you can submit a kwarg to ``MetadataParser.__init__``.
219 |
220 |
221 | Handling Bad URLs and Encoded URIs
222 | ==================================
223 |
224 | This library tries to safeguard against a few common situations.
225 |
226 | Encoded URIs and relative urls
227 | ------------------------------
228 |
229 | Most website publishers will define an image as a URL::
230 |
231 |     <meta property="og:image" content="http://example.com/image.jpg"/>
232 |
233 | Some will define an image as an encoded URI::
234 |
235 |     <meta property="og:image" content="data:image/png;base64,..."/>
236 |
237 | By default, the ``get_metadata_link()`` method can be used to ensure a valid link
238 | is extracted from the metadata payload::
239 |
240 |     >>> import metadata_parser
241 |     >>> page = metadata_parser.MetadataParser(url="http://www.example.com")
242 |     >>> print(page.get_metadata_link('image'))
243 |
244 | This method accepts a kwarg ``allow_encoded_uri`` (default False) which will
245 | return the image without further processing::
246 |
247 |     >>> print(page.get_metadata_link('image', allow_encoded_uri=True))
248 |
249 | Similarly, if a url is local::
250 |
251 |     <meta property="og:image" content="/image.jpg"/>
252 |
253 | The ``get_metadata_link`` method will automatically upgrade it onto the domain::
254 |
255 |     >>> print(page.get_metadata_link('image'))
256 |     http://example.com/image.jpg
257 |
258 | Poorly Constructed Canonical URLs
259 | ---------------------------------
260 |
261 | Many website publishers implement canonical URLs incorrectly.
262 | This package tries to fix that.
263 |
264 | By default ``MetadataParser`` is constructed with ``require_public_netloc=True``
265 | and ``allow_localhosts=True``.
266 |
267 | This will require somewhat valid 'public' network locations in the url.
268 |
269 | For example, these will all be valid URLs::
270 |
271 |     http://example.com
272 |     http://1.2.3.4
273 |     http://localhost
274 |     http://127.0.0.1
275 |     http://0.0.0.0
276 |
277 | If these known 'localhost' urls are not wanted, they can be filtered out with
278 | ``allow_localhosts=False``::
279 |
280 |     http://localhost
281 |     http://127.0.0.1
282 |     http://0.0.0.0
283 |
284 | There are two convenience methods that can be used to get a canonical url or
285 | calculate the effective url::
286 |
287 |     * MetadataParser.get_discrete_url
288 |     * MetadataParser.get_metadata_link
289 |
290 | These both accept an argument ``require_public_global``, which defaults to ``True``.
291 |
292 | Assuming we have the following content on the url ``http://example.com/path/to/foo``::
293 |
294 |     <link rel="canonical" href="http://localhost:8000/alt-path/to/foo">
295 |
296 | By default, versions 0.9.0 and later will detect 'localhost:8000' as an
297 | improper canonical url, and remount the local part "/alt-path/to/foo" onto the
298 | domain that served the file.
The vast majority of times this 'behavior'
299 | has been encountered, this is the intended canonical::
300 |
301 |     print(page.get_discrete_url())
302 |     >>> http://example.com/alt-path/to/foo
303 |
304 | In contrast, versions 0.8.3 and earlier will not catch this situation::
305 |
306 |     print(page.get_discrete_url())
307 |     >>> http://localhost:8000/alt-path/to/foo
308 |
309 | In order to preserve the earlier behavior, just submit ``require_public_global=False``::
310 |
311 |     print(page.get_discrete_url(require_public_global=False))
312 |     >>> http://localhost:8000/alt-path/to/foo
313 |
314 |
315 | Handling Bad Data
316 | =================
317 |
318 | Many CMS systems (and developers) create malformed content or incorrect
319 | document identifiers. When this happens, the BeautifulSoup parser will lose
320 | data or move it into an unexpected place.
321 |
322 | There are two arguments that can help you analyze this data:
323 |
324 | * force_doctype::
325 |
326 |     ``MetadataParser(..., force_doctype=True, ...)``
327 |
328 |   ``force_doctype=True`` will try to replace the identified doctype with "html"
329 |   via regex. This will often make the input data usable by BS4.
330 |
331 | * search_head_only::
332 |
333 |     ``MetadataParser(..., search_head_only=False, ...)``
334 |
335 |   ``search_head_only=False`` will not limit the search path to the ``<head>`` element.
336 |   This will have a slight performance hit and will incorporate data from CMS/User
337 |   content, not just templates/Site-Operators.
338 |
339 |
340 | WARNING
341 | =============
342 |
343 | Please pin your releases.
344 |
345 |
346 | Usage
347 | =====
348 |
349 | Until version ``0.9.19``, the recommended way to get metadata was ``get_metadata``,
350 | which returns a string (or None). Current releases use ``get_metadatas``:
351 |
352 | **From a URL**::
353 |
354 |     >>> import metadata_parser
355 |     >>> page = metadata_parser.MetadataParser(url="http://www.example.com")
356 |     >>> print(page.metadata)
357 |     >>> print(page.get_metadatas('title'))
358 |     >>> print(page.get_metadatas('title', strategy=['og',]))
359 |     >>> print(page.get_metadatas('title', strategy=['page', 'og', 'dc',]))
360 |
361 | **From HTML**::
362 |
363 |     >>> HTML = """<html><head><title>Example Title</title></head><body></body></html>"""
364 |     >>> page = metadata_parser.MetadataParser(html=HTML)
365 |     >>> print(page.metadata)
366 |     >>> print(page.get_metadatas('title'))
367 |     >>> print(page.get_metadatas('title', strategy=['og',]))
368 |     >>> print(page.get_metadatas('title', strategy=['page', 'og', 'dc',]))
369 |
370 |
371 | Malformed Data
372 | ==============
373 |
374 | It is very common to find malformed data. As of version ``0.9.20`` the following
375 | methods should be used to allow parsing of malformed markup::
376 |
377 |     >>> page = metadata_parser.MetadataParser(html=HTML, support_malformed=True)
378 |
379 | or::
380 |
381 |     >>> parsed = page.parse(html=html, support_malformed=True)
382 |     >>> parsed = page.parse(html=html, support_malformed=False)
383 |
384 | The above options will support parsing of commonly malformed markup. Currently
385 | this only looks at alternate (improper) ways of producing twitter tags, but may
386 | be expanded.
387 |
388 | Notes
389 | =====
390 |
391 | When building on Python3, a ``static`` toplevel directory may be needed.
392 |
393 | This library was originally based on Erik River's
394 | `opengraph module `_. Something more
395 | aggressive than Erik's module was needed, so this project was started.
396 | -------------------------------------------------------------------------------- /TODO.txt: -------------------------------------------------------------------------------- 1 | 1.0.0 2 | tests needed for: 3 | select_first_strategy 4 | try to break it 5 | select different strategies, different data on each -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | check_untyped_defs = True 3 | exclude = (?x)( 4 | ^setup\.py$ 5 | | ^workspace-demos\/ 6 | ) 7 | 8 | [mypy-httpbin.*] 9 | ignore_missing_imports = True 10 | 11 | [mypy-pytest_httpbin.*] 12 | ignore_missing_imports = True 13 | 14 | [mypy-requests_toolbelt.*] 15 | ignore_missing_imports = True 16 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 88 3 | target-version = ['py37'] 4 | exclude = ''' 5 | ( 6 | /( 7 | \.eggs 8 | | \.tox 9 | | build 10 | | venv-* 11 | ) 12 | ) 13 | ''' -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | 3 | filterwarnings = 4 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | application_import_names = metadata_parser 3 | import_order_style = appnexus 4 | exclude = .eggs/*, .pytest_cache/*, .tox/*, build/*, dist/*, workspace-demos/* 5 | max_line_length = 88 6 | 7 | # ignore = E402,E501,W503 8 | # E501: line too long 9 | # F401: imported but unused 10 | # I202: Additional newline in a group of imports 11 | per-file-ignores = 12 | setup.py: 13 | src/metadata_parser/__init__.py: E501 14 | src/metadata_parser/regex.py: E501 15 | tests/*: E501 16 | tests/_compat.py: F401 17 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # stdlib 2 | import os 3 | import re 4 | import sys 5 | 6 | # pypi 7 | from setuptools import find_packages 8 | from setuptools import setup 9 | 10 | # ============================================================================== 11 | 12 | 13 | HERE = os.path.abspath(os.path.dirname(__file__)) 14 | 15 | # store version in the init.py 16 | with open( 17 | os.path.join(HERE, "src", "metadata_parser", "__init__.py") 18 | ) as v_file: # noqa: E501 19 | VERSION = ( 20 | re.compile(r'.*__VERSION__ = "(.*?)"', re.S).match(v_file.read()).group(1) 21 | ) # noqa: E501 22 | 23 | long_description = description = ( 24 | "A module to parse metadata out of urls and html documents" 25 | ) 26 | with open(os.path.join(HERE, "README.rst")) as fp: 27 | long_description = fp.read() 28 | 29 | requires = [ 30 | "BeautifulSoup4>4.13.0,<4.14.0", 31 | "requests>=2.19.1", 32 | "requests-toolbelt>=0.8.0", 33 | "typing_extensions", 34 | ] 35 | 36 | if sys.version_info >= (3, 13): 37 | requires.append("legacy-cgi") 38 | 39 | tests_require = [ 40 | "httpbin", 41 | "pytest", 42 | "pytest-httpbin", 43 | "responses", 44 | "tldextract", 45 | "types-beautifulsoup4", 46 | "types-requests", 47 | "werkzeug<2.1.0", # httpbin compat issue 48 | ] 49 | testing_extras = tests_require + [] 50 | 51 | # go 52 | setup( 53 | 
name="metadata_parser", 54 | version=VERSION, 55 | description=description, 56 | long_description=long_description, 57 | classifiers=[ 58 | "Intended Audience :: Developers", 59 | "License :: OSI Approved :: MIT License", 60 | "Programming Language :: Python :: 3", 61 | "Programming Language :: Python :: 3.7", 62 | "Programming Language :: Python :: 3.8", 63 | "Programming Language :: Python :: 3.9", 64 | "Programming Language :: Python :: 3.10", 65 | "Programming Language :: Python :: 3.11", 66 | "Programming Language :: Python :: 3.12", 67 | "Programming Language :: Python :: 3.13", 68 | "Topic :: Text Processing :: Markup :: HTML", 69 | "Topic :: Software Development :: Libraries :: Python Modules", 70 | ], 71 | keywords="opengraph protocol facebook", 72 | author="Jonathan Vanasco", 73 | author_email="jonathan@findmeon.com", 74 | url="https://github.com/jvanasco/metadata_parser", 75 | license="MIT", 76 | test_suite="tests", 77 | packages=find_packages( 78 | where="src", 79 | ), 80 | package_dir={"": "src"}, 81 | package_data={"metadata_parser": ["py.typed"]}, 82 | include_package_data=True, 83 | zip_safe=False, 84 | install_requires=requires, 85 | tests_require=tests_require, 86 | extras_require={ 87 | "testing": testing_extras, 88 | }, 89 | entry_points=""" 90 | # -*- Entry points: -*- 91 | """, 92 | ) 93 | -------------------------------------------------------------------------------- /src/metadata_parser/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | # ============================================================================== 4 | 5 | # defaults 6 | DISABLE_TLDEXTRACT = bool( 7 | int(os.environ.get("METADATA_PARSER__DISABLE_TLDEXTRACT", "0")) 8 | ) 9 | DUMMY_URL = os.environ.get( 10 | "METADATA_PARSER__DUMMY_URL", "http://example.com/index.html" 11 | ) 12 | ENCODING_FALLBACK = os.environ.get("METADATA_PARSER__ENCODING_FALLBACK", "ISO-8859-1") 13 | FUTURE_BEHAVIOR = bool(int(os.getenv("METADATA_PARSER_FUTURE", "0"))) 14 | TESTING = bool(int(os.environ.get("METADATA_PARSER__TESTING", "0"))) 15 | 16 | """ 17 | # currently unused 18 | MAX_CONNECTIONTIME = int( 19 | os.environ.get("METADATA_PARSER__MAX_CONNECTIONTIME", 20) 20 | ) # in seconds 21 | MAX_FILESIZE = int( 22 | os.environ.get("METADATA_PARSER__MAX_FILESIZE", 2 ** 19) 23 | ) # bytes; this is .5MB 24 | """ 25 | -------------------------------------------------------------------------------- /src/metadata_parser/exceptions.py: -------------------------------------------------------------------------------- 1 | # stdlib 2 | from typing import Optional 3 | from typing import TYPE_CHECKING 4 | 5 | if TYPE_CHECKING: 6 | import requests 7 | from . 
import MetadataParser 8 | from .typing import TYPES_RESPONSE 9 | 10 | # ============================================================================== 11 | 12 | 13 | class AllowableError(Exception): 14 | pass 15 | 16 | 17 | class InvalidDocument(Exception): 18 | 19 | def __str__(self) -> str: 20 | return "InvalidDocument: %s" % (self.args[0]) 21 | 22 | 23 | class InvalidStrategy(ValueError): 24 | 25 | def __str__(self) -> str: 26 | return "InvalidStrategy: %s" % (self.args[0]) 27 | 28 | 29 | class NotParsable(Exception): 30 | code: Optional[int] 31 | metadataParser: Optional["MetadataParser"] 32 | raised: Optional["requests.exceptions.RequestException"] 33 | response: Optional["TYPES_RESPONSE"] 34 | 35 | def __init__( 36 | self, 37 | message: str = "", 38 | raised: Optional["requests.exceptions.RequestException"] = None, 39 | code: Optional[int] = None, 40 | metadataParser: Optional["MetadataParser"] = None, 41 | response: Optional["TYPES_RESPONSE"] = None, 42 | ): 43 | super().__init__(message, raised, code, metadataParser, response) 44 | self.code = code 45 | self.message = message 46 | self.metadataParser = metadataParser 47 | self.raised = raised 48 | self.response = response 49 | 50 | def __str__(self) -> str: 51 | return "NotParsable: %s | %s | %s" % (self.message, self.code, self.raised) 52 | 53 | 54 | class NotParsableJson(NotParsable): 55 | def __str__(self) -> str: 56 | return "NotParsableJson: %s | %s | %s" % (self.message, self.code, self.raised) 57 | 58 | 59 | class NotParsableRedirect(NotParsable): 60 | """Raised if a redirect is detected, but there is no Location header.""" 61 | 62 | def __str__(self) -> str: 63 | return "NotParsableRedirect: %s | %s | %s" % ( 64 | self.message, 65 | self.code, 66 | self.raised, 67 | ) 68 | 69 | 70 | class NotParsableFetchError(NotParsable): 71 | def __str__(self) -> str: 72 | return "NotParsableFetchError: %s | %s | %s" % ( 73 | self.message, 74 | self.code, 75 | self.raised, 76 | ) 77 | 78 | 79 | class RedirectDetected(Exception): 80 | """ 81 | Raised if a redirect is detected 82 | Instance properties: 83 | 84 | ``location``: redirect location 85 | ``code``: status code of the response 86 | ``response``: actual response object 87 | """ 88 | 89 | code: Optional[int] 90 | location: str 91 | metadataParser: Optional["MetadataParser"] 92 | response: Optional["TYPES_RESPONSE"] 93 | 94 | def __init__( 95 | self, 96 | location: str = "", 97 | code: Optional[int] = None, 98 | response: Optional["TYPES_RESPONSE"] = None, 99 | metadataParser: Optional["MetadataParser"] = None, 100 | ): 101 | super().__init__(location, code, response, metadataParser) 102 | self.code = code 103 | self.location = location 104 | self.metadataParser = metadataParser 105 | self.response = response 106 | -------------------------------------------------------------------------------- /src/metadata_parser/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jvanasco/metadata_parser/198fed802787ef86010357be399ae33cbc85ccc7/src/metadata_parser/py.typed -------------------------------------------------------------------------------- /src/metadata_parser/regex.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | # ============================================================================== 4 | 5 | # regex library 6 | 7 | RE_ALL_NUMERIC = re.compile(r"^[\d\.]+$") 8 | RE_bad_title = re.compile( 9 | r"""(?:|<title>)(.*)(?:<?/title>|(?:<)?/title>)""", re.I 
10 | ) 11 | RE_canonical = re.compile("^canonical$", re.I) 12 | RE_doctype = re.compile(r"^\s*<!DOCTYPE[^>]*>", re.IGNORECASE) 13 | RE_DOMAIN_NAME = re.compile( 14 | r"""(^ 15 | (?: 16 | [A-Z0-9] 17 | (?: 18 | [A-Z0-9-]{0,61} 19 | [A-Z0-9] 20 | )? 21 | \. 22 | )+ 23 | (?: 24 | [A-Z]{2,6}\.? 25 | | 26 | [A-Z0-9-]{2,} 27 | (?<!-)\.?) 28 | $)""", 29 | re.VERBOSE | re.IGNORECASE, 30 | ) 31 | RE_IPV4_ADDRESS = re.compile( 32 | r"^(\d{1,3})\.(\d{1,3}).(\d{1,3}).(\d{1,3})$" # grab 4 octets 33 | ) 34 | RE_PORT = re.compile(r"^" r"(?P<main>.+)" r":" r"(?P<port>\d+)" r"$", re.IGNORECASE) 35 | RE_prefix_opengraph = re.compile(r"^og") 36 | RE_prefix_rel_img_src = re.compile("^image_src$", re.I) 37 | RE_prefix_twitter = re.compile(r"^twitter") 38 | 39 | # we may need to test general validity of url components 40 | RE_rfc3986_valid_characters = re.compile( 41 | r"""^[a-z0-9\-\.\_\~\:\/\?\#\[\]\@\!\$\&\'\(\)\*\+\,\;\=\%]+$""", re.I 42 | ) 43 | r""" 44 | What is valid in the RFC? 45 | # don't need escaping 46 | rfc3986_unreserved__noescape = ['a-z', '0-9', ] 47 | 48 | # do need escaping 49 | rfc3986_unreserved__escape = ['-', '.', '_', '~', ] 50 | rfc3986_gen_delims__escape = [":", "/", "?", "#", "[", "]", "@", ] 51 | rfc3986_sub_delims__escape = ["!", "$", "&", "'", "(", ")", "*", "+", ",", ";", "=", ] 52 | rfc3986_pct_encoded__escape = ["%", ] 53 | rfc3986__escape = rfc3986_unreserved__escape + rfc3986_gen_delims__escape + rfc3986_sub_delims__escape + rfc3986_pct_encoded__escape 54 | rfc3986__escaped = re.escape(''.join(rfc3986__escape)) 55 | rfc3986_chars = ''.join(rfc3986_unreserved__noescape) + rfc3986__escaped 56 | print rfc3986_chars 57 | 58 | a-z0-9\-\.\_\~\:\/\?\#\[\]\@\!\$\&\'\(\)\*\+\,\;\=\% 59 | """ 60 | 61 | RE_shortlink = re.compile("^shortlink$", re.I) 62 | RE_whitespace = re.compile(r"\s+") 63 | 64 | # based on DJANGO 65 | # https://github.com/django/django/blob/master/django/core/validators.py 66 | # not testing ipv6 right now, because rules are needed for ensuring they 67 | # are correct 68 | RE_VALID_NETLOC = re.compile( 69 | r"(?:" 70 | r"(?P<ipv4>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})" 71 | r"|" # ...or ipv4 72 | # r'(?P<ipv6>\[?[A-F0-9]*:[A-F0-9:]+\]?)' # ...or ipv6 73 | # r'|' 74 | r"(?P<localhost>localhost)" # localhost... 75 | r"|" 76 | r"(?P<domain>([A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}(?<!-)\.?))" # domain... 77 | r"(?P<port>:\d+)?" # optional port 78 | r")", 79 | re.IGNORECASE, 80 | ) 81 | -------------------------------------------------------------------------------- /src/metadata_parser/requests_extensions.py: -------------------------------------------------------------------------------- 1 | import _socket # noqa: I201 2 | 3 | # stdlib 4 | import cgi # noqa: I100 5 | import logging 6 | import socket 7 | from typing import Optional 8 | from typing import Tuple 9 | from typing import TYPE_CHECKING 10 | 11 | # pypi 12 | import requests 13 | from requests_toolbelt.utils.deprecated import get_encodings_from_content 14 | 15 | # local 16 | from . 
import config 17 | from .exceptions import AllowableError 18 | from .utils import DummyResponse 19 | from .utils import safe_sample 20 | 21 | 22 | if TYPE_CHECKING: 23 | from requests.structures import CaseInsensitiveDict 24 | from .typing import TYPES_PEERNAME 25 | from .typing import TYPES_RESPONSE 26 | 27 | 28 | # ============================================================================== 29 | 30 | log = logging.getLogger("metdata_parser") 31 | 32 | # ------------------------------------------------------------------------------ 33 | 34 | 35 | # peername hacks 36 | # only use for these stdlib packages 37 | # eventually will not be needed thanks to upstream changes in `requests` 38 | try: 39 | _compatible_sockets: Tuple = ( 40 | _socket.socket, 41 | socket._socketobject, # type: ignore[attr-defined] 42 | ) 43 | except AttributeError: 44 | _compatible_sockets: Tuple = (_socket.socket,) # type: ignore[no-redef] 45 | 46 | 47 | def derive_encoding__hook(resp: "TYPES_RESPONSE", *args, **kwargs) -> None: 48 | """ 49 | a note about `requests` 50 | 51 | `response.content` is the raw response bytes 52 | `response.text` is `response.content` decoded to the identified codec or 53 | the fallback codec. 54 | 55 | This fallback codec is normally iso-8859-1 (latin-1) which is defined by the 56 | RFC for HTTP as the default when no codec is provided in the headers or 57 | body. This hook exists because users in certain regions may expect the 58 | servers to not follow RFC and for the default encoding to be different. 59 | """ 60 | if TYPE_CHECKING: 61 | assert hasattr(resp, "_encoding_fallback") 62 | assert hasattr(resp, "_encoding_content") 63 | assert hasattr(resp, "_encoding_headers") 64 | 65 | resp._encoding_fallback = config.ENCODING_FALLBACK 66 | # modified version, returns `None` if no charset available 67 | resp._encoding_headers = get_encoding_from_headers(resp.headers) 68 | resp._encoding_content = None 69 | if not resp._encoding_headers and resp.content: 70 | # html5 spec requires a meta-charset in the first 1024 bytes 71 | _sample = safe_sample(resp.content) 72 | resp._encoding_content = get_encodings_from_content(_sample) 73 | if resp._encoding_content: 74 | # it's a list 75 | resp.encoding = resp._encoding_content[0] 76 | else: 77 | resp.encoding = resp._encoding_headers or resp._encoding_fallback 78 | # do not return anything 79 | 80 | 81 | def get_encoding_from_headers(headers: "CaseInsensitiveDict") -> Optional[str]: 82 | """ 83 | Returns encodings from given HTTP Header Dict. 84 | 85 | :param headers: dictionary to extract encoding from. 86 | :rtype: str 87 | 88 | `requests.get("http://example.com").headers` 89 | should be `requests.structures.CaseInsensitiveDict` 90 | 91 | ---------------------------------------------------------------------------- 92 | 93 | Modified from `requests` version 2.x 94 | 95 | The Requests Library: 96 | 97 | Copyright 2017 Kenneth Reitz 98 | 99 | Licensed under the Apache License, Version 2.0 (the "License"); 100 | you may not use this file except in compliance with the License. 101 | You may obtain a copy of the License at 102 | 103 | http://www.apache.org/licenses/LICENSE-2.0 104 | 105 | Unless required by applicable law or agreed to in writing, software 106 | distributed under the License is distributed on an "AS IS" BASIS, 107 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 108 | See the License for the specific language governing permissions and 109 | limitations under the License. 
110 | """ 111 | content_type = headers.get("content-type") 112 | if not content_type: 113 | return None 114 | content_type, params = cgi.parse_header(content_type) 115 | if "charset" in params: 116 | return params["charset"].strip("'\"") 117 | return None 118 | 119 | 120 | # ------------------------------------------------------------------------------ 121 | 122 | 123 | def get_response_peername(resp: "TYPES_RESPONSE") -> Optional["TYPES_PEERNAME"]: 124 | """ 125 | used to get the peername (ip+port) data from the request 126 | if a socket is found, caches this onto the request object 127 | 128 | IMPORTANT. this must happen BEFORE any content is consumed. 129 | 130 | `response` is really `requests.models.Response` 131 | 132 | This will UPGRADE the response object to have the following attribute: 133 | 134 | * _mp_peername 135 | """ 136 | if not isinstance(resp, requests.Response) and not isinstance(resp, DummyResponse): 137 | # raise AllowableError("Not a HTTPResponse") 138 | log.debug("Not a supported HTTPResponse | %s", resp) 139 | log.debug("-> received a type of: %s", type(resp)) 140 | return None 141 | 142 | if hasattr(resp, "_mp_peername"): 143 | return resp._mp_peername 144 | 145 | def _get_socket() -> Optional[socket.socket]: 146 | if isinstance(resp, DummyResponse): 147 | return None 148 | i = 0 149 | while True: 150 | i += 1 151 | try: 152 | if i == 1: 153 | sock = resp.raw._connection.sock # type: ignore[union-attr] 154 | elif i == 2: 155 | sock = resp.raw._connection.sock.socket # type: ignore[union-attr] 156 | elif i == 3: 157 | sock = resp.raw._fp.fp._sock # type: ignore[union-attr] 158 | elif i == 4: 159 | sock = resp.raw._fp.fp._sock.socket # type: ignore[union-attr] 160 | elif i == 5: 161 | sock = resp.raw._fp.fp.raw._sock # type: ignore[union-attr] 162 | else: 163 | break 164 | if not isinstance(sock, _compatible_sockets): 165 | raise AllowableError() 166 | return sock 167 | except Exception: 168 | pass 169 | return None 170 | 171 | sock = _get_socket() 172 | if sock: 173 | # only cache if we have a sock 174 | # we may want/need to call again 175 | resp._mp_peername = sock.getpeername() # type: ignore [union-attr] 176 | else: 177 | resp._mp_peername = None # type: ignore [union-attr] 178 | return resp._mp_peername # type: ignore [union-attr] 179 | 180 | 181 | # ------------------------------------------------------------------------------ 182 | 183 | 184 | def response_peername__hook(resp: "TYPES_RESPONSE", *args, **kwargs) -> None: 185 | get_response_peername(resp) 186 | # do not return anything 187 | -------------------------------------------------------------------------------- /src/metadata_parser/typing.py: -------------------------------------------------------------------------------- 1 | # stdlib 2 | from typing import Callable 3 | from typing import Dict 4 | from typing import List 5 | from typing import Optional 6 | from typing import Tuple 7 | from typing import TYPE_CHECKING 8 | from typing import Union 9 | 10 | # pypi 11 | from typing_extensions import Protocol # py38 12 | 13 | if TYPE_CHECKING: 14 | import requests 15 | from urllib.parse import ParseResult 16 | from . import DummyResponse 17 | from . 
import ResponseHistory 18 | 19 | # ============================================================================== 20 | 21 | # TYPE_ENCODER = Callable[[str, Optional[str]], str] # def encode(value, strategy) 22 | TYPE_ENCODER = Callable[ 23 | [str, Optional[str]], Union[str, Dict] 24 | ] # def encode(value, strategy) 25 | TYPE_REQUESTS_TIMEOUT = Optional[ 26 | Union[int, float, Tuple[int, int], Tuple[float, float]] 27 | ] 28 | TYPE_URL_FETCH = Tuple[str, str, "ResponseHistory"] 29 | TYPE_URLPARSE = Callable[[str], "ParseResult"] 30 | TYPES_PEERNAME = Tuple[str, int] # (ip, port) 31 | TYPES_RESPONSE = Union["DummyResponse", "requests.Response"] 32 | TYPES_STRATEGY = Union[List[str], str, None] 33 | 34 | 35 | class _UrlParserCacheable(Protocol): 36 | urlparse: TYPE_URLPARSE 37 | -------------------------------------------------------------------------------- /src/metadata_parser/utils.py: -------------------------------------------------------------------------------- 1 | # stdlib 2 | import datetime 3 | from html import unescape as html_unescape 4 | import logging 5 | from typing import AnyStr 6 | from typing import Callable 7 | from typing import Dict 8 | from typing import List 9 | from typing import Optional 10 | from typing import TYPE_CHECKING 11 | from typing import Union 12 | import unicodedata 13 | from urllib.parse import quote as url_quote 14 | from urllib.parse import unquote as url_unquote 15 | from urllib.parse import urlparse 16 | from urllib.parse import urlunparse 17 | import warnings 18 | 19 | # pypi 20 | from requests.structures import CaseInsensitiveDict 21 | from requests_toolbelt.utils.deprecated import get_encodings_from_content 22 | 23 | # local 24 | from . import config 25 | from .regex import RE_rfc3986_valid_characters 26 | 27 | 28 | if TYPE_CHECKING: 29 | from urllib.parse import ParseResult 30 | 31 | 32 | # ============================================================================== 33 | 34 | log = logging.getLogger("metdata_parser") 35 | 36 | # ------------------------------------------------------------------------------ 37 | 38 | 39 | class DummyResponse(object): 40 | """ 41 | A DummyResponse is used to ensure compatibility between url fetching 42 | and html data 43 | """ 44 | 45 | text: str 46 | url: str 47 | status_code: int 48 | encoding: str 49 | elapsed_seconds: float = 0 50 | history: List 51 | headers: CaseInsensitiveDict 52 | content: Optional[Union[str, bytes]] = None 53 | default_encoding: str 54 | 55 | def __init__( 56 | self, 57 | text: str = "", 58 | url: str = "", 59 | status_code: int = 200, 60 | encoding: Optional[str] = None, 61 | elapsed_seconds: float = 0, 62 | headers: Optional[CaseInsensitiveDict] = None, 63 | content: Optional[AnyStr] = None, 64 | derive_encoding: Optional[bool] = None, 65 | default_encoding: Optional[str] = None, 66 | ): 67 | self.text = text 68 | self.url = url 69 | self.status_code = status_code 70 | self.elapsed = datetime.timedelta(0, elapsed_seconds) 71 | self.headers = headers if headers is not None else CaseInsensitiveDict() 72 | self.history = [] 73 | self.content = content 74 | 75 | # start `encoding` block 76 | if encoding: 77 | self.encoding = encoding 78 | elif derive_encoding: 79 | # only examine first 1024 bytes. in this case chars. 
utf could be 4x chars 80 | _sample = safe_sample(text) 81 | encodings = get_encodings_from_content(_sample) 82 | if encodings: 83 | self.encoding = encoding = encodings[0] 84 | self.default_encoding = default_encoding or config.ENCODING_FALLBACK 85 | # second phase cleanup 86 | if not encoding: 87 | self.encoding = self.default_encoding 88 | # end `encoding` block 89 | 90 | 91 | def decode_html(raw: Union[str, Dict], strategy: Optional[str] = None) -> str: 92 | """ 93 | helper function to decode text that has both HTML and non-ascii characters 94 | """ 95 | if isinstance(raw, dict): 96 | if strategy == "dc": 97 | return decode_html(raw.get("content", "")) 98 | raise ValueError("strategy `%s` not known to support `dict") 99 | text = encode_ascii(html_unescape(raw)) 100 | return text 101 | 102 | 103 | def encode_ascii(raw: Union[str, Dict], strategy: Optional[str] = None) -> str: 104 | """ 105 | helper function to force ascii; 106 | some edge-cases have unicode line breaks in titles/etc. 107 | 108 | reference function for `encoder` 109 | 110 | The first arg is a `raw` value to be encoded, which will either be a str or 111 | dict. The second arg is an (optional) str that identifies the strategy. 112 | 113 | When invoked by `MetadataParser.get_metadatas()` the strategy will always 114 | be sent. 115 | """ 116 | if isinstance(raw, dict): 117 | if strategy == "dc": 118 | return encode_ascii(raw.get("content", "")) 119 | raise ValueError("strategy `%s` not known to support `dict") 120 | if not raw: 121 | raw = "" 122 | _as_bytes = unicodedata.normalize("NFKD", raw).encode("ascii", "ignore") 123 | _as_str = _as_bytes.decode("utf-8", "ignore") 124 | return _as_str 125 | 126 | 127 | def fix_unicode_url( 128 | url: str, 129 | encoding: Optional[str] = None, 130 | urlparser: Callable[[str], "ParseResult"] = urlparse, 131 | ) -> str: 132 | """ 133 | some cms systems will put unicode in their canonical url 134 | this is not allowed by rfc. 135 | currently this function will update the PATH but not the kwargs. 136 | perhaps it should. 137 | rfc3986 says that characters should be put into utf8 then percent encoded 138 | 139 | kwargs: 140 | `encoding` - used for python2 encoding 141 | `urlparser` - defaults to standard `urlparse`, can be substituted with 142 | a cacheable version. 143 | """ 144 | parsed = urlparser(url) 145 | if parsed.path in ("", "/"): 146 | # can't do anything 147 | return url 148 | if RE_rfc3986_valid_characters.match(parsed.path): 149 | # again, can't do anything 150 | return url 151 | # okay, we know we have bad items in the path, so try and upgrade! 
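# illustrative example (not from the source): a canonical path such as "/café"
# should come out of the round-trip below as "/caf%C3%A9"; url_unquote() runs
# first so already-escaped input is not double-encoded, then url_quote()
# percent-encodes the resulting utf-8 bytes per rfc3986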
152 | # turn the namedtuple from urlparse into something we can edit 153 | candidate = [i for i in parsed] 154 | for _idx in [2]: # 2=path, 3=params, 4=queryparams, 5fragment 155 | try: 156 | candidate[_idx] = parsed[_idx] 157 | candidate[_idx] = url_quote(url_unquote(candidate[_idx])) 158 | except Exception as exc: 159 | log.debug("fix_unicode_url failure: %s | %s | %s", url, encoding, exc) 160 | return url 161 | _url = urlunparse(candidate) 162 | return _url 163 | 164 | 165 | def safe_sample(source: Union[str, bytes]) -> bytes: 166 | if isinstance(source, bytes): 167 | _sample = source[:1024] 168 | else: 169 | # this block can cause an error on PY3 depending on where the data came 170 | # from such as what the source is (from a request vs a document/test) 171 | # thanks, @keyz182 for the PR/investigation 172 | # https://github.com/jvanasco/metadata_parser/pull/16 173 | _sample = (source.encode())[:1024] 174 | return _sample 175 | 176 | 177 | def warn_future(message: str) -> None: 178 | warnings.warn(message, FutureWarning, stacklevel=2) 179 | if config.FUTURE_BEHAVIOR: 180 | raise ValueError(message) 181 | 182 | 183 | def warn_user(message: str) -> None: 184 | warnings.warn(message, UserWarning, stacklevel=2) 185 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jvanasco/metadata_parser/198fed802787ef86010357be399ae33cbc85ccc7/tests/__init__.py -------------------------------------------------------------------------------- /tests/html_scaffolds/charset_a.html: -------------------------------------------------------------------------------- 1 | <html> 2 | <head> 3 | <!-- meta attributes --> 4 | <meta http-equiv="content-type" content="text/html; charset=UTF-8"> 5 | </head> 6 | <body> 7 | </body> 8 | </html> -------------------------------------------------------------------------------- /tests/html_scaffolds/charset_b.html: -------------------------------------------------------------------------------- 1 | <html> 2 | <head> 3 | <!-- meta attributes --> 4 | <meta name="charset" charset="UTF-8"> 5 | </head> 6 | <body> 7 | </body> 8 | </html> -------------------------------------------------------------------------------- /tests/html_scaffolds/charset_c.html: -------------------------------------------------------------------------------- 1 | <html> 2 | <head> 3 | <!-- meta attributes --> 4 | <meta charset="UTF-8"> 5 | </head> 6 | <body> 7 | </body> 8 | </html> -------------------------------------------------------------------------------- /tests/html_scaffolds/duplicates.html: -------------------------------------------------------------------------------- 1 | <html> 2 | <head> 3 | <!-- core page attributes --> 4 | <title>title 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 54 | 57 | 59 | 62 | 64 | 67 | 69 | 72 | 73 | 74 | 75 | 77 | 78 | 81 | 82 | 85 | 87 | 88 | 90 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 102 | 103 | 104 | 105 | 107 | 110 | 111 | 112 | 113 | 114 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | body 147 | first image on page 148 | 149 | 
-------------------------------------------------------------------------------- /tests/html_scaffolds/simple.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | title 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | body 36 | first image on page 37 | 38 | -------------------------------------------------------------------------------- /tests/test_document_parsing.py: -------------------------------------------------------------------------------- 1 | # stdlib 2 | import os 3 | from typing import Callable 4 | from typing import Dict 5 | from typing import List 6 | from typing import Optional 7 | from typing import Tuple 8 | from typing import Union 9 | import unittest 10 | 11 | # local 12 | import metadata_parser 13 | from metadata_parser import MetadataParser 14 | from metadata_parser import urlparse 15 | from metadata_parser.exceptions import InvalidStrategy 16 | 17 | 18 | # ============================================================================== 19 | 20 | 21 | # this bit lets us run the tests directly during development 22 | _tests_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) 23 | if _tests_dir.endswith("metadata_parser"): 24 | _tests_dir = os.path.join(_tests_dir, "tests") 25 | _examples_dir = os.path.join(_tests_dir, "html_scaffolds") 26 | 27 | # cache these lazily 28 | CACHED_FILESYSTEM_DOCUMENTS = {} 29 | 30 | 31 | doc_base = """%(head)s""" 32 | 33 | docs: Dict = { 34 | "good-canonical-absolute": { 35 | "url-real": """http://example.com""", 36 | "head": { 37 | "url-canonical": """http://example.com/canonical.html""", 38 | "url-og": None, 39 | }, 40 | "expected": {"get_discrete_url()": "http://example.com/canonical.html"}, 41 | }, 42 | "good-og-absolute": { 43 | "url-real": """http://example.com""", 44 | "head": {"url-canonical": None, "url-og": """http://example.com/og.html"""}, 45 | "expected": {"get_discrete_url()": "http://example.com/og.html"}, 46 | }, 47 | "good-canonical-noscheme-http": { 48 | "url-real": """http://example.com""", 49 | "head": {"url-canonical": """//example.com/canonical.html""", "url-og": None}, 50 | "expected": {"get_discrete_url()": "http://example.com/canonical.html"}, 51 | }, 52 | "good-og-noscheme-http": { 53 | "url-real": """http://example.com""", 54 | "head": {"url-canonical": None, "url-og": """//example.com/og.html"""}, 55 | "expected": {"get_discrete_url()": "http://example.com/og.html"}, 56 | }, 57 | "good-canonical-noscheme-https": { 58 | "url-real": """https://example.com""", 59 | "head": {"url-canonical": """//example.com/canonical.html""", "url-og": None}, 60 | "expected": {"get_discrete_url()": "https://example.com/canonical.html"}, 61 | }, 62 | "good-og-noscheme-https": { 63 | "url-real": """https://example.com""", 64 | "head": {"url-canonical": None, "url-og": """//example.com/og.html"""}, 65 | "expected": {"get_discrete_url()": "https://example.com/og.html"}, 66 | }, 67 | "good-canonical-relative": { 68 | "url-real": """http://example.com""", 69 | "head": {"url-canonical": """canonical.html""", "url-og": None}, 70 | "expected": {"get_discrete_url()": "http://example.com/canonical.html"}, 71 | }, 72 | "good-canonical-relative_alt": { 73 | "url-real": """http://example.com""", 74 | "head": {"url-canonical": """/canonical.html""", "url-og": None}, 75 | "expected": {"get_discrete_url()": "http://example.com/canonical.html"}, 76 | }, 77 | "good-og-relative_alt": { 78 
| "url-real": """http://example.com""", 79 | "head": {"url-canonical": None, "url-og": """/og.html"""}, 80 | "expected": {"get_discrete_url()": "http://example.com/og.html"}, 81 | }, 82 | "bad-canonical": { 83 | "url-real": """http://example.com/one-two-three.html""", 84 | "head": {"url-canonical": """...""", "url-og": None}, 85 | "expected": {"get_discrete_url()": "http://example.com/one-two-three.html"}, 86 | }, 87 | "bad-canonical2": { 88 | "url-real": """http://example.com/one-two-three.html""", 89 | "head": {"url-canonical": """http://""", "url-og": None}, 90 | "expected": {"get_discrete_url()": "http://example.com/one-two-three.html"}, 91 | }, 92 | "bad-canonical3": { 93 | "url-real": """http://example.com/one-two-three.html""", 94 | "head": {"url-canonical": """http://contentcreation""", "url-og": None}, 95 | "expected": {"get_discrete_url()": "http://example.com/one-two-three.html"}, 96 | }, 97 | "bad-og": { 98 | "url-real": """http://example.com/one-two-three.html""", 99 | "head": {"url-canonical": None, "url-og": """..."""}, 100 | "expected": {"get_discrete_url()": "http://example.com/one-two-three.html"}, 101 | }, 102 | "image-https": { 103 | "url-real": """https://example.com""", 104 | "head": { 105 | "url-canonical": """https://example.com/canonical.html""", 106 | "url-og": None, 107 | "url-og:image": """https://example.com/img.gif""", 108 | }, 109 | "expected": {"og:image": """https://example.com/img.gif"""}, 110 | }, 111 | "image-https-noscheme": { 112 | "url-real": """https://example.com""", 113 | "head": { 114 | "url-canonical": """https://example.com/canonical.html""", 115 | "url-og": None, 116 | "url-og:image": """//example.com/img.gif""", 117 | }, 118 | "expected": {"og:image": """https://example.com/img.gif"""}, 119 | }, 120 | "image-https-noscheme-secure": { 121 | "url-real": """https://example.com""", 122 | "head": { 123 | "url-canonical": """https://example.com/canonical.html""", 124 | "url-og": None, 125 | "url-og:image:secure_url": """//example.com/img.gif""", 126 | }, 127 | "expected": {"og:image:secure_url": """https://example.com/img.gif"""}, 128 | }, 129 | "image-http": { 130 | "url-real": """http://example.com""", 131 | "head": { 132 | "url-canonical": """http://example.com/canonical.html""", 133 | "url-og": None, 134 | "url-og:image": """http://example.com/img.gif""", 135 | }, 136 | "expected": {"og:image": """http://example.com/img.gif"""}, 137 | }, 138 | "image-http-noscheme": { 139 | "url-real": """http://example.com""", 140 | "head": { 141 | "url-canonical": """http://example.com/canonical.html""", 142 | "url-og": None, 143 | "url-og:image": """//example.com/img.gif""", 144 | }, 145 | "expected": {"og:image": """http://example.com/img.gif"""}, 146 | }, 147 | "image-http-noscheme-secure": { 148 | "url-real": """http://example.com""", 149 | "head": { 150 | "url-canonical": """//example.com/canonical.html""", 151 | "url-og": None, 152 | "url-og:image:secure_url": """//example.com/img.gif""", 153 | }, 154 | "expected": {"og:image:secure_url": None}, 155 | }, 156 | } 157 | 158 | 159 | def encoder_capitalizer( 160 | raw: Union[str, Dict], strategy: Optional[str] = None 161 | ) -> Union[str, dict]: 162 | # note, an api compliant encoder will only return str 163 | if isinstance(raw, dict): 164 | return {k.upper(): v.upper() for k, v in raw.items()} 165 | return raw.upper() 166 | 167 | 168 | def encoder_lowercaser( 169 | raw: Union[str, Dict], strategy: Optional[str] = None 170 | ) -> Union[str, dict]: 171 | # note, an api compliant encoder will only return str 
172 | if isinstance(raw, dict): 173 | return {k.lower(): v.lower() for k, v in raw.items()} 174 | return raw.lower() 175 | 176 | 177 | # setup the test_docs with html bodies 178 | for test in list(docs.keys()): 179 | head = "" 180 | if "url-og" in docs[test]["head"]: 181 | if docs[test]["head"]["url-og"] is not None: 182 | head += ( 183 | """""" 184 | % docs[test]["head"]["url-og"] 185 | ) 186 | if "url-canonical" in docs[test]["head"]: 187 | if docs[test]["head"]["url-canonical"] is not None: 188 | head += ( 189 | """""" 190 | % docs[test]["head"]["url-canonical"] 191 | ) 192 | if "url-og:image" in docs[test]["head"]: 193 | if docs[test]["head"]["url-og:image"] is not None: 194 | head += ( 195 | """""" 196 | % docs[test]["head"]["url-og:image"] 197 | ) 198 | if "url-og:image:secure_url" in docs[test]["head"]: 199 | if docs[test]["head"]["url-og:image:secure_url"] is not None: 200 | head += ( 201 | """""" 202 | % docs[test]["head"]["url-og:image:secure_url"] 203 | ) 204 | custom_vars = {"head": head} 205 | docs[test]["doc"] = doc_base % custom_vars 206 | 207 | 208 | def _docs_test(test_names): 209 | errors = [] 210 | for test in test_names: 211 | tests = [] 212 | url = docs[test]["url-real"] 213 | parsed = metadata_parser.MetadataParser(url=url, html=docs[test]["doc"]) 214 | if "get_discrete_url()" in docs[test]["expected"]: 215 | tests.append("get_discrete_url()") 216 | url_expected = docs[test]["expected"]["get_discrete_url()"] 217 | url_retrieved = parsed.get_discrete_url() 218 | if url_retrieved != url_expected: 219 | errors.append([test, "get_discrete_url()", url_expected, url_retrieved]) 220 | if "og:image" in docs[test]["expected"]: 221 | tests.append("og:image") 222 | url_expected = docs[test]["expected"]["og:image"] 223 | url_retrieved = parsed.get_metadata_link("og:image") 224 | if url_retrieved != url_expected: 225 | errors.append([test, "og:image", url_expected, url_retrieved]) 226 | if "og:image:secure_url" in docs[test]["expected"]: 227 | tests.append("og:image:secure_url") 228 | url_expected = docs[test]["expected"]["og:image:secure_url"] 229 | url_retrieved = parsed.get_metadata_link("og:image:secure_url") 230 | if url_retrieved != url_expected: 231 | errors.append( 232 | [test, "og:image:secure_url", url_expected, url_retrieved] 233 | ) 234 | if not tests: 235 | raise ValueError("No tests!") 236 | return errors 237 | 238 | 239 | def _docs_test_parser( 240 | test_names, cached_urlparser, cached_urlparser_maxitems=None 241 | ) -> Tuple[metadata_parser.MetadataParser, List]: 242 | errors = [] 243 | for test in test_names: 244 | tests = [] 245 | url = docs[test]["url-real"] 246 | kwargs = {} 247 | if cached_urlparser != "*no-kwarg": 248 | kwargs["cached_urlparser"] = cached_urlparser 249 | if cached_urlparser_maxitems is not None: 250 | kwargs["cached_urlparser_maxitems"] = cached_urlparser_maxitems 251 | parsed = metadata_parser.MetadataParser( 252 | url=url, html=docs[test]["doc"], **kwargs 253 | ) 254 | if "get_discrete_url()" in docs[test]["expected"]: 255 | tests.append("get_discrete_url()") 256 | url_expected = docs[test]["expected"]["get_discrete_url()"] 257 | url_retrieved = parsed.get_discrete_url() 258 | if url_retrieved != url_expected: 259 | errors.append([test, "get_discrete_url()", url_expected, url_retrieved]) 260 | if not tests: 261 | raise ValueError("No tests!") 262 | return parsed, errors 263 | 264 | 265 | class TestHtmlDocument(unittest.TestCase): 266 | """ 267 | python -m unittest tests.document_parsing.TestHtmlDocument.test_get_discrete_url__good_relative 
268 | python -m unittest tests.document_parsing.TestHtmlDocument.test_get_discrete_url__good_absolute 269 | python -m unittest tests.document_parsing.TestHtmlDocument.test_get_discrete_url__bad 270 | """ 271 | 272 | def test_get_discrete_url__good_relative(self): 273 | errors = _docs_test( 274 | [ 275 | "good-canonical-relative", 276 | "good-canonical-relative_alt", 277 | "good-og-relative_alt", 278 | ] 279 | ) 280 | if errors: 281 | raise ValueError(errors) 282 | 283 | def test_get_discrete_url__good_absolute(self): 284 | errors = _docs_test(["good-canonical-absolute", "good-og-absolute"]) 285 | if errors: 286 | raise ValueError(errors) 287 | 288 | def test_get_discrete_url__good_noscheme(self): 289 | errors = _docs_test( 290 | [ 291 | "good-canonical-noscheme-http", 292 | "good-og-noscheme-http", 293 | "good-canonical-noscheme-https", 294 | "good-og-noscheme-https", 295 | ] 296 | ) 297 | if errors: 298 | raise ValueError(errors) 299 | 300 | def test_get_discrete_url__bad(self): 301 | errors = _docs_test( 302 | ["bad-canonical", "bad-canonical2", "bad-canonical3", "bad-og"] 303 | ) 304 | if errors: 305 | raise ValueError(errors) 306 | 307 | def test_get_image(self): 308 | errors = _docs_test( 309 | [ 310 | "image-http-noscheme-secure", 311 | "image-https-noscheme-secure", 312 | "image-http", 313 | "image-https", 314 | "image-http-noscheme", 315 | "image-https-noscheme", 316 | ] 317 | ) 318 | if errors: 319 | raise ValueError(errors) 320 | 321 | 322 | class TestEncoders(unittest.TestCase): 323 | """ 324 | python -munittest tests.test_document_parsing.TestEncoders 325 | """ 326 | 327 | _data = { 328 | "unicode_whitespace": { 329 | "raw": """Example line with\xa0unicode whitespace.""", 330 | "ascii": """Example line with unicode whitespace.""", 331 | }, 332 | "unicode_chars": { 333 | "raw": """Example line with\xc2\xa0unicode chars.""", 334 | "ascii": """Example line withA unicode chars.""", 335 | }, 336 | "decode_html_encoder": { 337 | "html": """""", 338 | "parsed": "Foo Bar, "Biz Bang Bash."", 339 | "decoded": 'Foo Bar, "Biz Bang Bash."', 340 | }, 341 | } 342 | 343 | def _make_raw(self, data_option): 344 | # create a parsed result, and inject raw data. 
345 | # data coming through beautifulsoup will be parsed differently 346 | parsed = metadata_parser.MetadataParser() 347 | parsed.parsed_result.metadata["meta"]["title"] = self._data[data_option]["raw"] 348 | return parsed 349 | 350 | def _make_html(self, data_option, default_encoder: Optional[Callable] = None): 351 | # data coming through beautifulsoup is parsed by that library 352 | parsed = metadata_parser.MetadataParser( 353 | html=self._data[data_option]["html"], 354 | force_doctype=True, 355 | default_encoder=default_encoder, 356 | ) 357 | return parsed 358 | 359 | def test_unicode_whitespace(self): 360 | parsed = self._make_raw("unicode_whitespace") 361 | # title_raw = parsed.parsed_result.get_metadatas('title') 362 | _title_ascii = parsed.parsed_result.get_metadatas( 363 | "title", encoder=metadata_parser.utils.encode_ascii 364 | ) 365 | title_ascii = _title_ascii["meta"] 366 | self.assertEqual(title_ascii[0], self._data["unicode_whitespace"]["ascii"]) 367 | 368 | def test_unicode_chars(self): 369 | parsed = self._make_raw("unicode_chars") 370 | # title_raw = parsed.parsed_result.get_metadatas('title') 371 | _title_ascii = parsed.parsed_result.get_metadatas( 372 | "title", encoder=metadata_parser.utils.encode_ascii 373 | ) 374 | title_ascii = _title_ascii["meta"] 375 | self.assertEqual(title_ascii[0], self._data["unicode_chars"]["ascii"]) 376 | 377 | def test_decode_html_encoder(self): 378 | parsed = self._make_html("decode_html_encoder") 379 | _parsed_description = parsed.parsed_result.get_metadatas("description") 380 | parsed_description = _parsed_description["meta"] 381 | 382 | decoded_direct = metadata_parser.utils.decode_html(parsed_description[0]) 383 | self.assertEqual(decoded_direct, self._data["decode_html_encoder"]["decoded"]) 384 | 385 | _decoded_decoder = parsed.parsed_result.get_metadatas( 386 | "description", encoder=metadata_parser.utils.decode_html 387 | ) 388 | decoded_decoder = _decoded_decoder["meta"] 389 | self.assertEqual( 390 | decoded_decoder[0], self._data["decode_html_encoder"]["decoded"] 391 | ) 392 | 393 | def test_default_encoder(self): 394 | """ 395 | ensure the default decoder is invoked 396 | """ 397 | parsed_with_default = self._make_html( 398 | "decode_html_encoder", default_encoder=metadata_parser.utils.decode_html 399 | ) 400 | parsed_no_default = self._make_html("decode_html_encoder") 401 | 402 | # does the default_decoder work? 403 | _decoded_default = parsed_with_default.parsed_result.get_metadatas( 404 | "description" 405 | ) 406 | decoded_default = _decoded_default["meta"] 407 | self.assertEqual( 408 | decoded_default[0], self._data["decode_html_encoder"]["decoded"] 409 | ) 410 | 411 | # does the no decoder work as expected? 412 | _not_decoded = parsed_no_default.parsed_result.get_metadatas("description") 413 | not_decoded = _not_decoded["meta"] 414 | self.assertEqual(not_decoded[0], self._data["decode_html_encoder"]["parsed"]) 415 | 416 | # can we override the default_decoder to get RAW? 417 | _decoded_override = parsed_with_default.parsed_result.get_metadatas( 418 | "description", encoder="raw" 419 | ) 420 | decoded_override = _decoded_override["meta"] 421 | self.assertEqual( 422 | decoded_override[0], self._data["decode_html_encoder"]["parsed"] 423 | ) 424 | 425 | # can we override the default_decoder to get something else? 426 | # ensure these 2 aren't equal, otherwise the next bit doesn't really test! 
427 | self.assertNotEqual( 428 | self._data["decode_html_encoder"]["parsed"], 429 | self._data["decode_html_encoder"]["parsed"].upper(), 430 | ) 431 | _decoded_override = parsed_with_default.parsed_result.get_metadatas( 432 | "description", encoder=encoder_capitalizer 433 | ) 434 | decoded_override = _decoded_override["meta"] 435 | self.assertEqual( 436 | decoded_override[0], self._data["decode_html_encoder"]["parsed"].upper() 437 | ) 438 | 439 | 440 | class _TestDocumentParsingCore: 441 | 442 | def _MakeOne(self, filename): 443 | """lazy cache of files as needed""" 444 | global CACHED_FILESYSTEM_DOCUMENTS 445 | if filename not in CACHED_FILESYSTEM_DOCUMENTS: 446 | CACHED_FILESYSTEM_DOCUMENTS[filename] = open( 447 | os.path.join(_examples_dir, filename) 448 | ).read() 449 | return CACHED_FILESYSTEM_DOCUMENTS[filename] 450 | 451 | def _MakeOneParsed(self, **kwargs) -> metadata_parser.MetadataParser: 452 | html = self._MakeOne("duplicates.html") 453 | 454 | mp_kwargs = {} 455 | if "strategy" in kwargs: 456 | mp_kwargs["strategy"] = kwargs["strategy"] 457 | 458 | parsed = metadata_parser.MetadataParser(url=None, html=html, **mp_kwargs) 459 | 460 | # we should be tracking the verison now 461 | self.assertIn("_v", parsed.parsed_result.metadata) 462 | 463 | # it should be the same version 464 | self.assertEqual( 465 | parsed.parsed_result.metadata_version, 466 | metadata_parser.ParsedResult._version, 467 | ) 468 | 469 | # we should be tracking the verison now 470 | self.assertIn("_v", parsed.parsed_result.metadata) 471 | 472 | # it should be the same version 473 | self.assertEqual( 474 | parsed.parsed_result.metadata_version, metadata_parser.ParsedResult._version 475 | ) 476 | return parsed 477 | 478 | 479 | class TestDocumentParsing_Exceptions(unittest.TestCase, _TestDocumentParsingCore): 480 | 481 | def test__all_in_list(self): 482 | parsed = self._MakeOneParsed() 483 | # this should error! 484 | with self.assertRaises(InvalidStrategy) as cm: 485 | parsed.parsed_result.get_metadatas("canonical", strategy=["all"]) 486 | self.assertEqual( 487 | cm.exception.args[0], 488 | 'Submit "all" as a `str`, not in a `list`.', 489 | ) 490 | 491 | def test__known_as_str(self): 492 | parsed = self._MakeOneParsed() 493 | # this should error! 494 | with self.assertRaises(InvalidStrategy) as cm: 495 | parsed.parsed_result.get_metadatas("TestMixedCandidates1a", strategy="dc") 496 | self.assertEqual( 497 | cm.exception.args[0], 498 | 'If `strategy` is not a `list`, it must be "all".', 499 | ) 500 | 501 | def test__unknown_in_list(self): 502 | parsed = self._MakeOneParsed() 503 | # this should error! 
504 | with self.assertRaises(InvalidStrategy) as cm: 505 | parsed.parsed_result.get_metadatas("canonical", strategy=["unknown"]) 506 | self.assertEqual( 507 | cm.exception.args[0], 508 | 'Invalid strategy: "unknown".', 509 | ) 510 | with self.assertRaises(InvalidStrategy) as cm: 511 | parsed.parsed_result.get_metadatas( 512 | "canonical", strategy=["unknown", "unknown-too"] 513 | ) 514 | self.assertEqual( 515 | cm.exception.args[0], 516 | 'Invalid strategy: "unknown", "unknown-too".', 517 | ) 518 | 519 | 520 | class TestDocumentParsing(unittest.TestCase, _TestDocumentParsingCore): 521 | """ 522 | python -m unittest tests.document_parsing.TestDocumentParsing 523 | python -m unittest tests.document_parsing.TestDocumentParsing.test_simple_html 524 | python -m unittest tests.document_parsing.TestDocumentParsing.test_html_urls 525 | python -m unittest tests.document_parsing.TestDocumentParsing.test_complex_html 526 | python -m unittest tests.document_parsing.TestDocumentParsing.test_charsets 527 | """ 528 | 529 | def test_simple_html(self): 530 | """this tests simple.html to have certain fields""" 531 | html = self._MakeOne("simple.html") 532 | parsed = metadata_parser.MetadataParser(url=None, html=html) 533 | self.assertEqual( 534 | parsed.parsed_result.metadata["meta"]["article:publisher"], 535 | "https://www.example.com/meta/property=article:publisher", 536 | ) 537 | self.assertEqual(parsed.parsed_result.metadata["meta"]["author"], "meta.author") 538 | self.assertEqual( 539 | parsed.parsed_result.metadata["meta"]["description"], "meta.description" 540 | ) 541 | self.assertEqual( 542 | parsed.parsed_result.metadata["meta"]["keywords"], "meta.keywords" 543 | ) 544 | self.assertEqual( 545 | parsed.parsed_result.metadata["meta"]["og:description"], 546 | "meta.property=og:description", 547 | ) 548 | self.assertEqual( 549 | parsed.parsed_result.metadata["meta"]["og:image"], 550 | "https://www.example.com/meta/property=og:image", 551 | ) 552 | self.assertEqual( 553 | parsed.parsed_result.metadata["meta"]["og:site_name"], 554 | "meta.property=og:site_name", 555 | ) 556 | self.assertEqual( 557 | parsed.parsed_result.metadata["meta"]["og:title"], "meta.property=og:title" 558 | ) 559 | self.assertEqual( 560 | parsed.parsed_result.metadata["meta"]["og:type"], "meta.property=og:type" 561 | ) 562 | self.assertEqual( 563 | parsed.parsed_result.metadata["meta"]["og:url"], 564 | "https://www.example.com/meta/property=og:url", 565 | ) 566 | self.assertEqual( 567 | parsed.parsed_result.metadata["meta"]["twitter:card"], 568 | "meta.name=twitter:card", 569 | ) 570 | self.assertEqual( 571 | parsed.parsed_result.metadata["meta"]["twitter:description"], 572 | "meta.name=twitter:description", 573 | ) 574 | self.assertEqual( 575 | parsed.parsed_result.metadata["meta"]["twitter:image:src"], 576 | "https://example.com/meta/name=twitter:image:src", 577 | ) 578 | self.assertEqual( 579 | parsed.parsed_result.metadata["meta"]["twitter:site"], 580 | "meta.name=twitter:site", 581 | ) 582 | self.assertEqual( 583 | parsed.parsed_result.metadata["meta"]["twitter:title"], 584 | "meta.name=twitter:title", 585 | ) 586 | self.assertEqual( 587 | parsed.parsed_result.metadata["meta"]["twitter:url"], 588 | "https://example.com/meta/name=twitter:url", 589 | ) 590 | self.assertEqual( 591 | parsed.parsed_result.metadata["og"]["description"], 592 | "meta.property=og:description", 593 | ) 594 | self.assertEqual( 595 | parsed.parsed_result.metadata["og"]["image"], 596 | "https://www.example.com/meta/property=og:image", 597 | ) 598 | 
self.assertEqual( 599 | parsed.parsed_result.metadata["og"]["site_name"], 600 | "meta.property=og:site_name", 601 | ) 602 | self.assertEqual( 603 | parsed.parsed_result.metadata["og"]["title"], "meta.property=og:title" 604 | ) 605 | self.assertEqual( 606 | parsed.parsed_result.metadata["og"]["type"], "meta.property=og:type" 607 | ) 608 | self.assertEqual( 609 | parsed.parsed_result.metadata["og"]["url"], 610 | "https://www.example.com/meta/property=og:url", 611 | ) 612 | self.assertEqual( 613 | parsed.parsed_result.metadata["page"]["canonical"], 614 | "http://example.com/meta/rel=canonical", 615 | ) 616 | self.assertEqual( 617 | parsed.parsed_result.metadata["page"]["shortlink"], 618 | "http://example.com/meta/rel=shortlink", 619 | ) 620 | self.assertEqual(parsed.parsed_result.metadata["page"]["title"], "title") 621 | self.assertEqual( 622 | parsed.parsed_result.metadata["twitter"]["card"], "meta.name=twitter:card" 623 | ) 624 | self.assertEqual( 625 | parsed.parsed_result.metadata["twitter"]["description"], 626 | "meta.name=twitter:description", 627 | ) 628 | self.assertEqual( 629 | parsed.parsed_result.metadata["twitter"]["image:src"], 630 | "https://example.com/meta/name=twitter:image:src", 631 | ) 632 | self.assertEqual( 633 | parsed.parsed_result.metadata["twitter"]["site"], "meta.name=twitter:site" 634 | ) 635 | self.assertEqual( 636 | parsed.parsed_result.metadata["twitter"]["title"], "meta.name=twitter:title" 637 | ) 638 | self.assertEqual( 639 | parsed.parsed_result.metadata["twitter"]["url"], 640 | "https://example.com/meta/name=twitter:url", 641 | ) 642 | self.assertEqual( 643 | parsed.parsed_result.metadata["twitter"]["data"], 644 | "meta.name=twitter:data||value", 645 | ) 646 | self.assertNotIn("label", parsed.parsed_result.metadata["twitter"]) 647 | self.assertEqual(parsed.parsed_result.is_opengraph_minimum(), True) 648 | 649 | def test_html_urls(self): 650 | """this tests simple.html to have certain fields""" 651 | html = self._MakeOne("simple.html") 652 | parsed = metadata_parser.MetadataParser(url=None, html=html) 653 | # by default we do og_first 654 | self.assertEqual( 655 | parsed.get_discrete_url(), "https://www.example.com/meta/property=og:url" 656 | ) 657 | self.assertEqual( 658 | parsed.get_discrete_url(canonical_first=True, og_first=False), 659 | "http://example.com/meta/rel=canonical", 660 | ) 661 | self.assertEqual( 662 | parsed.get_url_opengraph(), "https://www.example.com/meta/property=og:url" 663 | ) 664 | self.assertEqual( 665 | parsed.get_url_canonical(), "http://example.com/meta/rel=canonical" 666 | ) 667 | 668 | def test_encoding_fallback(self): 669 | """this tests simple.html to have certain fields""" 670 | html = """body""" 671 | parsed = metadata_parser.MetadataParser(url=None, html=html) 672 | # typing scope 673 | assert parsed.response is not None 674 | self.assertEqual(parsed.response.encoding, "ISO-8859-1") 675 | 676 | def test_encoding_declared(self): 677 | html = """body""" 678 | parsed = metadata_parser.MetadataParser(url=None, html=html) 679 | # typing scope 680 | assert parsed.response is not None 681 | self.assertEqual(parsed.response.encoding, "UTF-8") 682 | 683 | def test_charsets(self): 684 | """ 685 | python -m unittest tests.document_parsing.TestDocumentParsing.test_charsets 686 | """ 687 | a_html = self._MakeOne("charset_a.html") 688 | a_parsed = metadata_parser.MetadataParser(url=None, html=a_html) 689 | self.assertEqual( 690 | a_parsed.parsed_result.metadata["meta"]["content-type"], 691 | "text/html; charset=UTF-8", 692 | ) 693 | 694 | 
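# charset_b.html and charset_c.html declare the encoding via a `charset`
# attribute rather than an http-equiv content-type, so both surface under
# the "charset" key below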
b_html = self._MakeOne("charset_b.html") 695 | b_parsed = metadata_parser.MetadataParser(url=None, html=b_html) 696 | self.assertEqual(b_parsed.parsed_result.metadata["meta"]["charset"], "UTF-8") 697 | 698 | c_html = self._MakeOne("charset_c.html") 699 | c_parsed = metadata_parser.MetadataParser(url=None, html=c_html) 700 | self.assertEqual(c_parsed.parsed_result.metadata["meta"]["charset"], "UTF-8") 701 | 702 | def test_malformed_twitter(self): 703 | """ 704 | this tests simple.html to have certain fields 705 | python -munittest tests.document_parsing.TestDocumentParsing.test_malformed_twitter 706 | """ 707 | html = self._MakeOne("simple.html") 708 | 709 | # the default behavior is to not support malformed 710 | # that means we should consult 'value' for data and 'label' 711 | # in `simple.html`, "label" (incorrectly) uses "content" and "data" uses "label" 712 | parsed = metadata_parser.MetadataParser(url=None, html=html) 713 | self.assertEqual( 714 | parsed.parsed_result.metadata["twitter"]["data"], 715 | "meta.name=twitter:data||value", 716 | ) 717 | self.assertNotIn("label", parsed.parsed_result.metadata["twitter"]) 718 | self.assertNotIn("invalid", parsed.parsed_result.metadata["twitter"]) 719 | 720 | # now with `support_malformed` support we will load the label! 721 | parsed2 = metadata_parser.MetadataParser( 722 | url=None, html=html, support_malformed=True 723 | ) 724 | self.assertEqual( 725 | parsed2.parsed_result.metadata["twitter"]["data"], 726 | "meta.name=twitter:data||value", 727 | ) 728 | self.assertEqual( 729 | parsed2.parsed_result.metadata["twitter"]["label"], 730 | "meta.name=twitter:label||content", 731 | ) 732 | self.assertEqual( 733 | parsed2.parsed_result.metadata["twitter"]["invalid"], 734 | "meta.name=twitter:invalid", 735 | ) 736 | 737 | # try it with dupes... 738 | html_dupes = self._MakeOne("duplicates.html") 739 | parsed_dupe = metadata_parser.MetadataParser(url=None, html=html_dupes) 740 | # two items for each of data/label, but label is empty strings 741 | self.assertEqual( 742 | parsed_dupe.parsed_result.metadata["twitter"]["data"], 743 | ["meta.name=twitter:data||value,1", "meta.name=twitter:data||value,2"], 744 | ) 745 | self.assertNotIn("label", parsed.parsed_result.metadata["twitter"]) 746 | 747 | # everyone is happy when metadata is malformed! 
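# with support_malformed=True the duplicate "label" entries are captured as
# well, even though they (incorrectly) carry their text in "content" rather
# than "value"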
748 | parsed_dupe = metadata_parser.MetadataParser( 749 | url=None, html=html_dupes, support_malformed=True 750 | ) 751 | self.assertEqual( 752 | parsed_dupe.parsed_result.metadata["twitter"]["data"], 753 | ["meta.name=twitter:data||value,1", "meta.name=twitter:data||value,2"], 754 | ) 755 | self.assertEqual( 756 | parsed_dupe.parsed_result.metadata["twitter"]["label"], 757 | [ 758 | "meta.name=twitter:label||content,1", 759 | "meta.name=twitter:label||content,2", 760 | ], 761 | ) 762 | 763 | 764 | class TestDocumentParsing_Complex(unittest.TestCase, _TestDocumentParsingCore): 765 | """ 766 | this tests duplicates.html to have certain fields under complex conditions 767 | """ 768 | 769 | def test_og_image(self): 770 | parsed = self._MakeOneParsed() 771 | 772 | # we have 3 og:image entries in this file 773 | _computed_link = parsed.get_metadata_link("image", strategy=["og"]) 774 | assert _computed_link == "https://www.example.com/meta/property=og:image" 775 | _all_og_images = parsed.parsed_result.get_metadatas("og:image") 776 | assert _all_og_images is not None 777 | assert isinstance(_all_og_images, dict) 778 | assert "meta" in _all_og_images 779 | 780 | all_og_images = _all_og_images["meta"] 781 | 782 | assert len(all_og_images) == 3 783 | assert "https://www.example.com/meta/property=og:image" in all_og_images 784 | # bs4 cleans up the ampersand internally into an entity, but prints it deserialized by default 785 | assert ( 786 | "https://www.example.com/meta?property=og:image&duplicate=1" 787 | in all_og_images 788 | ) 789 | assert ( 790 | "https://www.example.com/meta?property=og:image&duplicate=2" 791 | in all_og_images 792 | ) 793 | 794 | def test__citation_authors(self): 795 | parsed = self._MakeOneParsed() 796 | 797 | # ----- 798 | # this is a duplicate element and should be stored in the metadata dict as a list 799 | citation_authors = [ 800 | "citation_author:1", 801 | "citation_author:2", 802 | "citation_author:3", 803 | ] 804 | # these should be lists 805 | self.assertEqual( 806 | parsed.parsed_result.metadata["meta"]["citation_author"], citation_authors 807 | ) 808 | 809 | self.assertEqual( 810 | parsed.parsed_result.get_metadatas("citation_author", ["meta"])["meta"], 811 | citation_authors, 812 | ) 813 | 814 | # this is a string 815 | self.assertEqual( 816 | parsed.parsed_result.get_metadatas("citation_author", ["meta"])["meta"][0], 817 | citation_authors[0], 818 | ) 819 | 820 | def test__meta_authors(self): 821 | parsed = self._MakeOneParsed() 822 | 823 | meta_authors = ["meta.author:1", "meta.author:2"] 824 | 825 | # these should be lists 826 | self.assertEqual(parsed.parsed_result.metadata["meta"]["author"], meta_authors) 827 | self.assertEqual( 828 | parsed.parsed_result.get_metadatas("author", ["meta"])["meta"], meta_authors 829 | ) 830 | # this is a string 831 | self.assertEqual( 832 | parsed.parsed_result.get_metadatas("author", ["meta"])["meta"][0], 833 | meta_authors[0], 834 | ) 835 | 836 | def test__meta_keywords(self): 837 | parsed = self._MakeOneParsed() 838 | 839 | meta_kws = ["meta.keywords:1", "meta.keywords:2"] 840 | # these should be lists 841 | self.assertEqual( 842 | parsed.parsed_result.metadata["meta"]["keywords"], 843 | meta_kws, 844 | ) 845 | self.assertEqual( 846 | parsed.parsed_result.get_metadatas("keywords", ["meta"])["meta"], 847 | meta_kws, 848 | ) 849 | # this is a string 850 | self.assertEqual( 851 | parsed.parsed_result.get_metadatas("keywords", ["meta"])["meta"][0], 852 | meta_kws[0], 853 | ) 854 | 855 | def test__meta_description(self): 856 | 
parsed = self._MakeOneParsed() 857 | # ----- 858 | # this is a single element and should be stored in the metadata dict as a string 859 | description = "meta.description" 860 | 861 | # these should be lists 862 | self.assertEqual( 863 | parsed.parsed_result.get_metadatas("description", ["meta"])["meta"], 864 | [description], 865 | ) 866 | 867 | # this is a string 868 | self.assertEqual( 869 | parsed.parsed_result.metadata["meta"]["description"], 870 | description, 871 | ) 872 | self.assertEqual( 873 | parsed.parsed_result.get_metadatas("description", ["meta"])["meta"][0], 874 | description, 875 | ) 876 | 877 | def test__dc__basic(self): 878 | parsed = self._MakeOneParsed() 879 | # ----- 880 | # dc creator has a language variant 881 | # 'dc': {'Creator': [{'content': 'Plato'}, 882 | # {'content': 'Platon', 'lang': 'fr'}], 883 | 884 | self.assertIn("Creator", parsed.parsed_result.metadata["dc"]) 885 | dc_creator = parsed.parsed_result.metadata["dc"]["Creator"] 886 | # so this should be a list 887 | self.assertIs(type(dc_creator), list) 888 | # with a length of 2 889 | self.assertEqual(len(dc_creator), 2) 890 | self.assertIs(type(dc_creator[0]), dict) 891 | self.assertIs(type(dc_creator[1]), dict) 892 | self.assertIn("content", dc_creator[0]) 893 | self.assertEqual(dc_creator[0]["content"], "Plato") 894 | self.assertIn("content", dc_creator[1]) 895 | self.assertEqual(dc_creator[1]["content"], "Platon") 896 | self.assertIn("lang", dc_creator[1]) 897 | self.assertEqual(dc_creator[1]["lang"], "fr") 898 | 899 | def test__dc__subject(self): 900 | parsed = self._MakeOneParsed() 901 | # ----- 902 | # dc subject has a scheme variant 903 | # 'Subject': [{'content': 'heart attack'}, 904 | # {'content': 'Myocardial Infarction; Pericardial Effusion', 905 | # 'scheme': 'MESH'}, 906 | # {'content': 'vietnam war'}, 907 | # {'content': 'Vietnamese Conflict, 1961-1975', 908 | # 'scheme': 'LCSH'}, 909 | # {'content': 'Friendship'}, 910 | # {'content': '158.25', 'scheme': 'ddc'}]}, 911 | dcSubjectsExpected = [ 912 | {"content": "heart attack"}, 913 | { 914 | "content": "Myocardial Infarction; Pericardial Effusion", 915 | "scheme": "MESH", 916 | }, 917 | {"content": "vietnam war"}, 918 | {"content": "Vietnamese Conflict, 1961-1975", "scheme": "LCSH"}, 919 | {"content": "Friendship"}, 920 | {"content": "158.25", "scheme": "ddc"}, 921 | ] 922 | self.assertIn("Subject", parsed.parsed_result.metadata["dc"]) 923 | dc_subject = parsed.parsed_result.metadata["dc"]["Subject"] 924 | self.assertIs(type(dc_subject), list) 925 | self.assertEqual(len(dc_subject), len(dcSubjectsExpected)) 926 | for idx, _expected in enumerate(dc_subject): 927 | self.assertIs(type(dc_subject[idx]), dict) 928 | self.assertEqual( 929 | len(dc_subject[idx].keys()), len(dcSubjectsExpected[idx].keys()) 930 | ) 931 | self.assertEqual( 932 | sorted(dc_subject[idx].keys()), sorted(dcSubjectsExpected[idx].keys()) 933 | ) 934 | for _key in dc_subject[idx].keys(): 935 | self.assertEqual(dc_subject[idx][_key], dcSubjectsExpected[idx][_key]) 936 | 937 | def test__dc__TestMixedCandidates1(self): 938 | parsed = self._MakeOneParsed() 939 | # ----- 940 | # dc TestMixedCandidates1 941 | # handle the ordering of results 942 | # the raw info tested is the same as the above Subject test... 
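# note: a field with only one candidate is stored as a bare dict, not a
# one-item list (compare the TestMixedCandidates2* cases below)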
943 | dcTestMixedCandidates1aExpected = {"content": "Friendship"} 944 | self.assertIn( 945 | "TestMixedCandidates1a", 946 | parsed.parsed_result.metadata["dc"], 947 | ) 948 | dc_mixed_candidates = parsed.parsed_result.metadata["dc"][ 949 | "TestMixedCandidates1a" 950 | ] 951 | self.assertIs(type(dc_mixed_candidates), dict) 952 | self.assertEqual( 953 | len(dc_mixed_candidates.keys()), len(dcTestMixedCandidates1aExpected.keys()) 954 | ) 955 | self.assertEqual( 956 | sorted(dc_mixed_candidates.keys()), 957 | sorted(dcTestMixedCandidates1aExpected.keys()), 958 | ) 959 | for _key in dc_mixed_candidates.keys(): 960 | self.assertEqual( 961 | dc_mixed_candidates[_key], 962 | dcTestMixedCandidates1aExpected[_key], 963 | ) 964 | 965 | # test get_metadatas 966 | with self.assertRaises(InvalidStrategy) as cm: 967 | parsed.parsed_result.get_metadatas("TestMixedCandidates1a", strategy="dc") 968 | self.assertEqual( 969 | cm.exception.args[0], 970 | 'If `strategy` is not a `list`, it must be "all".', 971 | ) 972 | 973 | self.assertEqual( 974 | parsed.parsed_result.get_metadatas( 975 | "TestMixedCandidates1a", strategy=["dc"] 976 | )["dc"][0], 977 | {"content": "Friendship"}, 978 | ) 979 | self.assertEqual( 980 | parsed.parsed_result.get_metadatas( 981 | "TestMixedCandidates1a", strategy=["dc"] 982 | )["dc"], 983 | [dcTestMixedCandidates1aExpected], 984 | ) 985 | self.assertEqual( 986 | parsed.parsed_result.get_metadatas( 987 | "TestMixedCandidates1a", strategy=["dc"], encoder=encoder_capitalizer 988 | )["dc"], 989 | [{"CONTENT": "FRIENDSHIP"}], 990 | ) 991 | 992 | # 1b 993 | dcTestMixedCandidates1bExpected = {"content": "158.25", "scheme": "ddc"} 994 | self.assertIn("TestMixedCandidates1b", parsed.parsed_result.metadata["dc"]) 995 | dc_mixed_candidates = parsed.parsed_result.metadata["dc"][ 996 | "TestMixedCandidates1b" 997 | ] 998 | self.assertIs(type(dc_mixed_candidates), dict) 999 | self.assertEqual( 1000 | len(dc_mixed_candidates.keys()), len(dcTestMixedCandidates1bExpected.keys()) 1001 | ) 1002 | self.assertEqual( 1003 | sorted(dc_mixed_candidates.keys()), 1004 | sorted(dcTestMixedCandidates1bExpected.keys()), 1005 | ) 1006 | for _key in dc_mixed_candidates.keys(): 1007 | self.assertEqual( 1008 | dc_mixed_candidates[_key], dcTestMixedCandidates1bExpected[_key] 1009 | ) 1010 | 1011 | # test get_metadatas 1012 | self.assertEqual( 1013 | parsed.parsed_result.get_metadatas( 1014 | "TestMixedCandidates1b", strategy=["dc"] 1015 | )["dc"][0], 1016 | {"content": "158.25", "scheme": "ddc"}, 1017 | ) 1018 | self.assertEqual( 1019 | parsed.parsed_result.get_metadatas( 1020 | "TestMixedCandidates1b", strategy=["dc"] 1021 | )["dc"], 1022 | [dcTestMixedCandidates1bExpected], 1023 | ) 1024 | self.assertEqual( 1025 | parsed.parsed_result.get_metadatas( 1026 | "TestMixedCandidates1b", strategy=["dc"], encoder=encoder_capitalizer 1027 | )["dc"], 1028 | [{"CONTENT": "158.25", "SCHEME": "DDC"}], 1029 | ) 1030 | 1031 | def test__dc__TestMixedCandidates2(self): 1032 | parsed = self._MakeOneParsed() 1033 | # ----- 1034 | # dc TestMixedCandidates2 1035 | # handle the ordering of results 1036 | # the raw info tested is the same as the above Subject test... 
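# note: with two candidates the stored value is a list of dicts, and the
# 2a/2b variants check that document order is preserved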
1037 | dcTestMixedCandidates2aExpected = [ 1038 | {"content": "158.25", "scheme": "ddc"}, 1039 | {"content": "Friendship"}, 1040 | ] 1041 | self.assertIn( 1042 | "TestMixedCandidates2a", 1043 | parsed.parsed_result.metadata["dc"], 1044 | ) 1045 | dc_mixed_candidates = parsed.parsed_result.metadata["dc"][ 1046 | "TestMixedCandidates2a" 1047 | ] 1048 | self.assertIs(type(dc_mixed_candidates), list) 1049 | self.assertEqual(len(dc_mixed_candidates), len(dcTestMixedCandidates2aExpected)) 1050 | for idx, _expected in enumerate(dc_mixed_candidates): 1051 | self.assertIs(type(dc_mixed_candidates[idx]), dict) 1052 | self.assertEqual( 1053 | len(dc_mixed_candidates[idx].keys()), 1054 | len(dcTestMixedCandidates2aExpected[idx].keys()), 1055 | ) 1056 | self.assertEqual( 1057 | sorted(dc_mixed_candidates[idx].keys()), 1058 | sorted(dcTestMixedCandidates2aExpected[idx].keys()), 1059 | ) 1060 | for _key in dc_mixed_candidates[idx].keys(): 1061 | self.assertEqual( 1062 | dc_mixed_candidates[idx][_key], 1063 | dcTestMixedCandidates2aExpected[idx][_key], 1064 | ) 1065 | 1066 | # test get_metadatas 1067 | 1068 | self.assertEqual( 1069 | parsed.parsed_result.get_metadatas( 1070 | "TestMixedCandidates2a", strategy=["dc"] 1071 | )["dc"][0], 1072 | {"content": "158.25", "scheme": "ddc"}, 1073 | ) 1074 | self.assertEqual( 1075 | parsed.parsed_result.get_metadatas( 1076 | "TestMixedCandidates2a", strategy=["dc"] 1077 | )["dc"], 1078 | dcTestMixedCandidates2aExpected, 1079 | ) 1080 | self.assertEqual( 1081 | parsed.parsed_result.get_metadatas( 1082 | "TestMixedCandidates2a", strategy=["dc"], encoder=encoder_capitalizer 1083 | )["dc"][0], 1084 | {"CONTENT": "158.25", "SCHEME": "DDC"}, 1085 | {"CONTENT": "FRIENDSHIP"}, 1086 | ) 1087 | self.assertEqual( 1088 | parsed.parsed_result.get_metadatas( 1089 | "TestMixedCandidates2a", strategy=["dc"], encoder=encoder_capitalizer 1090 | )["dc"], 1091 | [{"CONTENT": "158.25", "SCHEME": "DDC"}, {"CONTENT": "FRIENDSHIP"}], 1092 | ) 1093 | 1094 | # 2b 1095 | dcTestMixedCandidates2bExpected = [ 1096 | {"content": "Friendship"}, 1097 | {"content": "158.25", "scheme": "ddc"}, 1098 | ] 1099 | self.assertIn( 1100 | "TestMixedCandidates2b", 1101 | parsed.parsed_result.metadata["dc"], 1102 | ) 1103 | dc_mixed_candidates = parsed.parsed_result.metadata["dc"][ 1104 | "TestMixedCandidates2b" 1105 | ] 1106 | self.assertIs(type(dc_mixed_candidates), list) 1107 | self.assertEqual(len(dc_mixed_candidates), len(dcTestMixedCandidates2bExpected)) 1108 | for idx, _expected in enumerate(dc_mixed_candidates): 1109 | self.assertIs(type(dc_mixed_candidates[idx]), dict) 1110 | self.assertEqual( 1111 | len(dc_mixed_candidates[idx].keys()), 1112 | len(dcTestMixedCandidates2bExpected[idx].keys()), 1113 | ) 1114 | self.assertEqual( 1115 | sorted(dc_mixed_candidates[idx].keys()), 1116 | sorted(dcTestMixedCandidates2bExpected[idx].keys()), 1117 | ) 1118 | for _key in dc_mixed_candidates[idx].keys(): 1119 | self.assertEqual( 1120 | dc_mixed_candidates[idx][_key], 1121 | dcTestMixedCandidates2bExpected[idx][_key], 1122 | ) 1123 | 1124 | # test get_metadatas 1125 | self.assertEqual( 1126 | parsed.parsed_result.get_metadatas( 1127 | "TestMixedCandidates2b", strategy=["dc"] 1128 | )["dc"][0], 1129 | {"content": "Friendship"}, 1130 | ) 1131 | self.assertEqual( 1132 | parsed.parsed_result.get_metadatas( 1133 | "TestMixedCandidates2b", strategy=["dc"] 1134 | )["dc"], 1135 | dcTestMixedCandidates2bExpected, 1136 | ) 1137 | self.assertEqual( 1138 | parsed.parsed_result.get_metadatas( 1139 | "TestMixedCandidates2b", 
strategy=["dc"], encoder=encoder_capitalizer 1140 | )["dc"][0], 1141 | {"CONTENT": "FRIENDSHIP"}, 1142 | ) 1143 | self.assertEqual( 1144 | parsed.parsed_result.get_metadatas( 1145 | "TestMixedCandidates2b", strategy=["dc"], encoder=encoder_capitalizer 1146 | )["dc"], 1147 | [{"CONTENT": "FRIENDSHIP"}, {"CONTENT": "158.25", "SCHEME": "DDC"}], 1148 | ) 1149 | 1150 | def test__TestMixedField0(self): 1151 | parsed = self._MakeOneParsed() 1152 | # ok, mixedfield tests: 1153 | # TestMixedField0 1154 | self.assertEqual( 1155 | parsed.parsed_result.get_metadatas("TestMixedField0", strategy=["dc"]), 1156 | None, 1157 | ) 1158 | self.assertEqual( 1159 | parsed.parsed_result.get_metadatas("TestMixedField0", strategy=["meta"])[ 1160 | "meta" 1161 | ][0], 1162 | "meta:TestMixedField0", 1163 | ) 1164 | self.assertEqual( 1165 | parsed.parsed_result.get_metadatas("TestMixedField0", strategy="all"), 1166 | {"meta": ["meta:TestMixedField0"]}, 1167 | ) 1168 | self.assertEqual( 1169 | parsed.parsed_result.get_metadatas( 1170 | "TestMixedField0", strategy=["dc"], encoder=encoder_capitalizer 1171 | ), 1172 | None, 1173 | ) 1174 | self.assertEqual( 1175 | parsed.parsed_result.get_metadatas( 1176 | "TestMixedField0", strategy=["meta"], encoder=encoder_capitalizer 1177 | )["meta"][0], 1178 | "META:TESTMIXEDFIELD0", 1179 | ) 1180 | self.assertEqual( 1181 | parsed.parsed_result.get_metadatas( 1182 | "TestMixedField0", strategy="all", encoder=encoder_capitalizer 1183 | ), 1184 | {"meta": ["META:TESTMIXEDFIELD0"]}, 1185 | ) 1186 | self.assertEqual( 1187 | parsed.parsed_result.get_metadatas("TestMixedField0", strategy=["dc"]), 1188 | None, 1189 | ) 1190 | self.assertEqual( 1191 | parsed.parsed_result.get_metadatas("TestMixedField0", strategy=["meta"])[ 1192 | "meta" 1193 | ], 1194 | ["meta:TestMixedField0"], 1195 | ) 1196 | self.assertEqual( 1197 | parsed.parsed_result.get_metadatas("TestMixedField0", strategy="all"), 1198 | {"meta": ["meta:TestMixedField0"]}, 1199 | ) 1200 | self.assertEqual( 1201 | parsed.parsed_result.get_metadatas( 1202 | "TestMixedField0", strategy=["dc"], encoder=encoder_capitalizer 1203 | ), 1204 | None, 1205 | ) 1206 | self.assertEqual( 1207 | parsed.parsed_result.get_metadatas( 1208 | "TestMixedField0", strategy=["meta"], encoder=encoder_capitalizer 1209 | )["meta"], 1210 | ["META:TESTMIXEDFIELD0"], 1211 | ) 1212 | self.assertEqual( 1213 | parsed.parsed_result.get_metadatas( 1214 | "TestMixedField0", strategy="all", encoder=encoder_capitalizer 1215 | ), 1216 | {"meta": ["META:TESTMIXEDFIELD0"]}, 1217 | ) 1218 | 1219 | def test__TestMixedField1(self): 1220 | parsed = self._MakeOneParsed() 1221 | # TestMixedField1 1222 | self.assertEqual( 1223 | parsed.parsed_result.get_metadatas("TestMixedField1", strategy=["dc"])[ 1224 | "dc" 1225 | ][0], 1226 | {"content": "dc:TestMixedField1"}, 1227 | ) 1228 | self.assertEqual( 1229 | parsed.parsed_result.get_metadatas("TestMixedField1", strategy=["meta"])[ 1230 | "meta" 1231 | ][0], 1232 | "meta:TestMixedField1", 1233 | ) 1234 | self.assertEqual( 1235 | parsed.parsed_result.get_metadatas("TestMixedField1", strategy="all"), 1236 | { 1237 | "dc": [{"content": "dc:TestMixedField1"}], 1238 | "meta": ["meta:TestMixedField1"], 1239 | }, 1240 | ) 1241 | self.assertEqual( 1242 | parsed.parsed_result.get_metadatas( 1243 | "TestMixedField1", strategy=["dc"], encoder=encoder_capitalizer 1244 | )["dc"][0], 1245 | {"CONTENT": "DC:TESTMIXEDFIELD1"}, 1246 | ) 1247 | self.assertEqual( 1248 | parsed.parsed_result.get_metadatas( 1249 | "TestMixedField1", strategy=["meta"], 
encoder=encoder_capitalizer 1250 | )["meta"][0], 1251 | "META:TESTMIXEDFIELD1", 1252 | ) 1253 | self.assertEqual( 1254 | parsed.parsed_result.get_metadatas( 1255 | "TestMixedField1", strategy="all", encoder=encoder_capitalizer 1256 | ), 1257 | { 1258 | "dc": [{"CONTENT": "DC:TESTMIXEDFIELD1"}], 1259 | "meta": ["META:TESTMIXEDFIELD1"], 1260 | }, 1261 | ) 1262 | self.assertEqual( 1263 | parsed.parsed_result.get_metadatas("TestMixedField1", strategy=["dc"])[ 1264 | "dc" 1265 | ], 1266 | [{"content": "dc:TestMixedField1"}], 1267 | ) 1268 | self.assertEqual( 1269 | parsed.parsed_result.get_metadatas("TestMixedField1", strategy=["meta"])[ 1270 | "meta" 1271 | ], 1272 | ["meta:TestMixedField1"], 1273 | ) 1274 | self.assertEqual( 1275 | parsed.parsed_result.get_metadatas("TestMixedField1", strategy="all"), 1276 | { 1277 | "meta": ["meta:TestMixedField1"], 1278 | "dc": [{"content": "dc:TestMixedField1"}], 1279 | }, 1280 | ) 1281 | self.assertEqual( 1282 | parsed.parsed_result.get_metadatas( 1283 | "TestMixedField1", strategy=["dc"], encoder=encoder_capitalizer 1284 | )["dc"], 1285 | [{"CONTENT": "DC:TESTMIXEDFIELD1"}], 1286 | ) 1287 | self.assertEqual( 1288 | parsed.parsed_result.get_metadatas( 1289 | "TestMixedField1", strategy=["meta"], encoder=encoder_capitalizer 1290 | )["meta"], 1291 | ["META:TESTMIXEDFIELD1"], 1292 | ) 1293 | self.assertEqual( 1294 | parsed.parsed_result.get_metadatas( 1295 | "TestMixedField1", strategy="all", encoder=encoder_capitalizer 1296 | ), 1297 | { 1298 | "meta": ["META:TESTMIXEDFIELD1"], 1299 | "dc": [{"CONTENT": "DC:TESTMIXEDFIELD1"}], 1300 | }, 1301 | ) 1302 | 1303 | def test__TestMixedField2(self): 1304 | parsed = self._MakeOneParsed() 1305 | # TestMixedField2 1306 | self.assertEqual( 1307 | parsed.parsed_result.get_metadatas("TestMixedField2", strategy=["dc"])[ 1308 | "dc" 1309 | ][0], 1310 | {"content": "dc:TestMixedField2"}, 1311 | # {"con[45 chars]dc"}, 1312 | ) 1313 | self.assertEqual( 1314 | parsed.parsed_result.get_metadatas("TestMixedField2", strategy=["meta"])[ 1315 | "meta" 1316 | ][0], 1317 | "meta:TestMixedField2", 1318 | ) 1319 | self.assertEqual( 1320 | parsed.parsed_result.get_metadatas("TestMixedField2", strategy="all"), 1321 | { 1322 | "dc": [ 1323 | {"content": "dc:TestMixedField2"}, 1324 | {"content": "dc:TestMixedField2.ddc", "scheme": "ddc"}, 1325 | ], 1326 | "meta": ["meta:TestMixedField2"], 1327 | }, 1328 | ) 1329 | self.assertEqual( 1330 | parsed.parsed_result.get_metadatas( 1331 | "TestMixedField2", strategy=["dc"], encoder=encoder_capitalizer 1332 | )["dc"][0], 1333 | {"CONTENT": "DC:TESTMIXEDFIELD2"}, 1334 | ) 1335 | self.assertEqual( 1336 | parsed.parsed_result.get_metadatas( 1337 | "TestMixedField2", strategy=["meta"], encoder=encoder_capitalizer 1338 | )["meta"][0], 1339 | "META:TESTMIXEDFIELD2", 1340 | ) 1341 | self.assertEqual( 1342 | parsed.parsed_result.get_metadatas( 1343 | "TestMixedField2", strategy="all", encoder=encoder_capitalizer 1344 | ), 1345 | { 1346 | "dc": [ 1347 | {"CONTENT": "DC:TESTMIXEDFIELD2"}, 1348 | {"CONTENT": "DC:TESTMIXEDFIELD2.DDC", "SCHEME": "DDC"}, 1349 | ], 1350 | "meta": ["META:TESTMIXEDFIELD2"], 1351 | }, 1352 | ) 1353 | self.assertEqual( 1354 | parsed.parsed_result.get_metadatas("TestMixedField2", strategy=["dc"])[ 1355 | "dc" 1356 | ], 1357 | [ 1358 | {"content": "dc:TestMixedField2"}, 1359 | {"content": "dc:TestMixedField2.ddc", "scheme": "ddc"}, 1360 | ], 1361 | ) 1362 | self.assertEqual( 1363 | parsed.parsed_result.get_metadatas("TestMixedField2", strategy=["meta"])[ 1364 | "meta" 1365 | ], 1366 | 
["meta:TestMixedField2"], 1367 | ) 1368 | self.assertEqual( 1369 | parsed.parsed_result.get_metadatas("TestMixedField2", strategy="all"), 1370 | { 1371 | "meta": ["meta:TestMixedField2"], 1372 | "dc": [ 1373 | {"content": "dc:TestMixedField2"}, 1374 | {"content": "dc:TestMixedField2.ddc", "scheme": "ddc"}, 1375 | ], 1376 | }, 1377 | ) 1378 | self.assertEqual( 1379 | parsed.parsed_result.get_metadatas( 1380 | "TestMixedField2", strategy=["dc"], encoder=encoder_capitalizer 1381 | )["dc"], 1382 | [ 1383 | {"CONTENT": "DC:TESTMIXEDFIELD2"}, 1384 | {"CONTENT": "DC:TESTMIXEDFIELD2.DDC", "SCHEME": "DDC"}, 1385 | ], 1386 | ) 1387 | self.assertEqual( 1388 | parsed.parsed_result.get_metadatas( 1389 | "TestMixedField2", strategy=["meta"], encoder=encoder_capitalizer 1390 | )["meta"], 1391 | ["META:TESTMIXEDFIELD2"], 1392 | ) 1393 | self.assertEqual( 1394 | parsed.parsed_result.get_metadatas( 1395 | "TestMixedField2", strategy="all", encoder=encoder_capitalizer 1396 | ), 1397 | { 1398 | "meta": ["META:TESTMIXEDFIELD2"], 1399 | "dc": [ 1400 | {"CONTENT": "DC:TESTMIXEDFIELD2"}, 1401 | {"CONTENT": "DC:TESTMIXEDFIELD2.DDC", "SCHEME": "DDC"}, 1402 | ], 1403 | }, 1404 | ) 1405 | 1406 | def test__TestMixedField3(self): 1407 | parsed = self._MakeOneParsed() 1408 | # TestMixedField3 1409 | self.assertEqual( 1410 | parsed.parsed_result.get_metadatas("TestMixedField3", strategy=["dc"])[ 1411 | "dc" 1412 | ][0], 1413 | {"content": "dc:TestMixedField3"}, 1414 | ) 1415 | self.assertEqual( 1416 | parsed.parsed_result.get_metadatas("TestMixedField3", strategy=["meta"])[ 1417 | "meta" 1418 | ][0], 1419 | "meta:TestMixedField3", 1420 | ) 1421 | self.assertEqual( 1422 | parsed.parsed_result.get_metadatas("TestMixedField3", strategy="all"), 1423 | { 1424 | "dc": [{"content": "dc:TestMixedField3"}], 1425 | "meta": ["meta:TestMixedField3"], 1426 | }, 1427 | ) 1428 | self.assertEqual( 1429 | parsed.parsed_result.get_metadatas( 1430 | "TestMixedField3", strategy=["dc"], encoder=encoder_capitalizer 1431 | )["dc"][0], 1432 | {"CONTENT": "DC:TESTMIXEDFIELD3"}, 1433 | ) 1434 | self.assertEqual( 1435 | parsed.parsed_result.get_metadatas( 1436 | "TestMixedField3", strategy=["meta"], encoder=encoder_capitalizer 1437 | )["meta"][0], 1438 | "META:TESTMIXEDFIELD3", 1439 | ) 1440 | self.assertEqual( 1441 | parsed.parsed_result.get_metadatas( 1442 | "TestMixedField3", strategy="all", encoder=encoder_capitalizer 1443 | ), 1444 | { 1445 | "dc": [{"CONTENT": "DC:TESTMIXEDFIELD3"}], 1446 | "meta": ["META:TESTMIXEDFIELD3"], 1447 | }, 1448 | ) 1449 | self.assertEqual( 1450 | parsed.parsed_result.get_metadatas("TestMixedField3", strategy=["dc"])[ 1451 | "dc" 1452 | ], 1453 | [{"content": "dc:TestMixedField3"}], 1454 | ) 1455 | self.assertEqual( 1456 | parsed.parsed_result.get_metadatas("TestMixedField3", strategy=["meta"])[ 1457 | "meta" 1458 | ], 1459 | ["meta:TestMixedField3"], 1460 | ) 1461 | self.assertEqual( 1462 | parsed.parsed_result.get_metadatas("TestMixedField3", strategy="all"), 1463 | { 1464 | "meta": ["meta:TestMixedField3"], 1465 | "dc": [{"content": "dc:TestMixedField3"}], 1466 | }, 1467 | ) 1468 | self.assertEqual( 1469 | parsed.parsed_result.get_metadatas( 1470 | "TestMixedField3", strategy=["dc"], encoder=encoder_capitalizer 1471 | )["dc"], 1472 | [{"CONTENT": "DC:TESTMIXEDFIELD3"}], 1473 | ) 1474 | self.assertEqual( 1475 | parsed.parsed_result.get_metadatas( 1476 | "TestMixedField3", strategy=["meta"], encoder=encoder_capitalizer 1477 | )["meta"], 1478 | ["META:TESTMIXEDFIELD3"], 1479 | ) 1480 | self.assertEqual( 1481 | 
parsed.parsed_result.get_metadatas( 1482 | "TestMixedField3", strategy="all", encoder=encoder_capitalizer 1483 | ), 1484 | { 1485 | "meta": ["META:TESTMIXEDFIELD3"], 1486 | "dc": [{"CONTENT": "DC:TESTMIXEDFIELD3"}], 1487 | }, 1488 | ) 1489 | 1490 | self.assertEqual( 1491 | parsed.parsed_result.get_metadatas("news_keywords", strategy=["meta"])[ 1492 | "meta" 1493 | ][0], 1494 | "", 1495 | ) 1496 | self.assertEqual( 1497 | parsed.parsed_result.get_metadatas("auto-publish", strategy=["meta"])[ 1498 | "meta" 1499 | ][0], 1500 | "timely", 1501 | ) 1502 | self.assertEqual( 1503 | parsed.parsed_result.get_metadatas( 1504 | "article:modified_time", strategy=["meta"] 1505 | )["meta"][0], 1506 | "2017-10-11 01:01:01", 1507 | ) 1508 | self.assertEqual( 1509 | parsed.parsed_result.get_metadatas( 1510 | "msapplication-tap-highlight", strategy=["meta"] 1511 | )["meta"][0], 1512 | "no", 1513 | ) 1514 | self.assertEqual( 1515 | parsed.parsed_result.get_metadatas( 1516 | "google-site-verification", strategy=["meta"] 1517 | )["meta"][0], 1518 | "123123123", 1519 | ) 1520 | self.assertEqual( 1521 | parsed.parsed_result.get_metadatas("twitter:data1", strategy=["meta"])[ 1522 | "meta" 1523 | ][0], 1524 | "8 min read", 1525 | ) 1526 | self.assertEqual( 1527 | parsed.parsed_result.get_metadatas("google", strategy=["meta"])["meta"][0], 1528 | "notranslate", 1529 | ) 1530 | self.assertEqual( 1531 | parsed.parsed_result.get_metadatas("news_keywords", strategy=["meta"])[ 1532 | "meta" 1533 | ][0], 1534 | "", 1535 | ) 1536 | self.assertEqual( 1537 | parsed.parsed_result.get_metadatas("viewport", strategy=["meta"])["meta"], 1538 | [ 1539 | "width=device-width,initial-scale=1,maximum-scale=1,user-scalable=no", 1540 | "width=device-width, initial-scale=1, maximum-scale=1", 1541 | ], 1542 | ) 1543 | self.assertEqual( 1544 | parsed.parsed_result.get_metadatas("thumbnail", strategy=["meta"])["meta"][ 1545 | 0 1546 | ], 1547 | "https://example.com/path/to/image.jpg", 1548 | ) 1549 | self.assertEqual( 1550 | parsed.get_metadata_link("thumbnail", strategy=["meta"]), 1551 | "https://example.com/path/to/image.jpg", 1552 | ) 1553 | self.assertEqual( 1554 | parsed.parsed_result.get_metadatas("thumbnail-2", strategy=["meta"])[ 1555 | "meta" 1556 | ][0], 1557 | "//example.com/path/to/image.jpg", 1558 | ) 1559 | self.assertEqual( 1560 | parsed.get_metadata_link("thumbnail-2", strategy=["meta"]), None 1561 | ) 1562 | self.assertEqual( 1563 | parsed.parsed_result.get_metadatas("thumbnail-3", strategy=["meta"])[ 1564 | "meta" 1565 | ][0], 1566 | "/path/to/image.jpg", 1567 | ) 1568 | self.assertEqual( 1569 | parsed.get_metadata_link("thumbnail-3", strategy=["meta"]), None 1570 | ) 1571 | 1572 | def test__canonical(self): 1573 | parsed = self._MakeOneParsed() 1574 | # this should error! 
1575 | with self.assertRaises(InvalidStrategy) as cm: 1576 | parsed.parsed_result.get_metadatas("canonical", strategy=["all"]) 1577 | self.assertEqual( 1578 | cm.exception.args[0], 1579 | 'Submit "all" as a `str`, not in a `list`.', 1580 | ) 1581 | 1582 | # ok, now test the return types 1583 | # some behavior was changed in the .7 release 1584 | 1585 | # get_metadatas - single section 1586 | self.assertEqual( 1587 | parsed.parsed_result.get_metadatas("canonical", strategy=["page"])["page"][ 1588 | 0 1589 | ], 1590 | "http://example.com/meta/rel=canonical", 1591 | ) 1592 | self.assertEqual( 1593 | parsed.parsed_result.get_metadatas("canonical", strategy=["meta"]), 1594 | None, 1595 | ) 1596 | self.assertEqual( 1597 | parsed.parsed_result.get_metadatas("canonical", strategy="all"), 1598 | {"page": ["http://example.com/meta/rel=canonical"]}, 1599 | ) 1600 | 1601 | # get_metadatas - single section 1602 | self.assertEqual( 1603 | parsed.parsed_result.get_metadatas("canonical", strategy=["page"])["page"], 1604 | ["http://example.com/meta/rel=canonical"], 1605 | ) 1606 | self.assertEqual( 1607 | parsed.parsed_result.get_metadatas("canonical", strategy=["meta"]), 1608 | None, 1609 | ) 1610 | self.assertEqual( 1611 | parsed.parsed_result.get_metadatas("canonical", strategy="all"), 1612 | {"page": ["http://example.com/meta/rel=canonical"]}, 1613 | ) 1614 | 1615 | def test__description(self): 1616 | parsed = self._MakeOneParsed() 1617 | # get_metadatas - multiple section 1618 | self.assertEqual( 1619 | parsed.parsed_result.get_metadatas("description", strategy=["meta"])[ 1620 | "meta" 1621 | ][0], 1622 | "meta.description", 1623 | ) 1624 | self.assertEqual( 1625 | parsed.parsed_result.get_metadatas("description", strategy="all"), 1626 | { 1627 | "og": ["meta.property=og:description"], 1628 | "meta": ["meta.description"], 1629 | "twitter": ["meta.name=twitter:description"], 1630 | }, 1631 | ) 1632 | # get_metadatas - multiple section 1633 | self.assertEqual( 1634 | parsed.parsed_result.get_metadatas("description", strategy=["meta"])[ 1635 | "meta" 1636 | ], 1637 | ["meta.description"], 1638 | ) 1639 | self.assertEqual( 1640 | parsed.parsed_result.get_metadatas("description", strategy="all"), 1641 | { 1642 | "og": ["meta.property=og:description"], 1643 | "meta": ["meta.description"], 1644 | "twitter": ["meta.name=twitter:description"], 1645 | }, 1646 | ) 1647 | 1648 | def test__keywords(self): 1649 | parsed = self._MakeOneParsed() 1650 | # multiple candidates! 
1651 | self.assertEqual( 1652 | parsed.parsed_result.get_metadatas("keywords", strategy=["meta"])["meta"][ 1653 | 0 1654 | ], 1655 | "meta.keywords:1", 1656 | ) 1657 | self.assertEqual( 1658 | parsed.parsed_result.get_metadatas("keywords", strategy=["meta"])["meta"], 1659 | ["meta.keywords:1", "meta.keywords:2"], 1660 | ) 1661 | 1662 | def test_complex_html__encoder(self): 1663 | """ 1664 | pytest tests/test_document_parsing.py::TestDocumentParsing::test_complex_html__encoder 1665 | """ 1666 | html = self._MakeOne("duplicates.html") 1667 | parsed = metadata_parser.MetadataParser(url=None, html=html) 1668 | 1669 | # Test a few things with and without encoding 1670 | 1671 | # Test A1 1672 | self.assertEqual( 1673 | parsed.parsed_result.get_metadatas("TestMixedField3", strategy=["meta"]), 1674 | {"meta": ["meta:TestMixedField3"]}, 1675 | ) 1676 | self.assertEqual( 1677 | parsed.parsed_result.get_metadatas( 1678 | "TestMixedField3", 1679 | strategy=["meta"], 1680 | encoder=metadata_parser.utils.encode_ascii, 1681 | ), 1682 | {"meta": ["meta:TestMixedField3"]}, 1683 | ) 1684 | 1685 | # Test A2 - dc only 1686 | # without an encoder, DC generates a dict 1687 | # with the encoder, DC generates a str 1688 | self.assertEqual( 1689 | parsed.parsed_result.get_metadatas("TestMixedField3", strategy=["dc"]), 1690 | {"dc": [{"content": "dc:TestMixedField3"}]}, 1691 | ) 1692 | self.assertEqual( 1693 | parsed.parsed_result.get_metadatas( 1694 | "TestMixedField3", 1695 | strategy=["dc"], 1696 | encoder=metadata_parser.utils.encode_ascii, 1697 | ), 1698 | {"dc": ["dc:TestMixedField3"]}, 1699 | ) 1700 | 1701 | # Test A3 - dc within all 1702 | # without an encoder, DC generates a dict 1703 | # with the encoder, DC generates a str 1704 | self.assertEqual( 1705 | parsed.parsed_result.get_metadatas( 1706 | "TestMixedField3", 1707 | strategy="all", 1708 | encoder=metadata_parser.utils.encode_ascii, 1709 | ), 1710 | { 1711 | "meta": ["meta:TestMixedField3"], 1712 | "dc": ["dc:TestMixedField3"], 1713 | }, 1714 | ) 1715 | 1716 | # Test A3 - dc + meta 1717 | # without an encoder, DC generates a dict 1718 | # with the encoder, DC generates a str 1719 | self.assertEqual( 1720 | parsed.parsed_result.get_metadatas( 1721 | "TestMixedField3", 1722 | strategy=["dc", "meta"], 1723 | encoder=metadata_parser.utils.encode_ascii, 1724 | ), 1725 | { 1726 | "meta": ["meta:TestMixedField3"], 1727 | "dc": ["dc:TestMixedField3"], 1728 | }, 1729 | ) 1730 | 1731 | 1732 | class TestDocumentParsing_SelectFirstMatch(unittest.TestCase, _TestDocumentParsingCore): 1733 | 1734 | def _test__shared(self, parsed: MetadataParser): 1735 | # but the data is still there... 
1736 | self.assertEqual( 1737 | parsed.parsed_result.get_metadatas("keywords.order", strategy="all"), 1738 | { 1739 | "dc": [ 1740 | {"content": "dc:keywords.order::1"}, 1741 | {"content": "dc:keywords.order::2"}, 1742 | ], 1743 | "meta": [ 1744 | "meta.keywords.order::1", 1745 | "meta.keywords.order::2", 1746 | ], 1747 | "og": [ 1748 | "meta.property=og:keywords.order::1", 1749 | "meta.property=og:keywords.order::2", 1750 | ], 1751 | "twitter": [ 1752 | "meta.name=twitter:keywords.order::1", 1753 | "meta.name=twitter:keywords.order::2", 1754 | ], 1755 | }, 1756 | ) 1757 | 1758 | # all gets meta first 1759 | self.assertEqual( 1760 | parsed.parsed_result.select_first_match("keywords.order", strategy="all"), 1761 | "meta.keywords.order::1", 1762 | ) 1763 | 1764 | # only look in: meta 1765 | self.assertEqual( 1766 | parsed.parsed_result.select_first_match( 1767 | "keywords.order", strategy=["meta"] 1768 | ), 1769 | "meta.keywords.order::1", 1770 | ) 1771 | # only look in: page 1772 | self.assertEqual( 1773 | parsed.parsed_result.select_first_match( 1774 | "keywords.order", strategy=["page"] 1775 | ), 1776 | None, 1777 | ) 1778 | # only look in: dc 1779 | self.assertEqual( 1780 | parsed.parsed_result.select_first_match("keywords.order", strategy=["dc"]), 1781 | "dc:keywords.order::1", 1782 | ) 1783 | # only look in: og 1784 | self.assertEqual( 1785 | parsed.parsed_result.select_first_match("keywords.order", strategy=["og"]), 1786 | "meta.property=og:keywords.order::1", 1787 | ) 1788 | # only look in: twitter 1789 | self.assertEqual( 1790 | parsed.parsed_result.select_first_match( 1791 | "keywords.order", strategy=["twitter"] 1792 | ), 1793 | "meta.name=twitter:keywords.order::1", 1794 | ) 1795 | 1796 | def test__basic(self): 1797 | parsed = self._MakeOneParsed() 1798 | self._test__shared(parsed) 1799 | 1800 | # multiple candidates! 1801 | self.assertEqual( 1802 | parsed.parsed_result.get_metadatas("keywords.order"), 1803 | { 1804 | "dc": [ 1805 | {"content": "dc:keywords.order::1"}, 1806 | {"content": "dc:keywords.order::2"}, 1807 | ], 1808 | "meta": [ 1809 | "meta.keywords.order::1", 1810 | "meta.keywords.order::2", 1811 | ], 1812 | "og": [ 1813 | "meta.property=og:keywords.order::1", 1814 | "meta.property=og:keywords.order::2", 1815 | ], 1816 | "twitter": [ 1817 | "meta.name=twitter:keywords.order::1", 1818 | "meta.name=twitter:keywords.order::2", 1819 | ], 1820 | }, 1821 | ) 1822 | 1823 | # default gets meta first 1824 | self.assertEqual( 1825 | parsed.parsed_result.select_first_match("keywords.order"), 1826 | "meta.keywords.order::1", 1827 | ) 1828 | 1829 | def test__all(self): 1830 | parsed = self._MakeOneParsed(strategy="all") 1831 | self._test__shared(parsed) 1832 | 1833 | # multiple candidates! 
1834 | self.assertEqual( 1835 | parsed.parsed_result.get_metadatas("keywords.order"), 1836 | { 1837 | "dc": [ 1838 | {"content": "dc:keywords.order::1"}, 1839 | {"content": "dc:keywords.order::2"}, 1840 | ], 1841 | "meta": [ 1842 | "meta.keywords.order::1", 1843 | "meta.keywords.order::2", 1844 | ], 1845 | "og": [ 1846 | "meta.property=og:keywords.order::1", 1847 | "meta.property=og:keywords.order::2", 1848 | ], 1849 | "twitter": [ 1850 | "meta.name=twitter:keywords.order::1", 1851 | "meta.name=twitter:keywords.order::2", 1852 | ], 1853 | }, 1854 | ) 1855 | 1856 | # default gets meta first 1857 | self.assertEqual( 1858 | parsed.parsed_result.select_first_match("keywords.order"), 1859 | "meta.keywords.order::1", 1860 | ) 1861 | 1862 | def test__meta(self): 1863 | parsed = self._MakeOneParsed(strategy=["meta"]) 1864 | self._test__shared(parsed) 1865 | 1866 | # multiple candidates! 1867 | # only shows the meta, because of the init 1868 | self.assertEqual( 1869 | parsed.parsed_result.get_metadatas("keywords.order"), 1870 | {"meta": ["meta.keywords.order::1", "meta.keywords.order::2"]}, 1871 | ) 1872 | 1873 | # default gets meta first 1874 | self.assertEqual( 1875 | parsed.parsed_result.select_first_match("keywords.order"), 1876 | "meta.keywords.order::1", 1877 | ) 1878 | 1879 | def test__reversed(self): 1880 | parsed = self._MakeOneParsed(strategy=["twitter", "dc", "og", "page", "meta"]) 1881 | 1882 | self._test__shared(parsed) 1883 | 1884 | # default gets TWITTER first 1885 | self.assertEqual( 1886 | parsed.parsed_result.select_first_match("keywords.order"), 1887 | "meta.name=twitter:keywords.order::1", 1888 | ) 1889 | 1890 | 1891 | class Test_UrlParserCacheable(unittest.TestCase): 1892 | """ 1893 | python -m unittest tests.document_parsing.Test_UrlParserCacheable 1894 | """ 1895 | 1896 | def test__default(self): 1897 | """MetadataParser()""" 1898 | parsed, errors = _docs_test_parser( 1899 | [ 1900 | "good-canonical-relative", 1901 | "good-canonical-relative_alt", 1902 | "good-og-relative_alt", 1903 | ], 1904 | "*no-kwarg", 1905 | ) 1906 | if errors: 1907 | raise ValueError(errors) 1908 | 1909 | def test__True(self): 1910 | """MetadataParser(cached_urlparser=True)""" 1911 | parsed, errors = _docs_test_parser( 1912 | [ 1913 | "good-canonical-relative", 1914 | "good-canonical-relative_alt", 1915 | "good-og-relative_alt", 1916 | ], 1917 | True, 1918 | ) 1919 | if errors: 1920 | raise ValueError(errors) 1921 | 1922 | def test__Int_1(self): 1923 | """MetadataParser(cached_urlparser=1)""" 1924 | # this should fail 1925 | with self.assertRaises(ValueError) as cm: 1926 | parsed, errors = _docs_test_parser( 1927 | [ 1928 | "good-canonical-relative", 1929 | "good-canonical-relative_alt", 1930 | "good-og-relative_alt", 1931 | ], 1932 | 1, 1933 | ) 1934 | if errors: 1935 | raise ValueError(errors) 1936 | assert isinstance(cm.exception, ValueError) 1937 | assert cm.exception.args[0] == "`cached_urlparser` must be a callable" 1938 | 1939 | def test__Int_0(self): 1940 | """MetadataParser(cached_urlparser=0)""" 1941 | parsed, errors = _docs_test_parser( 1942 | [ 1943 | "good-canonical-relative", 1944 | "good-canonical-relative_alt", 1945 | "good-og-relative_alt", 1946 | ], 1947 | 0, 1948 | ) 1949 | if errors: 1950 | raise ValueError(errors) 1951 | # equivalent to `cached_urlparser=False` 1952 | assert parsed.urlparse is urlparse 1953 | 1954 | def test__None(self): 1955 | parsed, errors = _docs_test_parser( 1956 | [ 1957 | "good-canonical-relative", 1958 | "good-canonical-relative_alt", 1959 | 
"good-og-relative_alt", 1960 | ], 1961 | None, 1962 | ) 1963 | if errors: 1964 | raise ValueError(errors) 1965 | 1966 | def test__False(self): 1967 | parsed, errors = _docs_test_parser( 1968 | [ 1969 | "good-canonical-relative", 1970 | "good-canonical-relative_alt", 1971 | "good-og-relative_alt", 1972 | ], 1973 | False, 1974 | ) 1975 | if errors: 1976 | raise ValueError(errors) 1977 | 1978 | def test__CustomParser(self): 1979 | custom_parser_obj = metadata_parser.UrlParserCacheable() 1980 | custom_parser = custom_parser_obj.urlparse 1981 | parsed, errors = _docs_test_parser( 1982 | [ 1983 | "good-canonical-relative", 1984 | "good-canonical-relative_alt", 1985 | "good-og-relative_alt", 1986 | ], 1987 | custom_parser, 1988 | ) 1989 | if errors: 1990 | raise ValueError(errors) 1991 | 1992 | 1993 | class Test_UrlParserCacheable_MaxItems(unittest.TestCase): 1994 | 1995 | def test__default(self): 1996 | """MetadataParser()""" 1997 | parsed, errors = _docs_test_parser( 1998 | [ 1999 | "good-canonical-relative", 2000 | "good-canonical-relative_alt", 2001 | "good-og-relative_alt", 2002 | ], 2003 | "*no-kwarg", 2004 | cached_urlparser_maxitems=1, 2005 | ) 2006 | if errors: 2007 | raise ValueError(errors) 2008 | 2009 | def test__True(self): 2010 | # this should fail 2011 | parsed, errors = _docs_test_parser( 2012 | [ 2013 | "good-canonical-relative", 2014 | "good-canonical-relative_alt", 2015 | "good-og-relative_alt", 2016 | ], 2017 | True, 2018 | cached_urlparser_maxitems=1, 2019 | ) 2020 | if errors: 2021 | raise ValueError(errors) 2022 | 2023 | def test__False(self): 2024 | # this should fail 2025 | with self.assertRaises(ValueError) as cm: 2026 | parsed, errors = _docs_test_parser( 2027 | [ 2028 | "good-canonical-relative", 2029 | "good-canonical-relative_alt", 2030 | "good-og-relative_alt", 2031 | ], 2032 | False, 2033 | cached_urlparser_maxitems=1, 2034 | ) 2035 | if errors: 2036 | raise ValueError(errors) 2037 | assert isinstance(cm.exception, ValueError) 2038 | assert ( 2039 | cm.exception.args[0] 2040 | == "`cached_urlparser_maxitems` requires `cached_urlparser=True`" 2041 | ) 2042 | 2043 | def test__Int_1(self): 2044 | # this should fail 2045 | with self.assertRaises(ValueError) as cm: 2046 | parsed, errors = _docs_test_parser( 2047 | [ 2048 | "good-canonical-relative", 2049 | "good-canonical-relative_alt", 2050 | "good-og-relative_alt", 2051 | ], 2052 | 1, 2053 | cached_urlparser_maxitems=1, 2054 | ) 2055 | if errors: 2056 | raise ValueError(errors) 2057 | assert isinstance(cm.exception, ValueError) 2058 | assert ( 2059 | cm.exception.args[0] 2060 | == "`cached_urlparser_maxitems` requires `cached_urlparser=True`" 2061 | ) 2062 | 2063 | def test__Int_0(self): 2064 | """MetadataParser(cached_urlparser=0)""" 2065 | # this should fail 2066 | with self.assertRaises(ValueError) as cm: 2067 | parsed, errors = _docs_test_parser( 2068 | [ 2069 | "good-canonical-relative", 2070 | "good-canonical-relative_alt", 2071 | "good-og-relative_alt", 2072 | ], 2073 | 0, 2074 | cached_urlparser_maxitems=1, 2075 | ) 2076 | if errors: 2077 | raise ValueError(errors) 2078 | assert isinstance(cm.exception, ValueError) 2079 | assert ( 2080 | cm.exception.args[0] 2081 | == "`cached_urlparser_maxitems` requires `cached_urlparser=True`" 2082 | ) 2083 | 2084 | def test__None(self): 2085 | # this should fail 2086 | with self.assertRaises(ValueError) as cm: 2087 | parsed, errors = _docs_test_parser( 2088 | [ 2089 | "good-canonical-relative", 2090 | "good-canonical-relative_alt", 2091 | "good-og-relative_alt", 2092 | ], 
2093 | None, 2094 | cached_urlparser_maxitems=1, 2095 | ) 2096 | if errors: 2097 | raise ValueError(errors) 2098 | assert isinstance(cm.exception, ValueError) 2099 | assert ( 2100 | cm.exception.args[0] 2101 | == "`cached_urlparser_maxitems` requires `cached_urlparser=True`" 2102 | ) 2103 | 2104 | def test__CustomParser(self): 2105 | # this should fail 2106 | custom_parser_obj = metadata_parser.UrlParserCacheable() 2107 | custom_parser = custom_parser_obj.urlparse 2108 | with self.assertRaises(ValueError) as cm: 2109 | parsed, errors = _docs_test_parser( 2110 | [ 2111 | "good-canonical-relative", 2112 | "good-canonical-relative_alt", 2113 | "good-og-relative_alt", 2114 | ], 2115 | custom_parser, 2116 | cached_urlparser_maxitems=1, 2117 | ) 2118 | if errors: 2119 | raise ValueError(errors) 2120 | assert isinstance(cm.exception, ValueError) 2121 | assert ( 2122 | cm.exception.args[0] 2123 | == "`cached_urlparser_maxitems` requires `cached_urlparser=True`" 2124 | ) 2125 | -------------------------------------------------------------------------------- /tests/test_ip_tracking.py: -------------------------------------------------------------------------------- 1 | # stdlib 2 | import unittest 3 | 4 | # local 5 | import metadata_parser 6 | 7 | # ============================================================================== 8 | 9 | 10 | class TestIpLookups(unittest.TestCase): 11 | """""" 12 | 13 | def test_ip_lookup(self): 14 | """ 15 | this is using the live internet 16 | 17 | todo: use httpbin 18 | """ 19 | url = "https://example.com/" 20 | page = metadata_parser.MetadataParser(url=url) 21 | self.assertTrue(page.peername) 22 | -------------------------------------------------------------------------------- /tests/test_responses.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # stdlib 4 | import unittest 5 | 6 | # pypi 7 | import requests 8 | import responses 9 | 10 | # local 11 | from metadata_parser import derive_encoding__hook 12 | 13 | # ============================================================================== 14 | 15 | 16 | URLS_HEADER = { 17 | "https://example.com/header=none": (None, "ISO-8859-1", "♥"), 18 | "https://example.com/header=ISO-8859-1": ("ISO-8859-1", "ISO-8859-1", "♥"), 19 | "https://example.com/header=utf-8": ("utf-8", "utf-8", "♥"), 20 | "https://example.com/header=UTF-8": ("UTF-8", "UTF-8", "♥"), 21 | } 22 | URLS_META = { 23 | "https://example.com/content_type=none": (None, "ISO-8859-1", "♥"), 24 | "https://example.com/content_type=ISO-8859-1": ( 25 | "ISO-8859-1", 26 | "ISO-8859-1", 27 | "♥", 28 | ), 29 | "https://example.com/content_type=utf-8": ("utf-8", "utf-8", "♥"), 30 | "https://example.com/content_type=UTF-8": ("UTF-8", "UTF-8", "♥"), 31 | } 32 | 33 | 34 | class TestMockedResponse(unittest.TestCase): 35 | def test_simple_encoding_found(self): 36 | """these tests just check to see we derive the right content with `derive_encoding__hook`""" 37 | 38 | requests_session = requests.Session() 39 | requests_session.hooks["response"].append(derive_encoding__hook) 40 | 41 | with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps: 42 | # track results to this 43 | to_test = {} 44 | 45 | # set up the header tests 46 | for url in URLS_HEADER.keys(): 47 | (_header, _expected, _body_char) = URLS_HEADER[url] 48 | _content_type = "text/html" 49 | if _header: 50 | _content_type = "text/html; charset=%s" % _header 51 | _body = "%s" % _body_char 52 | rsps.add( 53 | responses.GET, 54 | url, 
55 | body=_body, 56 | status=200, 57 | content_type=_content_type, 58 | ) 59 | to_test[url] = (_expected, _body) 60 | 61 | # set up the meta tests 62 | for url in URLS_META.keys(): 63 | (_header, _expected, _body_char) = URLS_META[url] 64 | _body = "%s" % _body_char 65 | if _header: 66 | _body = ( 67 | '<html><head><meta http-equiv="Content-Type" content="text/html; charset=%s"></head><body>%s</body></html>' 68 | % (_header, _body_char) 69 | ) 70 | rsps.add( 71 | responses.GET, url, body=_body, status=200, content_type="text/html" 72 | ) 73 | to_test[url] = (_expected, _body) 74 | 75 | for url in to_test: 76 | (_expected, _body) = to_test[url] 77 | r = requests_session.get(url) 78 | self.assertEqual(r.status_code, 200) 79 | self.assertEqual(r.encoding, _expected) 80 | self.assertEqual(r.text, _body) 81 | -------------------------------------------------------------------------------- /tests/test_sessions.py: -------------------------------------------------------------------------------- 1 | # stdlib 2 | from typing import Optional 3 | import unittest 4 | 5 | # pypi 6 | from httpbin import app as httpbin_app 7 | import pytest_httpbin.serve 8 | import requests 9 | 10 | # local 11 | import metadata_parser 12 | 13 | # ============================================================================== 14 | 15 | 16 | class SessionRedirect(requests.Session): 17 | num_checked = None 18 | 19 | def get_redirect_target(self, resp): 20 | # previous versions cached this for later use, but now we use a hook 21 | # cached_peername = metadata_parser.get_response_peername(resp) 22 | def _get(): 23 | if self.num_checked is None: 24 | self.num_checked = 0 25 | self.num_checked += 1 26 | if resp.is_redirect: 27 | return resp.headers["location"] 28 | if resp.status_code == 200: 29 | # some servers will do a 200 but put a redirect header in there. WTF 30 | dumb_redirect = resp.headers.get("location") 31 | if dumb_redirect: 32 | return dumb_redirect 33 | return None 34 | 35 | # -- 36 | if not hasattr(resp, "_redirect_target"): 37 | resp._redirect_target = _get() 38 | return resp._redirect_target 39 | 40 | 41 | class TestSessionsHttpBin(unittest.TestCase): 42 | def setUp(self): 43 | self.httpbin_server = pytest_httpbin.serve.Server(application=httpbin_app) 44 | self.httpbin_server.start() 45 | 46 | def tearDown(self): 47 | self.httpbin_server.stop() 48 | try: 49 | # we're not invoking `pytest_httpbin.serve.Server` in the standard way 50 | # our implementation was copied off another project 51 | # the `_server` is a wsgiref server, and in Py3 simply calling 52 | # `stop()` will shut down the server, but it will not `close()` any 53 | # lingering sockets. this explicitly does that.
54 | self.httpbin_server._server.socket.close() 55 | except Exception as exc: # noqa: F841 56 | pass 57 | 58 | def test_no_session(self): 59 | """just checking for args""" 60 | url = self.httpbin_server.url + "/html" 61 | page = metadata_parser.MetadataParser(url=url) 62 | assert page 63 | assert page.url == url 64 | 65 | def test_simple_session(self): 66 | """just checking for args""" 67 | url = self.httpbin_server.url + "/html" 68 | with requests.Session() as s: 69 | page = metadata_parser.MetadataParser(url=url, requests_session=s) 70 | assert page 71 | assert page.url == url 72 | 73 | def test_custom_session(self): 74 | """just checking for a custom session""" 75 | num_redirects = 4 76 | url = self.httpbin_server.url + "/redirect/%s" % num_redirects 77 | with SessionRedirect() as s: 78 | page: Optional[metadata_parser.MetadataParser] 79 | try: 80 | page = metadata_parser.MetadataParser(url=url, requests_session=s) 81 | except metadata_parser.NotParsableJson as e: 82 | page = e.metadataParser 83 | # typing scope 84 | assert page is not None 85 | assert page.response is not None 86 | # we end on get 87 | self.assertEqual(page.response.url, self.httpbin_server.url + "/get") 88 | # the session should have checked the following responses: redirects + final 89 | self.assertEqual(num_redirects + 1, s.num_checked) 90 | self.assertEqual(num_redirects, len(page.response.history)) 91 | 92 | # make sure that we tracked the peername. httpbin will encode 93 | self.assertTrue(metadata_parser.get_response_peername(page.response)) 94 | for h in page.response.history: 95 | self.assertTrue(metadata_parser.get_response_peername(h)) 96 | -------------------------------------------------------------------------------- /tests/test_url_parsing.py: -------------------------------------------------------------------------------- 1 | # -*- coding=utf-8 -*- 2 | 3 | # stdlib 4 | import unittest 5 | from urllib.parse import ParseResult 6 | from urllib.parse import ParseResultBytes 7 | from urllib.parse import urlparse 8 | 9 | # local 10 | import metadata_parser 11 | 12 | # ============================================================================== 13 | 14 | 15 | URLS_VALID = [ 16 | "http://example.com", 17 | "http://example.com/", 18 | "http://example.com/one", 19 | "http://example.com/one/two.html", 20 | "http://foo.example.com", 21 | "http://example.com:80", 22 | "http://example.com:80/", 23 | "http://example.com:80/one", 24 | "http://example.com:80/one/two.html", 25 | "http://192.168.1.1", 26 | "http://192.168.1.1/", 27 | "http://192.168.1.1:80", 28 | "http://192.168.1.1:8080", 29 | "http://192.168.1.1:80/", 30 | "http://192.168.1.1:8080/", 31 | "http://192.168.1.1:80/a.html", 32 | "http://192.168.1.1:8080/a.html", 33 | "https://example.com", 34 | "https://example.com/", 35 | "https://example.com/one", 36 | "https://example.com/one/two.html", 37 | "https://foo.example.com", 38 | "https://example.com:80", 39 | "https://example.com:80/", 40 | "https://example.com:80/one", 41 | "https://example.com:80/one/two.html", 42 | "https://192.168.1.1", 43 | "https://192.168.1.1/", 44 | "https://192.168.1.1:80", 45 | "https://192.168.1.1:8080", 46 | "https://192.168.1.1:80/", 47 | "https://192.168.1.1:8080/", 48 | "https://192.168.1.1:80/a.html", 49 | "https://192.168.1.1:8080/a.html", 50 | ] 51 | 52 | URLS_VALID_CONDITIONAL = [ 53 | "http://localhost", 54 | "http://localhost:80", 55 | "http://localhost:8000", 56 | "http://localhost/foo", 57 | "http://localhost:80/foo", 58 | "http://localhost:8000/foo", 59 | 
"https://localhost", 60 | "https://localhost:80", 61 | "https://localhost:8000", 62 | "https://localhost/foo", 63 | "https://localhost:80/foo", 64 | "https://localhost:8000/foo", 65 | "http://127.0.0.1", 66 | "http://127.0.0.1:80", 67 | "http://127.0.0.1:8000", 68 | "http://127.0.0.1/foo", 69 | "http://127.0.0.1:80/foo", 70 | "http://127.0.0.1:8000/foo", 71 | "https://127.0.0.1", 72 | "https://127.0.0.1:80", 73 | "https://127.0.0.1:8000", 74 | "https://127.0.0.1/foo", 75 | "https://127.0.0.1:80/foo", 76 | "https://127.0.0.1:8000/foo", 77 | "http://0.0.0.0", 78 | "http://0.0.0.0:80", 79 | "http://0.0.0.0:8000", 80 | "http://0.0.0.0/foo", 81 | "http://0.0.0.0:80/foo", 82 | "http://0.0.0.0:8000/foo", 83 | "https://0.0.0.0", 84 | "https://0.0.0.0:80", 85 | "https://0.0.0.0:8000", 86 | "https://0.0.0.0/foo", 87 | "https://0.0.0.0:80/foo", 88 | "https://0.0.0.0:8000/foo", 89 | ] 90 | 91 | URLS_INVALID = [ 92 | "http://example_com", 93 | "http://example_com/", 94 | "http://example_com/one", 95 | "http://999.999.999.999/", 96 | "http://999.999.999.999.999/", 97 | "http://999.999.999.999.999:8080:8080", 98 | "https://example_com", 99 | "https://example_com/", 100 | "https://example_com/one", 101 | "https://999.999.999.999/", 102 | "https://999.999.999.999.999/", 103 | "https://999.999.999.999.999:8080:8080", 104 | ] 105 | 106 | 107 | RFC_REGEX_VALID = [ 108 | """http://user:password@one.example.com/foo/bar;one=two&three=four?foo=bar&biz=bash#foo""" 109 | ] 110 | 111 | RFC_REGEX_INVALID = ["""


Then l""", """ccurl" style="display:none;" """] 111 | 112 | 113 | class TestUrlRfcValid(unittest.TestCase): 114 | """ 115 | python -m unittest tests.url_parsing.TestUrlRfcValid 116 | 117 | Ensures URLs contain rfc valid components 118 | """ 119 | 120 | def test_urls_valid(self): 121 | for i in RFC_REGEX_VALID: 122 | matched = metadata_parser.RE_rfc3986_valid_characters.match(i) 123 | self.assertTrue(matched) 124 | 125 | def test_urls_invalid(self): 126 | for i in RFC_REGEX_INVALID: 127 | matched = metadata_parser.RE_rfc3986_valid_characters.match(i) 128 | self.assertTrue(matched is None) 129 | 130 | 131 | class TestUrlParsing(unittest.TestCase): 132 | """ 133 | python -m unittest tests.url_parsing.TestUrlParsing 134 | 135 | Ensures URLs are parsed correctly as valid/invalid 136 | """ 137 | 138 | def test_urls_valid(self): 139 | for i in URLS_VALID: 140 | parsed = urlparse(i) 141 | self.assertTrue(metadata_parser.is_parsed_valid_url(parsed)) 142 | 143 | def test_urls_invalid(self): 144 | for i in URLS_INVALID: 145 | parsed = urlparse(i) 146 | self.assertFalse(metadata_parser.is_parsed_valid_url(parsed)) 147 | 148 | def test_urls_valid_conditional(self): 149 | for i in URLS_VALID_CONDITIONAL: 150 | parsed = urlparse(i) 151 | self.assertFalse( 152 | metadata_parser.is_parsed_valid_url( 153 | parsed, require_public_netloc=True, allow_localhosts=False 154 | ) 155 | ) 156 | self.assertTrue( 157 | metadata_parser.is_parsed_valid_url( 158 | parsed, require_public_netloc=False, allow_localhosts=True 159 | ) 160 | ) 161 | 162 | 163 | class TestAbsoluteUpgrades(unittest.TestCase): 164 | """ 165 | python -m unittest tests.url_parsing.TestAbsoluteUpgrades 166 | 167 | Ensures relative URLs are upgraded to absolute URLs correctly 168 | """ 169 | 170 | def test_none_returns_none(self): 171 | absolute = metadata_parser.url_to_absolute_url(None, url_fallback=None) 172 | self.assertEqual(absolute, None) 173 | 174 | def test_nothing(self): 175 | absolute = metadata_parser.url_to_absolute_url( 176 | "http://example.com", url_fallback="http://example.com" 177 | ) 178 | self.assertEqual(absolute, "http://example.com") 179 | 180 | def test_upgrade(self): 181 | absolute = metadata_parser.url_to_absolute_url( 182 | "a.html", url_fallback="http://example.com" 183 | ) 184 | self.assertEqual(absolute, "http://example.com/a.html") 185 | 186 | def test_fallback(self): 187 | absolute = metadata_parser.url_to_absolute_url( 188 | None, url_fallback="http://example.com" 189 | ) 190 | self.assertEqual(absolute, "http://example.com") 191 | 192 | 193 | class _DocumentCanonicalsMixin(object): 194 | def _MakeOne(self, url): 195 | """generates a canonical document""" 196 | doc_base = """<html><head>%(head)s</head><body></body></html>""" 197 | canonical_base = """<link rel="canonical" href="%(canonical)s" />""" 198 | _canonical_html = canonical_base % {"canonical": url} 199 | _doc_html = doc_base % {"head": _canonical_html} 200 | return _doc_html 201 | 202 | 203 | class TestDocumentCanonicals(unittest.TestCase, _DocumentCanonicalsMixin): 204 | """ 205 | python -m unittest tests.url_parsing.TestDocumentCanonicals 206 | """ 207 | 208 | def test_canonical_simple(self): 209 | """someone did their job""" 210 | url = None 211 | rel_canonical = "https://example.com/canonical" 212 | rel_expected = "https://example.com/canonical" 213 | html_doc = self._MakeOne(rel_canonical) 214 | parsed = metadata_parser.MetadataParser(url=url, html=html_doc) 215 | parsed_url = parsed.get_discrete_url() 216 | self.assertEqual(parsed_url, rel_expected) 217 | 218 | def test_canonical_upgrade(self): 219 | """someone else did their job.
not as good, but did their job""" 221 | url = "https://example.com" 222 | rel_canonical = "/canonical" 223 | rel_expected = "https://example.com/canonical" 224 | html_doc = self._MakeOne(rel_canonical) 225 | parsed = metadata_parser.MetadataParser(url=url, html=html_doc) 226 | parsed_url = parsed.get_discrete_url() 227 | self.assertEqual(parsed_url, rel_expected) 228 | 229 | def test_upgrade_invalid_root(self): 230 | """ 231 | you had one job... 232 | """ 233 | url = "https://example.com" 234 | rel_canonical = "http://localhost:8080" 235 | rel_expected = "https://example.com" 236 | html_doc = self._MakeOne(rel_canonical) 237 | parsed = metadata_parser.MetadataParser(url=url, html=html_doc) 238 | parsed_url = parsed.get_discrete_url() 239 | self.assertEqual(parsed_url, rel_expected) 240 | 241 | def test_upgrade_utf8_path(self): 242 | """ 243 | you had one job... but you didn't read the RFC you shitty third rate enterprise cms 244 | """ 245 | url = "https://example.com" 246 | rel_canonical = r"https://example.com/canonical-ü" 247 | rel_expected = r"https://example.com/canonical-%C3%BC" 248 | html_doc = self._MakeOne(rel_canonical) 249 | parsed = metadata_parser.MetadataParser( 250 | url=url, 251 | html=html_doc, 252 | derive_encoding=False, 253 | default_encoding="utf-8", 254 | html_encoding="utf-8", 255 | ) 256 | parsed_url = parsed.get_discrete_url() 257 | self.assertEqual(parsed_url, rel_expected) 258 | 259 | def test_upgrade_invalid_file(self): 260 | """ 261 | you had one job... 262 | if someone lists the canonical as an invalid domain, remount the right domain 263 | 264 | python -m unittest tests.url_parsing.TestDocumentCanonicals.test_upgrade_invalid_file 265 | """ 266 | url = "https://example.com/a" 267 | rel_canonical = "http://localhost:8080" 268 | rel_expected = "https://example.com" 269 | html_doc = self._MakeOne(rel_canonical) 270 | parsed = metadata_parser.MetadataParser(url=url, html=html_doc) 271 | parsed_url = parsed.get_discrete_url() 272 | self.assertEqual(parsed_url, rel_expected) 273 | 274 | def test_upgrade_invalid_file_b(self): 275 | """ 276 | you had one job... 277 | if someone lists the canonical as a different file on an invalid domain, remount the right domain 278 | """ 279 | url = "https://example.com/a" 280 | rel_canonical = "http://localhost:8080/b" 281 | rel_expected = "https://example.com/b" 282 | html_doc = self._MakeOne(rel_canonical) 283 | parsed = metadata_parser.MetadataParser(url=url, html=html_doc) 284 | parsed_url = parsed.get_discrete_url() 285 | self.assertEqual(parsed_url, rel_expected) 286 | 287 | def test_readme_scenario(self): 288 | """ 289 | you had one job... 290 | if someone lists the canonical as an invalid LOCAL domain, remount the right domain 291 | 292 | python -m unittest tests.url_parsing.TestDocumentCanonicals.test_readme_scenario 293 | """ 294 | url = "https://example.com/a" 295 | rel_canonical = "http://localhost:8000/alt-path/to/foo" 296 | rel_expected = "https://example.com/alt-path/to/foo" 297 | rel_expected_legacy = rel_canonical 298 | html_doc = self._MakeOne(rel_canonical) 299 | parsed = metadata_parser.MetadataParser(url=url, html=html_doc) 300 | 301 | # ensure we replace the bad domain with the right one 302 | parsed_url = parsed.get_discrete_url() 303 | self.assertEqual(parsed_url, rel_expected) 304 | 305 | # ensure support for the legacy behavior... 
306 | parsed_url = parsed.get_discrete_url(require_public_global=False) 307 | self.assertEqual(parsed_url, rel_expected_legacy) 308 | 309 | 310 | class TestDocumentCanonicalsRelative(unittest.TestCase, _DocumentCanonicalsMixin): 311 | """ 312 | python -m unittest tests.url_parsing.TestDocumentCanonicalsRelative 313 | python -m unittest tests.url_parsing.TestDocumentCanonicalsRelative.test_upgrade_local_a 314 | python -m unittest tests.url_parsing.TestDocumentCanonicalsRelative.test_upgrade_local_b 315 | """ 316 | 317 | def test_upgrade_local_a(self): 318 | """""" 319 | url = "https://example.com/nested/A.html" 320 | rel_canonical = "/nested/B.html" 321 | rel_expected = "https://example.com/nested/B.html" 322 | html_doc = self._MakeOne(rel_canonical) 323 | parsed = metadata_parser.MetadataParser(url=url, html=html_doc) 324 | parsed_url = parsed.get_discrete_url() 325 | self.assertEqual(parsed_url, rel_expected) 326 | 327 | def test_upgrade_local_b(self): 328 | """""" 329 | url = "https://example.com/nested/A.html" 330 | rel_canonical = "B.html" 331 | rel_expected = "https://example.com/nested/B.html" 332 | html_doc = self._MakeOne(rel_canonical) 333 | parsed = metadata_parser.MetadataParser(url=url, html=html_doc) 334 | parsed_url = parsed.get_discrete_url() 335 | self.assertEqual(parsed_url, rel_expected) 336 | 337 | def test_upgrade_local_bb(self): 338 | """""" 339 | url = "https://example.com/nested/A.html" 340 | rel_canonical = "path/to/B.html" 341 | rel_expected = "https://example.com/nested/path/to/B.html" 342 | html_doc = self._MakeOne(rel_canonical) 343 | parsed = metadata_parser.MetadataParser(url=url, html=html_doc) 344 | parsed_url = parsed.get_discrete_url() 345 | self.assertEqual(parsed_url, rel_expected) 346 | 347 | def test_upgrade_local_c(self): 348 | """""" 349 | url = "https://example.com/nested/A.html" 350 | rel_canonical = "/B.html" 351 | rel_expected = "https://example.com/B.html" 352 | html_doc = self._MakeOne(rel_canonical) 353 | parsed = metadata_parser.MetadataParser(url=url, html=html_doc) 354 | parsed_url = parsed.get_discrete_url() 355 | self.assertEqual(parsed_url, rel_expected) 356 | 357 | def test_noupgrade_a(self): 358 | """ 359 | these tests currently require tldextract; otherwise they won't work right. 
360 | """ 361 | if not metadata_parser.USE_TLDEXTRACT: 362 | raise ValueError("these tests currently require tldextract") 363 | 364 | url = "https://example.com/nested/A.html" 365 | rel_canonical = "https://foo.local/B.html" 366 | rel_expected = None 367 | html_doc = self._MakeOne(rel_canonical) 368 | parsed = metadata_parser.MetadataParser(url=url, html=html_doc) 369 | 370 | parsed_url = parsed.get_url_canonical(require_public_global=True) 371 | self.assertEqual(parsed_url, rel_expected) 372 | 373 | parsed_url = parsed.get_url_opengraph(require_public_global=True) 374 | self.assertEqual(parsed_url, rel_expected) 375 | 376 | parsed_url = parsed.get_url_canonical( 377 | require_public_global=True, url_fallback=url 378 | ) 379 | self.assertEqual(parsed_url, rel_expected) 380 | 381 | parsed_url = parsed.get_url_opengraph( 382 | require_public_global=True, url_fallback=url 383 | ) 384 | self.assertEqual(parsed_url, rel_expected) 385 | 386 | 387 | class TestFixUnicodeUrls(unittest.TestCase): 388 | def test_fix_unicode_path(self): 389 | _test_pairs = ( 390 | ( 391 | "https://example.com/2017/12/abcdefgühijklmnop?a=%20foo", 392 | "https://example.com/2017/12/abcdefg%C3%BChijklmnop?a=%20foo", 393 | ), 394 | ) 395 | for raw, expected in _test_pairs: 396 | cleaned = metadata_parser.fix_unicode_url(raw) 397 | self.assertEqual(cleaned, expected) 398 | 399 | def test_fix_unicode_path_leave_unicode_kwargs(self): 400 | _test_pairs = ( 401 | ( 402 | "https://example.com/2017/12/abcdefgühijklmnop?a=%20foo&b=ü", 403 | "https://example.com/2017/12/abcdefg%C3%BChijklmnop?a=%20foo&b=ü", 404 | ), 405 | ) 406 | for raw, expected in _test_pairs: 407 | cleaned = metadata_parser.fix_unicode_url(raw) 408 | self.assertEqual(cleaned, expected) 409 | 410 | 411 | class TestArgsExceptions(unittest.TestCase, _DocumentCanonicalsMixin): 412 | """ 413 | python -m unittest tests.url_parsing.TestArgsExceptions 414 | """ 415 | 416 | def test_no_args__good(self): 417 | url = "https://example.com/nested/A.html" 418 | rel_canonical = "/B.html" 419 | rel_expected = "https://example.com/B.html" # noqa: F841 420 | html_doc = self._MakeOne(rel_canonical) 421 | parsed = metadata_parser.MetadataParser(url=url, html=html_doc) 422 | parsed_url = parsed.get_discrete_url() # noqa: F841 423 | 424 | def test_og_first__good(self): 425 | url = "https://example.com/nested/A.html" 426 | rel_canonical = "/B.html" 427 | rel_expected = "https://example.com/B.html" # noqa: F841 428 | html_doc = self._MakeOne(rel_canonical) 429 | parsed = metadata_parser.MetadataParser(url=url, html=html_doc) 430 | parsed_url = parsed.get_discrete_url(og_first=True) # noqa: F841 431 | 432 | def test_og_first_canonical_first__bad(self): 433 | url = "https://example.com/nested/A.html" 434 | rel_canonical = "/B.html" 435 | rel_expected = "https://example.com/B.html" # noqa: F841 436 | html_doc = self._MakeOne(rel_canonical) 437 | parsed = metadata_parser.MetadataParser(url=url, html=html_doc) 438 | self.assertRaises( 439 | ValueError, parsed.get_discrete_url, og_first=True, canonical_first=True 440 | ) 441 | 442 | def test_canonical_first__bad(self): 443 | url = "https://example.com/nested/A.html" 444 | rel_canonical = "/B.html" 445 | rel_expected = "https://example.com/B.html" # noqa: F841 446 | html_doc = self._MakeOne(rel_canonical) 447 | parsed = metadata_parser.MetadataParser(url=url, html=html_doc) 448 | self.assertRaises(ValueError, parsed.get_discrete_url, canonical_first=True) 449 | 450 | def test_canonical_first__good(self): 451 | url = 
"https://example.com/nested/A.html" 452 | rel_canonical = "/B.html" 453 | rel_expected = "https://example.com/B.html" # noqa: F841 454 | html_doc = self._MakeOne(rel_canonical) 455 | parsed = metadata_parser.MetadataParser(url=url, html=html_doc) 456 | parsed_url = parsed.get_discrete_url( # noqa: F841 457 | og_first=False, canonical_first=True 458 | ) 459 | 460 | 461 | class TestCommands(unittest.TestCase, _DocumentCanonicalsMixin): 462 | """ 463 | python -m unittest tests.url_parsing.TestCommands 464 | """ 465 | 466 | def test_is_parsed_valid_url__string(self): 467 | url = "https://example.com/A.html" 468 | parsed = urlparse(url) 469 | self.assertIsInstance(parsed, ParseResult) 470 | is_valid = metadata_parser.is_parsed_valid_url(parsed) 471 | self.assertTrue(is_valid) 472 | 473 | def test_is_parsed_valid_url__bytes(self): 474 | url = b"https://example.com/A.html" 475 | parsed = urlparse(url) 476 | self.assertIsInstance(parsed, ParseResultBytes) 477 | is_valid = metadata_parser.is_parsed_valid_url(parsed) 478 | self.assertTrue(is_valid) 479 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = 3 | lint, 4 | mypy, 5 | py37,py38,py39,py310,py311,py312,py313 6 | 7 | [testenv] 8 | commands = 9 | python --version 10 | pytest {posargs:} 11 | extras = 12 | testing 13 | -------------------------------------------------------------------------------- /types.txt: -------------------------------------------------------------------------------- 1 | 2 | types { 3 | text/html html htm shtml; 4 | text/css css; 5 | text/xml xml; 6 | image/gif gif; 7 | image/jpeg jpeg jpg; 8 | application/javascript js; 9 | application/atom+xml atom; 10 | application/rss+xml rss; 11 | 12 | text/mathml mml; 13 | text/plain txt; 14 | text/vnd.sun.j2me.app-descriptor jad; 15 | text/vnd.wap.wml wml; 16 | text/x-component htc; 17 | 18 | image/png png; 19 | image/tiff tif tiff; 20 | image/vnd.wap.wbmp wbmp; 21 | image/x-icon ico; 22 | image/x-jng jng; 23 | image/x-ms-bmp bmp; 24 | image/svg+xml svg svgz; 25 | image/webp webp; 26 | 27 | application/font-woff woff; 28 | application/java-archive jar war ear; 29 | application/json json; 30 | application/mac-binhex40 hqx; 31 | application/msword doc; 32 | application/pdf pdf; 33 | application/postscript ps eps ai; 34 | application/rtf rtf; 35 | application/vnd.ms-excel xls; 36 | application/vnd.ms-fontobject eot; 37 | application/vnd.ms-powerpoint ppt; 38 | application/vnd.wap.wmlc wmlc; 39 | application/vnd.google-earth.kml+xml kml; 40 | application/vnd.google-earth.kmz kmz; 41 | application/x-7z-compressed 7z; 42 | application/x-cocoa cco; 43 | application/x-java-archive-diff jardiff; 44 | application/x-java-jnlp-file jnlp; 45 | application/x-makeself run; 46 | application/x-perl pl pm; 47 | application/x-pilot prc pdb; 48 | application/x-rar-compressed rar; 49 | application/x-redhat-package-manager rpm; 50 | application/x-sea sea; 51 | application/x-shockwave-flash swf; 52 | application/x-stuffit sit; 53 | application/x-tcl tcl tk; 54 | application/x-x509-ca-cert der pem crt; 55 | application/x-xpinstall xpi; 56 | application/xhtml+xml xhtml; 57 | application/zip zip; 58 | 59 | application/octet-stream bin exe dll; 60 | application/octet-stream deb; 61 | application/octet-stream dmg; 62 | application/octet-stream iso img; 63 | application/octet-stream msi msp msm; 64 | 65 | 
application/vnd.openxmlformats-officedocument.wordprocessingml.document docx; 66 | application/vnd.openxmlformats-officedocument.spreadsheetml.sheet xlsx; 67 | application/vnd.openxmlformats-officedocument.presentationml.presentation pptx; 68 | 69 | audio/midi mid midi kar; 70 | audio/mpeg mp3; 71 | audio/ogg ogg; 72 | audio/x-m4a m4a; 73 | audio/x-realaudio ra; 74 | 75 | video/3gpp 3gpp 3gp; 76 | video/mp4 mp4; 77 | video/mpeg mpeg mpg; 78 | video/quicktime mov; 79 | video/webm webm; 80 | video/x-flv flv; 81 | video/x-m4v m4v; 82 | video/x-mng mng; 83 | video/x-ms-asf asx asf; 84 | video/x-ms-wmv wmv; 85 | video/x-msvideo avi; 86 | } 87 | --------------------------------------------------------------------------------
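The test modules above exercise the package's public API end to end: constructing a MetadataParser, querying ParsedResult.get_metadatas with a list strategy or the "all" string, picking a single value with ParsedResult.select_first_match, applying encoders, and resolving canonical URLs. The following is a minimal usage sketch, not a file from this repository; the HTML snippet, field names, and printed values are illustrative assumptions based on the behaviors these tests assert.

# minimal usage sketch (illustrative only; not part of the test suite)
import metadata_parser

html = """<html>
  <head>
    <title>Example</title>
    <meta name="description" content="meta.description">
    <meta property="og:description" content="og description">
    <link rel="canonical" href="https://example.com/canonical">
  </head>
  <body></body>
</html>"""

# parse a document directly; url=None mirrors how the scaffold-based tests run
page = metadata_parser.MetadataParser(url=None, html=html)

# every candidate grouped by strategy; returns None when nothing matches
print(page.parsed_result.get_metadatas("description", strategy="all"))

# restrict the lookup to specific strategies; must be a list unless it is "all"
print(page.parsed_result.get_metadatas("description", strategy=["meta"]))

# first match across the default strategy ordering: meta, page, og, dc, twitter
print(page.parsed_result.select_first_match("description"))

# canonical / og:url resolution, upgraded to an absolute URL where needed
print(page.get_discrete_url())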