├── .github └── workflows │ └── tox.yml ├── .gitignore ├── .hgignore ├── CHANGES.rst ├── LICENSE.txt ├── MANIFEST.in ├── README.rst ├── README_fixt.py ├── conftest.py ├── docs ├── Makefile ├── api.rst ├── attributes.rst ├── changes.rst ├── conf.py ├── conftest.py ├── css.rst ├── future.rst ├── index.rst ├── manipulating.rst ├── pseudo_classes.rst ├── scrap.rst ├── testing.rst ├── tips.rst └── traversing.rst ├── pyquery ├── __init__.py ├── cssselectpatch.py ├── openers.py ├── pyquery.py └── text.py ├── pytest.ini ├── setup.py ├── tests ├── __init__.py ├── apps.py ├── browser_base.py ├── doctests.rst ├── geckodriver.sh ├── invalid.xml ├── selenium.sh ├── test.html ├── test_browser.py ├── test_pyquery.py └── test_real_browser.py └── tox.ini /.github/workflows/tox.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | name: tox 4 | 5 | on: [push, pull_request] 6 | 7 | jobs: 8 | linux: 9 | runs-on: ubuntu-latest 10 | env: 11 | MOZ_HEADLESS: "1" 12 | strategy: 13 | matrix: 14 | python: [3.8, 3.9, "3.10", 3.11, 3.12] 15 | 16 | steps: 17 | - name: Setup firefox 18 | uses: browser-actions/setup-firefox@latest 19 | with: 20 | firefox-version: latest 21 | - uses: actions/checkout@v3 22 | - name: Setup Python 23 | uses: actions/setup-python@v4 24 | with: 25 | python-version: ${{ matrix.python }} 26 | - name: Install geckodriver 27 | run: tests/geckodriver.sh 28 | - name: Install Tox and any other packages 29 | run: pip install tox 30 | - name: Run Tox 31 | run: | 32 | export PATH=$PATH:$PWD 33 | tox -e py 34 | - name: Run lint / docs 35 | run: tox -e lint,docs 36 | if: matrix.python == 3.11 37 | 38 | windows: 39 | runs-on: windows-latest 40 | strategy: 41 | matrix: 42 | python: [3.8, 3.9, "3.10", 3.11, 3.12] 43 | 44 | steps: 45 | - uses: actions/checkout@v3 46 | - name: Setup Python 47 | uses: actions/setup-python@v4 48 | with: 49 | python-version: ${{ matrix.python }} 50 | - name: Install Tox and any other packages 51 | run: pip 
install tox 52 | - name: Run Tox 53 | run: tox -e py 54 | 55 | macos: 56 | runs-on: macos-latest 57 | strategy: 58 | matrix: 59 | python: [3.8, 3.9, "3.10", 3.11, 3.12] 60 | 61 | steps: 62 | - uses: actions/checkout@v3 63 | - name: Setup Python 64 | uses: actions/setup-python@v4 65 | with: 66 | python-version: ${{ matrix.python }} 67 | - name: Install Tox and any other packages 68 | run: pip install tox 69 | - name: Run Tox 70 | run: tox -e py 71 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # Distribution / packaging 6 | develop-eggs/ 7 | bin/ 8 | dist/ 9 | build/ 10 | parts/ 11 | .tox/ 12 | .installed.cfg 13 | *.egg-info 14 | *.swp 15 | 16 | # Temporary files 17 | *~ 18 | geckodriver 19 | 20 | # Log files 21 | geckodriver.log 22 | 23 | # Sphinx documentation 24 | docs/_build/ 25 | -------------------------------------------------------------------------------- /.hgignore: -------------------------------------------------------------------------------- 1 | # use glob syntax. 2 | syntax: glob 3 | 4 | develop-eggs/ 5 | bin/ 6 | dist/ 7 | build/ 8 | parts/ 9 | docs/_build/ 10 | .tox/ 11 | .installed.cfg 12 | *.egg-info 13 | *.pyc 14 | *.swp 15 | *~ 16 | -------------------------------------------------------------------------------- /CHANGES.rst: -------------------------------------------------------------------------------- 1 | 2.0.2 (unreleased) 2 | ------------------ 3 | 4 | - Nothing changed yet. 
5 | 6 | 7 | 2.0.1 (2024-08-30) 8 | ------------------ 9 | 10 | - Breaking change: it seems it is no longer possible to use the HTML parser with an XML file, so this is no longer tested 11 | 12 | - Drop support for python 3.7 13 | 14 | 2.0.0 (2022-12-28) 15 | ------------------ 16 | 17 | - Breaking change: inputs starting with ``"http://"`` or ``"https://"`` like 18 | ``PyQuery("http://example.com")`` will no longer fetch the contents of the URL. 19 | Users desiring the old behavior should switch to ``PyQuery(url="http://example.com")``. 20 | 21 | - Add nextUntil method 22 | 23 | - ``.remove()`` no longer inserts a space in place of the removed element 24 | 25 | - Fix escaping of top-level element text in ``.html()`` output 26 | 27 | - Support (and require) cssselect 1.2+ 28 | 29 | - Drop support for python 3.5/3.6 30 | 31 | 32 | 1.4.3 (2020-11-21) 33 | ------------------ 34 | 35 | - No longer use a universal wheel 36 | 37 | 38 | 1.4.2 (2020-11-21) 39 | ------------------ 40 | 41 | - Fix exception raised when calling `PyQuery("").text()` 42 | 43 | - python2 is no longer supported 44 | 45 | 1.4.1 (2019-10-26) 46 | ------------------ 47 | 48 | - This is the last release with py2 support 49 | 50 | - Remove py33, py34 support 51 | 52 | - web scraping improvements: default timeout and session support 53 | 54 | - Add API methods to serialize form-related elements according to spec 55 | 56 | - Include HTML markup when querying textarea text/value 57 | 58 | 59 | 1.4.0 (2018-01-11) 60 | ------------------ 61 | 62 | - Refactoring of `.text()` to match firefox behavior. 63 | 64 | 65 | 1.3.0 (2017-10-21) 66 | ------------------ 67 | 68 | - Remove some unmaintained modules: ``pyquery.ajax`` and ``pyquery.rules`` 69 | 70 | - Code cleanup. No longer use ugly hacks required by python2.6/python3.2. 
71 | 72 | - Run tests with python3.6 on CI 73 | 74 | - Add a ``method`` argument to ``.outer_html()`` 75 | 76 | 77 | 1.2.17 (2016-10-14) 78 | ------------------- 79 | 80 | - ``PyQuery('').val()`` is ``''`` 81 | - ``PyQuery('').val()`` is ``''`` 82 | 83 | 84 | 1.2.16 (2016-10-14) 85 | ------------------- 86 | 87 | - ``.attr('value', '')`` no longer removes the ``value`` attribute 88 | 89 | - ```` without ``value="..."`` have a ``.val()`` of 90 | ``'on'`` 91 | 92 | - ```` without ``value="..."`` have a ``.val()`` of 93 | ``'on'`` 94 | 95 | - ``')) 226 | >>> d(':input') 227 | [, ')) 196 | >>> d(':input') 197 | [, 434 | 437 | ''' 438 | 439 | html4 = ''' 440 | 444 | 449 | 451 | 456 | ''' 457 | 458 | html6 = ''' 459 | 464 | 469 | 474 | ''' 475 | 476 | html5 = ''' 477 |
478 | 479 | 480 | 481 |
482 | ''' 483 | 484 | def test_attr_empty_string(self): 485 | d = pq('
') 486 | d.attr('value', '') 487 | self.assertEqual(d.outer_html(), '
') 488 | self.assertEqual(d.outer_html(method="xml"), '
') 489 | 490 | def test_remove(self): 491 | d = pq(self.html) 492 | d('img').remove() 493 | val = d('a:first').html() 494 | assert val == 'TestMy link text', repr(val) 495 | val = d('a:last').html() 496 | assert val == 'My link text 2', repr(val) 497 | 498 | def test_class(self): 499 | d = pq('
') 500 | d.removeClass('xx') 501 | assert 'class' not in str(d), str(d) 502 | 503 | def test_val_for_inputs(self): 504 | d = pq(self.html2) 505 | self.assertIsNone(d('input[name="none"]').val()) 506 | self.assertEqual(d('input[name="spam"]').val(), 'Spam') 507 | self.assertEqual(d('input[name="eggs"]').val(), 'Eggs') 508 | self.assertEqual(d('input:checkbox').val(), 'Bacon') 509 | self.assertEqual(d('input:radio').val(), 'Ham') 510 | d('input[name="spam"]').val('42') 511 | d('input[name="eggs"]').val('43') 512 | d('input:checkbox').val('44') 513 | d('input:radio').val('45') 514 | self.assertEqual(d('input[name="spam"]').val(), '42') 515 | self.assertEqual(d('input[name="eggs"]').val(), '43') 516 | self.assertEqual(d('input:checkbox').val(), '44') 517 | self.assertEqual(d('input:radio').val(), '45') 518 | 519 | def test_val_for_inputs_with_newline(self): 520 | d = pq(self.html2_newline) 521 | self.assertEqual(d('#newline-text').val(), 'Spam') 522 | self.assertEqual(d('#newline-radio').val(), 'S\npam') 523 | 524 | def test_val_for_textarea(self): 525 | d = pq(self.html3) 526 | self.assertEqual(d('#textarea-single').val(), 'Spam') 527 | self.assertEqual(d('#textarea-single').text(), 'Spam') 528 | d('#textarea-single').val('42') 529 | self.assertEqual(d('#textarea-single').val(), '42') 530 | # Note: jQuery still returns 'Spam' here. 
531 | self.assertEqual(d('#textarea-single').text(), '42') 532 | 533 | multi_expected = '''Spam\nEggs\nBacon''' 534 | self.assertEqual(d('#textarea-multi').val(), multi_expected) 535 | self.assertEqual(d('#textarea-multi').text(), multi_expected) 536 | multi_new = '''Bacon\nEggs\nSpam''' 537 | multi_new_expected = '''Bacon\n<b>Eggs</b>\nSpam''' 538 | d('#textarea-multi').val(multi_new) 539 | self.assertEqual(d('#textarea-multi').val(), multi_new_expected) 540 | self.assertEqual(d('#textarea-multi').text(), multi_new_expected) 541 | 542 | def test_val_for_select(self): 543 | d = pq(self.html4) 544 | self.assertEqual(d('#first').val(), 'spam') 545 | self.assertEqual(d('#second').val(), 'eggs') 546 | self.assertIsNone(d('#third').val()) 547 | d('#first').val('eggs') 548 | d('#second').val('bacon') 549 | d('#third').val('eggs') # Selecting non-existing option. 550 | self.assertEqual(d('#first').val(), 'eggs') 551 | self.assertEqual(d('#second').val(), 'bacon') 552 | self.assertIsNone(d('#third').val()) 553 | d('#first').val('bacon') # Selecting non-existing option. 554 | self.assertEqual(d('#first').val(), 'spam') 555 | # Value set based on option order, not value order 556 | d('#second').val(['bacon', 'eggs']) 557 | self.assertEqual(d('#second').val(), 'eggs') 558 | d('#fourth').val(['spam']) 559 | self.assertEqual(d('#fourth').val(), 'spam') 560 | # Sets first option with matching value 561 | self.assertEqual(d('#fourth option[selected]').length, 1) 562 | self.assertEqual(d('#fourth option[selected]').text(), 'Spam') 563 | 564 | def test_val_for_select_multiple(self): 565 | d = pq(self.html6) 566 | self.assertEqual(d('#first').val(), ['spam', 'eggs']) 567 | # Selecting non-existing option. 
568 | d('#first').val(['eggs', 'sausage', 'bacon']) 569 | self.assertEqual(d('#first').val(), ['eggs', 'bacon']) 570 | self.assertEqual(d('#second').val(), []) 571 | d('#second').val('eggs') 572 | self.assertEqual(d('#second').val(), ['eggs']) 573 | d('#second').val(['not spam', 'not eggs']) 574 | self.assertEqual(d('#second').val(), []) 575 | d('#third').val(['spam']) 576 | self.assertEqual(d('#third').val(), ['spam', 'spam', 'spam']) 577 | 578 | def test_val_for_input_and_textarea_given_array_value(self): 579 | d = pq('') 580 | d('input').val(['spam', 'eggs']) 581 | self.assertEqual(d('input').val(), 'spam,eggs') 582 | d = pq('') 583 | d('textarea').val(['spam', 'eggs']) 584 | self.assertEqual(d('textarea').val(), 'spam,eggs') 585 | 586 | def test_val_for_multiple_elements(self): 587 | d = pq(self.html5) 588 | # "Get" returns *first* value. 589 | self.assertEqual(d('div > *').val(), 'spam') 590 | # "Set" updates *every* value. 591 | d('div > *').val('42') 592 | self.assertEqual(d('#first').val(), '42') 593 | self.assertEqual(d('#second').val(), '42') 594 | self.assertEqual(d('#third').val(), '42') 595 | 596 | def test_val_checkbox_no_value_attribute(self): 597 | d = pq('') 598 | self.assertEqual(d.val(), 'on') 599 | d = pq('') 600 | self.assertEqual(d.val(), '') 601 | 602 | def test_val_radio_no_value_attribute(self): 603 | d = pq('') 604 | self.assertEqual(d.val(), 'on') 605 | 606 | def test_val_value_is_empty_string(self): 607 | d = pq('') 608 | self.assertEqual(d.val(), '') 609 | 610 | def test_val_input_has_no_value_attr(self): 611 | d = pq('') 612 | self.assertEqual(d.val(), '') 613 | 614 | def test_html_replacement(self): 615 | html = '
Not MeReplace MeNot Me
' 616 | replacement = 'New Contents New' 617 | expected = html.replace('Replace Me', replacement) 618 | 619 | d = pq(html) 620 | d.find('span').html(replacement) 621 | 622 | new_html = d.outerHtml() 623 | self.assertEqual(new_html, expected) 624 | self.assertIn(replacement, new_html) 625 | 626 | def test_html_escape(self): 627 | inner_html = 'encoded <script> tag with "quotes".' \ 628 | 'nested <tag>' 629 | html = '
' + inner_html + '
' 630 | d = pq(html) 631 | self.assertEqual(d.html(), inner_html) 632 | 633 | 634 | class TestAjax(TestCase): 635 | 636 | html = ''' 637 |
638 | 639 |
640 |
641 |
642 | 643 | 644 | 645 |
646 |
647 | 648 |
649 |
650 | 651 | 652 |
653 | ''' 654 | 655 | html2 = ''' 656 |
657 | 658 |
659 | 660 | 661 |
662 |
663 |
664 |
665 | 666 | 667 |
668 | ''' 669 | 670 | html3 = ''' 671 |
672 | 673 | 674 |
675 | 676 |
677 | 678 | 679 | 680 | 681 | 682 | 683 | 684 | 685 | 686 |
687 | ''' 688 | 689 | html4 = ''' 690 |
691 | 693 | 699 | 702 |
703 | ''' 704 | 705 | def test_serialize_pairs_form_id(self): 706 | d = pq(self.html) 707 | self.assertEqual(d('#div').serialize_pairs(), []) 708 | self.assertEqual(d('#dispersed').serialize_pairs(), [ 709 | ('order', 'spam'), ('order', 'eggs'), ('order', 'ham'), 710 | ('order', 'tomato'), ('order', 'baked beans'), 711 | ]) 712 | self.assertEqual(d('.no-id').serialize_pairs(), [ 713 | ('spam', 'Spam'), 714 | ]) 715 | 716 | def test_serialize_pairs_form_controls(self): 717 | d = pq(self.html2) 718 | self.assertEqual(d('fieldset').serialize_pairs(), [ 719 | ('fieldset', 'eggs'), ('fieldset', 'ham'), 720 | ]) 721 | self.assertEqual(d('#input, fieldset, #first').serialize_pairs(), [ 722 | ('order', 'spam'), ('fieldset', 'eggs'), ('fieldset', 'ham'), 723 | ('fieldset', 'eggs'), ('fieldset', 'ham'), ('fieldset', 'ham'), 724 | ]) 725 | self.assertEqual(d('#datalist').serialize_pairs(), [ 726 | ('datalist', 'eggs'), ('checkbox', 'on'), ('radio', 'on'), 727 | ]) 728 | 729 | def test_serialize_pairs_filter_controls(self): 730 | d = pq(self.html3) 731 | self.assertEqual(d('form').serialize_pairs(), [ 732 | ('order', 'spam') 733 | ]) 734 | 735 | def test_serialize_pairs_form_values(self): 736 | d = pq(self.html4) 737 | self.assertEqual(d('form').serialize_pairs(), [ 738 | ('spam', 'Spam/spam'), ('order', 'baked\r\nbeans'), 739 | ('order', 'tomato'), ('multiline', 'multiple\r\nlines\r\nof text'), 740 | ]) 741 | 742 | def test_serialize_array(self): 743 | d = pq(self.html4) 744 | self.assertEqual(d('form').serialize_array(), [ 745 | {'name': 'spam', 'value': 'Spam/spam'}, 746 | {'name': 'order', 'value': 'baked\r\nbeans'}, 747 | {'name': 'order', 'value': 'tomato'}, 748 | {'name': 'multiline', 'value': 'multiple\r\nlines\r\nof text'}, 749 | ]) 750 | 751 | def test_serialize(self): 752 | d = pq(self.html4) 753 | self.assertEqual( 754 | d('form').serialize(), 755 | 'spam=Spam%2Fspam&order=baked%0D%0Abeans&order=tomato&' 756 | 'multiline=multiple%0D%0Alines%0D%0Aof%20text' 757 | ) 
758 | 759 | def test_serialize_dict(self): 760 | d = pq(self.html4) 761 | self.assertEqual(d('form').serialize_dict(), { 762 | 'spam': 'Spam/spam', 763 | 'order': ['baked\r\nbeans', 'tomato'], 764 | 'multiline': 'multiple\r\nlines\r\nof text', 765 | }) 766 | 767 | 768 | class TestMakeLinks(TestCase): 769 | 770 | html = ''' 771 | 772 |
773 | with href 774 | without href 775 |
776 | 777 | ''' 778 | 779 | def test_make_link(self): 780 | d = pq(self.html, parser='xml') 781 | d.make_links_absolute(base_url='http://example.com') 782 | self.assertTrue(len(d('a[href]')), 1) 783 | self.assertEqual(d('a[href]').attr('href'), 784 | 'http://example.com/path_info') 785 | 786 | 787 | class TestHTMLParser(TestCase): 788 | xml = "
I'm valid XML
" 789 | html = '''
790 | TestimageMy link text 791 | imageMy link text 2 792 | Behind you, a three-headed HTML‐Entity! 793 |
''' 794 | 795 | def test_parser_persistance(self): 796 | d = pq(self.xml, parser='xml') 797 | self.assertRaises(etree.XMLSyntaxError, lambda: d.after(self.html)) 798 | d = pq(self.xml, parser='html') 799 | d.after(self.html) # this should not fail 800 | 801 | def test_replaceWith(self): 802 | expected = '''
803 | TestimageMy link text 804 | imageMy link text 2 805 | Behind you, a three-headed HTML&dash;Entity! 806 |
''' 807 | d = pq(self.html) 808 | d('img').replace_with('image') 809 | val = d.__html__() 810 | assert val == expected, (repr(val), repr(expected)) 811 | 812 | def test_replaceWith_with_function(self): 813 | expected = '''
814 | TestimageMy link text 815 | imageMy link text 2 816 | Behind you, a three-headed HTML&dash;Entity! 817 |
''' 818 | d = pq(self.html) 819 | d('a').replace_with(lambda i, e: pq(e).html()) 820 | val = d.__html__() 821 | assert val == expected, (repr(val), repr(expected)) 822 | 823 | 824 | class TestXMLNamespace(TestCase): 825 | xml = ''' 826 | 827 | What 828 | 123 829 | 830 | 831 | 832 | ''' 833 | 834 | xhtml = ''' 835 | 836 | 837 |
What
838 | 839 | ''' 840 | 841 | namespaces = {'bar': 'http://example.com/bar', 842 | 'baz': 'http://example.com/baz'} 843 | 844 | def test_selector(self): 845 | expected = 'What' 846 | d = pq(self.xml.encode('utf8'), parser='xml') 847 | val = d('bar|blah', 848 | namespaces=self.namespaces).text() 849 | self.assertEqual(repr(val), repr(expected)) 850 | 851 | def test_selector_with_xml(self): 852 | expected = 'What' 853 | d = pq('bar|blah', self.xml.encode('utf8'), parser='xml', 854 | namespaces=self.namespaces) 855 | val = d.text() 856 | self.assertEqual(repr(val), repr(expected)) 857 | 858 | def test_xhtml_namespace(self): 859 | expected = 'What' 860 | d = pq(self.xhtml.encode('utf8'), parser='xml') 861 | d.xhtml_to_html() 862 | val = d('div').text() 863 | self.assertEqual(repr(val), repr(expected)) 864 | 865 | def test_xhtml_namespace_html_parser(self): 866 | expected = 'What' 867 | d = pq(self.xhtml, parser='html') 868 | d.xhtml_to_html() 869 | val = d('div').text() 870 | self.assertEqual(repr(val), repr(expected)) 871 | 872 | def test_remove_namespaces(self): 873 | expected = 'What' 874 | d = pq(self.xml.encode('utf8'), parser='xml').remove_namespaces() 875 | val = d('blah').text() 876 | self.assertEqual(repr(val), repr(expected)) 877 | 878 | def test_persistent_namespaces(self): 879 | d = pq(self.xml.encode('utf8'), parser='xml', 880 | namespaces=self.namespaces) 881 | val = d('bar|blah').text() 882 | self.assertEqual(repr(val), repr('What')) 883 | 884 | def test_namespace_traversal(self): 885 | d = pq(self.xml.encode('utf8'), parser='xml', 886 | namespaces=self.namespaces) 887 | val = d('baz|subbaz').closest('baz|baz').attr('a') 888 | self.assertEqual(repr(val), repr('b')) 889 | 890 | 891 | class TestWebScrapping(TestCase): 892 | 893 | def setUp(self): 894 | self.s = http.StopableWSGIServer.create(debug_app) 895 | self.s.wait() 896 | self.application_url = self.s.application_url.rstrip('/') 897 | 898 | def test_get(self): 899 | d = pq(url=self.application_url, 
data={'q': 'foo'}, 900 | method='get') 901 | print(d) 902 | self.assertIn('REQUEST_METHOD: GET', d('p').text()) 903 | self.assertIn('q=foo', d('p').text()) 904 | 905 | def test_post(self): 906 | d = pq(url=self.application_url, data={'q': 'foo'}, 907 | method='post') 908 | self.assertIn('REQUEST_METHOD: POST', d('p').text()) 909 | self.assertIn('q=foo', d('p').text()) 910 | 911 | def test_session(self): 912 | if HAS_REQUEST: 913 | import requests 914 | session = requests.Session() 915 | session.headers.update({'X-FOO': 'bar'}) 916 | d = pq(url=self.application_url, data={'q': 'foo'}, 917 | method='get', session=session) 918 | self.assertIn('HTTP_X_FOO: bar', d('p').text()) 919 | else: 920 | self.skipTest('no requests library') 921 | 922 | def tearDown(self): 923 | self.s.shutdown() 924 | 925 | 926 | class TestWebScrappingEncoding(TestCase): 927 | 928 | def test_get(self): 929 | d = pq(url='http://ru.wikipedia.org/wiki/Заглавная_страница', 930 | method='get') 931 | print(d) 932 | self.assertEqual(d('#pt-login').text(), 'Войти') 933 | 934 | 935 | class TestWebScrappingTimeouts(TestCase): 936 | 937 | def setUp(self): 938 | def app(environ, start_response): 939 | start_response('200 OK', [('Content-Type', 'text/plain')]) 940 | time.sleep(2) 941 | return [b'foobar\n'] 942 | self.s = http.StopableWSGIServer.create(app) 943 | self.s.wait() 944 | self.application_url = self.s.application_url.rstrip('/') 945 | 946 | def test_get(self): 947 | pq(url=self.application_url) 948 | with self.assertRaises(Exception): 949 | pq(url=self.application_url, timeout=1) 950 | 951 | def tearDown(self): 952 | self.s.shutdown() 953 | -------------------------------------------------------------------------------- /tests/test_real_browser.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | from threading import Thread 4 | from time import sleep 5 | 6 | from .browser_base import TextExtractionMixin 7 | 8 | SELENIUM = 
'MOZ_HEADLESS' in os.environ 9 | 10 | try: 11 | from selenium import webdriver 12 | from selenium.webdriver.firefox.options import Options 13 | except ImportError: 14 | SELENIUM = False 15 | 16 | if SELENIUM: 17 | from urllib.parse import urlunsplit 18 | from http.server import HTTPServer, BaseHTTPRequestHandler 19 | from queue import Queue 20 | 21 | class BaseTestRequestHandler(BaseHTTPRequestHandler): 22 | _last_html = '' 23 | 24 | def _get_last_html(self): 25 | q = self.server.html_queue 26 | while not q.empty(): 27 | self._last_html = q.get_nowait() 28 | return self._last_html 29 | 30 | def log_request(self, code='-', size='-'): 31 | pass 32 | 33 | def recv_from_testsuite(self, non_blocking=False): 34 | q = self.server.in_queue 35 | if non_blocking: 36 | return None if q.empty() else q.get_nowait() 37 | return q.get() 38 | 39 | def send_to_testsuite(self, value): 40 | self.server.out_queue.put(value) 41 | 42 | class HTMLSnippetSender(BaseTestRequestHandler): 43 | last_html = b'' 44 | 45 | def get_last_html(self): 46 | while True: 47 | value = self.recv_from_testsuite(non_blocking=True) 48 | if value is None: 49 | break 50 | self.last_html = value 51 | return self.last_html 52 | 53 | def do_GET(self): 54 | if self.path == '/': 55 | self.send_response(200) 56 | self.send_header('Content-Type', 'text/html; charset=utf-8') 57 | self.end_headers() 58 | self.wfile.write(self.get_last_html().encode('utf-8')) 59 | else: 60 | self.send_response(404) 61 | self.end_headers() 62 | 63 | class BaseBrowserTest(unittest.TestCase): 64 | LOCAL_IP = '127.0.0.1' 65 | PORT = 28546 66 | # descendant of BaseBrowserTestRequestHandler 67 | REQUEST_HANDLER_CLASS = None 68 | 69 | @classmethod 70 | def setUpClass(cls): 71 | cls.to_server_queue = Queue() 72 | cls.from_server_queue = Queue() 73 | cls.server = HTTPServer((cls.LOCAL_IP, cls.PORT), 74 | cls.REQUEST_HANDLER_CLASS) 75 | cls.server.in_queue = cls.to_server_queue 76 | cls.server.out_queue = cls.from_server_queue 77 | 
cls.server_thread = Thread(target=cls.server.serve_forever) 78 | cls.server_thread.daemon = True 79 | cls.server_thread.start() 80 | options = Options() 81 | options.add_argument('-headless') 82 | cls.driver = webdriver.Firefox(options=options) 83 | sleep(1) 84 | 85 | @classmethod 86 | def tearDownClass(cls): 87 | cls.driver.quit() 88 | cls.server.shutdown() 89 | cls.server.server_close() 90 | 91 | def send_to_server(self, value): 92 | self.to_server_queue.put(value) 93 | 94 | def recv_from_server(self, non_blocking=False): 95 | q = self.from_server_queue 96 | if non_blocking: 97 | return None if q.empty() else q.get_nowait() 98 | return q.get() 99 | 100 | def open_url(self, path): 101 | self.driver.get(urlunsplit( 102 | ('http', '{}:{}'.format( 103 | self.LOCAL_IP, self.PORT), path, '', ''))) 104 | 105 | class TestInnerText(BaseBrowserTest, TextExtractionMixin): 106 | REQUEST_HANDLER_CLASS = HTMLSnippetSender 107 | 108 | def _simple_test(self, html, expected_sq, expected_nosq, **kwargs): 109 | self.send_to_server(html) 110 | self.open_url('/') 111 | 112 | selenium_text = self.driver.find_element_by_tag_name('body').text 113 | self.assertEqual(selenium_text, expected_sq) 114 | 115 | # inner_text = self.driver.execute_script( 116 | # 'return document.body.innerText') 117 | # text_content = self.driver.execute_script( 118 | # 'return document.body.textContent') 119 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist=py38,py39,py310,py311,py312 3 | 4 | [testenv] 5 | whitelist_externals= 6 | rm 7 | passenv= 8 | MOZ_HEADLESS 9 | commands = 10 | pytest [] 11 | deps = 12 | py38: selenium 13 | -e .[test] 14 | 15 | [testenv:lint] 16 | skipsdist=true 17 | skip_install=true 18 | basepython = python3.11 19 | commands = 20 | ruff check 21 | deps = 22 | ruff 23 | 24 | [testenv:docs] 25 | skip_install=false 26 | skipsdist=true 27 | 
basepython = python3.11 28 | changedir = docs 29 | deps = 30 | sphinx 31 | Pygments 32 | allowlist_externals = 33 | rm 34 | commands = 35 | rm -Rf {envtmpdir}/doctrees {envtmpdir}/html 36 | sphinx-build -b html -d {envtmpdir}/doctrees . {envtmpdir}/html 37 | 38 | # [testenv:selenium] 39 | # basepython = python3.5 40 | # deps = 41 | # selenium 42 | # commands = 43 | # {envbindir}/python -m unittest seleniumtests.offline 44 | # {envbindir}/python -m unittest seleniumtests.browser 45 | --------------------------------------------------------------------------------