├── .gitignore ├── LICENSE ├── README.md ├── __init__.py ├── docs ├── Makefile ├── addon.png ├── conf.py ├── configuration.rst ├── faq.rst ├── index.rst ├── installation.rst ├── make.bat ├── misc.png ├── usage.md └── userpref.png ├── html.py ├── icon.png ├── ipfs.py ├── modules └── bs4 │ ├── AUTHORS.txt │ ├── LICENSE │ ├── README.txt │ ├── __init__.py │ ├── __init__.py.bak │ ├── builder │ ├── __init__.py │ ├── __init__.py.bak │ ├── _html5lib.py │ ├── _html5lib.py.bak │ ├── _htmlparser.py │ ├── _htmlparser.py.bak │ ├── _lxml.py │ └── _lxml.py.bak │ ├── dammit.py │ ├── dammit.py.bak │ ├── diagnose.py │ ├── diagnose.py.bak │ ├── element.py │ ├── element.py.bak │ ├── testing.py │ ├── testing.py.bak │ └── tests │ ├── __init__.py │ ├── test_builder_registry.py │ ├── test_docs.py │ ├── test_html5lib.py │ ├── test_html5lib.py.bak │ ├── test_htmlparser.py │ ├── test_lxml.py │ ├── test_lxml.py.bak │ ├── test_soup.py │ ├── test_soup.py.bak │ ├── test_tree.py │ └── test_tree.py.bak ├── primitives ├── capsule.obj ├── cone.obj ├── cube.obj ├── cylinder.obj ├── pipe.obj ├── plane.obj ├── pyramid.obj ├── sphere.obj └── torus.obj ├── vr_export.py ├── vr_import.py └── zipthis.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | 47 | # Translations 48 | *.mo 49 | *.pot 50 | 51 | # Django stuff: 52 | *.log 53 | 54 | # Sphinx documentation 55 | docs/_build/ 56 | 57 | # PyBuilder 58 | target/ 59 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | FireVR 2 | 3 | # To the extent possible under law, the persons who associated CC0 with 4 | # FireVR has waived all copyright and related or neighboring rights 5 | # to FireVR. 6 | 7 | # You should have received a copy of the CC0 legalcode along with this 8 | # work. If not, see . 9 | 10 | This license does not apply to other external libraries in this directory, which specify their own LICENSE. 11 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | #Fire: JanusVR Exporter & IPFS Publisher 2 | 3 | ##Description 4 | 5 | Fire is a blender addon that exports the current scene to the JanusVR FireBoxHTML-format. 6 | It generates the XML description of the room automatically, exports the objects and provides instant publication over the IPFS network (no server needed!). It can also export the room to a local destination (no ipfs required). 7 | 8 | ##Requirements 9 | 10 | - [JanusVR](http://www.janusvr.com/) 11 | 12 | Optional: 13 | - [IPFS](http://ipfs.io/docs/install/) (only required for IPFS publication) 14 | 15 | To use IPFS: 16 | - IPFS must be present in the PATH. 17 | - When not using the IPFS gateway, you need to run the IPFS daemon locally ("ipfs daemon"). 18 | 19 | This addon was tested under Ubuntu/Linux and Windows. 
Your feedback is welcome! 20 | 21 | ##Documentation 22 | https://firevr.readthedocs.io/en/latest/ 23 | 24 | ##Installation 25 | 26 | 1. Download this repository as .zip file 27 | 2. Blender -> User Preferences -> Addons -> Install from File -> Select the .zip, enable the addon 28 | 29 | ##Usage 30 | 31 | 1. Create your scene 32 | 2. Blender -> 3D View -> Tool Shelf -> Misc -> Set your room and object attributes using the panel options 33 | 3. Blender -> 3D View -> Tool Shelf -> Misc -> Set the JanusVR and Export target directories 34 | 4. Click on Start JanusVR to export and launch your room in JanusVR 35 | 36 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = FireVR 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | 22 | # make livehtml 23 | livehtml: 24 | sphinx-autobuild -b html . 
$(BUILDDIR)/html 25 | -------------------------------------------------------------------------------- /docs/addon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Spyduck/FireVR/535fbbe47bef72c8302da7a1c6334760896e5ce4/docs/addon.png -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # FireVR documentation build configuration file, created by 5 | # sphinx-quickstart on Sun Feb 5 05:57:31 2017. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | # 20 | # import os 21 | # import sys 22 | # sys.path.insert(0, os.path.abspath('.')) 23 | 24 | 25 | # -- General configuration ------------------------------------------------ 26 | 27 | # If your documentation needs a minimal Sphinx version, state it here. 28 | # 29 | # needs_sphinx = '1.0' 30 | 31 | # Add any Sphinx extension module names here, as strings. They can be 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 33 | # ones. 34 | extensions = [] 35 | 36 | # Add any paths that contain templates here, relative to this directory. 37 | templates_path = ['_templates'] 38 | 39 | # The suffix(es) of source filenames. 
40 | # You can specify multiple suffix as a list of string: 41 | # 42 | # source_suffix = ['.rst', '.md'] 43 | source_suffix = '.rst' 44 | 45 | # The master toctree document. 46 | master_doc = 'index' 47 | 48 | # General information about the project. 49 | project = 'FireVR' 50 | copyright = '2017, void' 51 | author = 'void' 52 | 53 | # The version info for the project you're documenting, acts as replacement for 54 | # |version| and |release|, also used in various other places throughout the 55 | # built documents. 56 | # 57 | # The short X.Y version. 58 | version = '' 59 | # The full version, including alpha/beta/rc tags. 60 | release = '' 61 | 62 | # The language for content autogenerated by Sphinx. Refer to documentation 63 | # for a list of supported languages. 64 | # 65 | # This is also used if you do content translation via gettext catalogs. 66 | # Usually you set "language" from the command line for these cases. 67 | language = None 68 | 69 | # List of patterns, relative to source directory, that match files and 70 | # directories to ignore when looking for source files. 71 | # This patterns also effect to html_static_path and html_extra_path 72 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 73 | 74 | # The name of the Pygments (syntax highlighting) style to use. 75 | pygments_style = 'sphinx' 76 | 77 | # If true, `todo` and `todoList` produce output, else they produce nothing. 78 | todo_include_todos = False 79 | 80 | 81 | # -- Options for HTML output ---------------------------------------------- 82 | 83 | # The theme to use for HTML and HTML Help pages. See the documentation for 84 | # a list of builtin themes. 85 | # 86 | html_theme = 'alabaster' 87 | 88 | # Theme options are theme-specific and customize the look and feel of a theme 89 | # further. For a list of options available for each theme, see the 90 | # documentation. 
91 | # 92 | # html_theme_options = {} 93 | 94 | # Add any paths that contain custom static files (such as style sheets) here, 95 | # relative to this directory. They are copied after the builtin static files, 96 | # so a file named "default.css" will overwrite the builtin "default.css". 97 | html_static_path = ['_static'] 98 | 99 | 100 | # -- Options for HTMLHelp output ------------------------------------------ 101 | 102 | # Output file base name for HTML help builder. 103 | htmlhelp_basename = 'FireVRdoc' 104 | 105 | 106 | # -- Options for LaTeX output --------------------------------------------- 107 | 108 | latex_elements = { 109 | # The paper size ('letterpaper' or 'a4paper'). 110 | # 111 | # 'papersize': 'letterpaper', 112 | 113 | # The font size ('10pt', '11pt' or '12pt'). 114 | # 115 | # 'pointsize': '10pt', 116 | 117 | # Additional stuff for the LaTeX preamble. 118 | # 119 | # 'preamble': '', 120 | 121 | # Latex figure (float) alignment 122 | # 123 | # 'figure_align': 'htbp', 124 | } 125 | 126 | # Grouping the document tree into LaTeX files. List of tuples 127 | # (source start file, target name, title, 128 | # author, documentclass [howto, manual, or own class]). 129 | latex_documents = [ 130 | (master_doc, 'FireVR.tex', 'FireVR Documentation', 131 | 'void', 'manual'), 132 | ] 133 | 134 | 135 | # -- Options for manual page output --------------------------------------- 136 | 137 | # One entry per manual page. List of tuples 138 | # (source start file, name, description, authors, manual section). 139 | man_pages = [ 140 | (master_doc, 'firevr', 'FireVR Documentation', 141 | [author], 1) 142 | ] 143 | 144 | 145 | # -- Options for Texinfo output ------------------------------------------- 146 | 147 | # Grouping the document tree into Texinfo files. 
List of tuples 148 | # (source start file, target name, title, author, 149 | # dir menu entry, description, category) 150 | texinfo_documents = [ 151 | (master_doc, 'FireVR', 'FireVR Documentation', 152 | author, 'FireVR', 'One line description of project.', 153 | 'Miscellaneous'), 154 | ] 155 | 156 | from recommonmark.parser import CommonMarkParser 157 | 158 | source_parsers = { 159 | '.md': CommonMarkParser, 160 | } 161 | 162 | source_suffix = ['.rst', '.md'] 163 | 164 | import sphinx_rtd_theme 165 | 166 | html_theme = "sphinx_rtd_theme" 167 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] 168 | -------------------------------------------------------------------------------- /docs/configuration.rst: -------------------------------------------------------------------------------- 1 | ============= 2 | Configuration 3 | ============= 4 | 5 | You can find FireVR in the Misc Tab in the Toolbar (key: t). 6 | 7 | .. image:: misc.png 8 | 9 | Now, two last things are necessary: 10 | 11 | 1. Set the Export Path in the Export settings (this is where all files will be saved). 12 | 2. Set the JanusVR path in the Run settings (select the Janus executable) 13 | 14 | To test if the settings are correct, click "Start JanusVR" - you should now see the default cube floating in space. 15 | 16 | To make the configuration permanent, save the user settings again. 17 | -------------------------------------------------------------------------------- /docs/faq.rst: -------------------------------------------------------------------------------- 1 | === 2 | FAQ 3 | === 4 | 5 | The texture doesn't load 6 | """"""""""""""""""""""""" 7 | Consider using the Wavefront .obj format for now. 8 | Currently, only textures with the Source: Single Image is supported. 9 | 10 | The objects are loading slowly 11 | """""""""""""""""""""""""""""""" 12 | 13 | Consider running IPFS locally, or disabling IPFS. 
14 | 15 | The objects are rotated incorrectly 16 | """""""""""""""""""""""""""""""" 17 | 18 | Use Apply Rotation under export options 19 | 20 | The objects are scaled incorrectly 21 | """""""""""""""""""""""""""""""" 22 | 23 | Use Apply Scale under export options 24 | 25 | I used it and it messed with all my rotations and scales 26 | """""""""""""""""""""""""""""""" 27 | 28 | Turn off Apply Rotation and Apply Scale under export options 29 | 30 | Getting errors about file paths 31 | """""""""""""""""""""""""""""""" 32 | 33 | Always use absolute paths, either disable "use relative paths" under user preferences or unclick the relative path checkbox when selecting your file. 34 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. FireVR documentation master file, created by 2 | sphinx-quickstart on Sun Feb 5 05:57:31 2017. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to FireVR's documentation! 7 | ================================== 8 | 9 | .. toctree:: 10 | :maxdepth: 2 11 | :caption: Contents: 12 | 13 | installation 14 | configuration 15 | usage.md 16 | faq 17 | 18 | 19 | Indices and tables 20 | ================== 21 | 22 | * :ref:`genindex` 23 | * :ref:`modindex` 24 | * :ref:`search` 25 | -------------------------------------------------------------------------------- /docs/installation.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | Installation 3 | ============ 4 | 5 | Use this link to download FireVR: 6 | 7 | https://github.com/void4/FireVR/archive/master.zip 8 | 9 | Or go to https://github.com/void4/FireVR and select the green "Clone or download" button at the top right, then "Download ZIP". 10 | 11 | Make sure you have the latest Blender version (currently: 2.78a). 
12 | Next, start Blender and enter the User Preferences. 13 | 14 | .. image:: userpref.png 15 | 16 | Then 17 | 18 | 1. Install from File 19 | 2. Activate the addon by marking the checkbox 20 | 3. Save User Settings to make the installation permanent 21 | 22 | .. image:: addon.png 23 | 24 | Done! 25 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | set SPHINXPROJ=FireVR 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 20 | echo.installed, then set the SPHINXBUILD environment variable to point 21 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 22 | echo.may add the Sphinx directory to PATH. 23 | echo. 
24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /docs/misc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Spyduck/FireVR/535fbbe47bef72c8302da7a1c6334760896e5ce4/docs/misc.png -------------------------------------------------------------------------------- /docs/usage.md: -------------------------------------------------------------------------------- 1 | # Usage 2 | 3 | Check the JanusVR docs for more details: 4 | 5 | [http://www.janusvr.com/guide/markuplanguage/index.html 6 | ](http://www.janusvr.com/guide/markuplanguage/index.html) 7 | ### Firebox 8 | 9 | - **Start JanusVR** All-in-one button, exports room and launches Janus 10 | - **Export FireBox** Only exports the objects and generates the room 11 | 12 | ### Export Settings 13 | 14 | - **Export path** The local directory where the rooms are exported to 15 | - **Use IPFS** Check this to enable IPFS (requires IPFS installed and present in PATH) 16 | - **IPFS Gateway** Use the IPFS HTTP Gateway (http://gateway.ipfs.io/) 17 | - **IPNS** Use the IPNS nameserver / set name 18 | - **Apply Rotation** Apply Current Scene Rotation to Objects 19 | - **Apply Scale** Apply Current Scene Scale to Objects 20 | - **Apply Position** Apply Current Scene Position to Objects 21 | - **Unpack Textures** Unpack all textures when exporting 22 | 23 | ### Run Settings 24 | 25 | - **Janus VR path** The path to the JanusVR application 26 | - **Display Mode** Select 2D, Rift, SBS, SBSR mode 27 | - **Rate** Server update rate 28 | - **JanusVR FullScreen** Starts JanusVR in fullscreen mode 29 | - **Window Size** Launch JanusVR with the 
specified window dimensions 30 | 31 | ### Objects 32 | 33 | __*These attributes are all set on a per object basis__ 34 | 35 | _**Mesh Objects/Common**_ 36 | 37 | - **Object Type** For the most part, should be "Object (model)". However, it can be used to allow making placeholder objects with meshes. 38 | - **js\_id** Specify js\_id for object here, blank will give a default numeric id 39 | 40 | _**Mesh Objects/Link**_ 41 | 42 | When making one of these, start off with a newly created Plane. 43 | 44 | Don't go into edit mode to resize it - just use the transform 45 | (the exporter won't pick up on mesh resizing, the plane itself is just a placeholder) 46 | 47 | With this, you should be able to semi-accurately place portals. 48 | 49 | To check the orientation: 50 | Local Y should be up, Z should be facing outwards. 51 | 52 | I'm not quite sure it works completely accurately, and if not what's responsible, but it's a start. 53 | 54 | - **Link Name** The name displayed on the portal. 55 | - **Link URL** Since (unlike the old "text as portal" system) link objects don't directly hold text, the URL is put here. 56 | - **Active** If false, ``active="false"`` is set. 
57 | 58 | _**Mesh Objects/Mesh**_ 59 | 60 | - **Export Format** Select Wavefront (.obj) or Collada (.dae) export format 61 | - **Collision** Enable collision for this object 62 | - **Locked** Lock this object 63 | - **Visible** Draw this item in the Janus room (setting to false with collision set to true is useful for proxy collision geometry) 64 | - **Set Color** Enable a Janus color value for this object 65 | - **Color** Select color value for this object 66 | - **Websurface** Texture the current object with a Janus Websurface 67 | - **URL** Set URL for websurface 68 | - **Width&Height** Set pixel dimensions for websurface 69 | - **Cull Face** Set desired face culling (back, front, none) 70 | - **GLSL Shader** Set a custom GLSL Shader for this object 71 | - **Frag Shader** Set path to Fragment Shader (use absolute paths) 72 | - **Vertex Shader** Set path to Vertex Shader (use absolute paths) 73 | 74 | _**Sound Objects (use speaker in Blender)**_ 75 | 76 | - **Sound** Set path to sound file (use absolute paths) 77 | - **js\_id** js\_id for sound object 78 | - **Distance** Distance at which sound plays at full volume 79 | - **XY1** X and Z positions for first corner of trigger rectangle 80 | - **XY2** X and Z positions for second corner of trigger rectangle 81 | - **Loop** loop sound 82 | - **Place once** play the sound only the first time triggered per user session 83 | 84 | _**Text Objects**_ 85 | 86 | Text objects don't have any properties as such, but there are the following things to note: 87 | 88 | 1. Text is created for single-line, Paragraph for multi-line. 89 | 2. It seems JanusVR ignores the lines anyway, so this is fine. 90 | 3. The old "beginning with http creates link" behavior still exists - not exactly sublime. 91 | 92 | ### Room 93 | 94 | - **Room** Sets the room model (see the [FireBox docs](http://www.dgp.toronto.edu/~mccrae/projects/firebox/notes.html) for further details. 
95 | - **Visible** If checked, makes the room visible 96 | - **Color** Sets the rooms color 97 | - **Select Skybox Images** Sets custom skybox images for this room 98 | - **Skybox Left** Set path to Left Skybox Image (use absolute paths) 99 | - **Skybox Right** Set path to Right Skybox Image (use absolute paths) 100 | - **Skybox Front** Set path to Front Skybox Image (use absolute paths) 101 | - **Skybox Back** Set path to Back Skybox Image (use absolute paths) 102 | - **Skybox Up** Set path to Up Skybox Image (use absolute paths) 103 | - **Skybox Down** Set path to Down Skybox Image (use absolute paths) 104 | - **Gravity** Sets the rooms gravity 105 | - **Walk Speed** Sets the players walk speed 106 | - **Run Speed** Sets the players run speed 107 | - **Jump Velocity** Sets the players jump velocity 108 | - **Clip Plane** Sets the near and far clip distances 109 | - **Teleport Range** Sets the min and max teleport distances 110 | - **Default Sounds** Use default sounds in room 111 | - **Show Cursor** Show Cursor in room 112 | - **Fog** Enable Fog effects 113 | - **Color** Set fog color 114 | - **Fog Mode** Set fog mode (exp, exp2, linear) 115 | - **Density** Set fog density (exp and exp2 modes) 116 | - **Start & End** Set fog start and end dist (linear mode) 117 | - **Asset Scripts** Enable JS scripts for room 118 | - **Script 1-4** Set path to Asset Scripts (use absolute paths, up to 4 scripts supported) 119 | - **Global GLSL Shader** Set a global GLSL shader for the room 120 | - **Frag Shader** Set path to Fragment Shader (use absolute paths) 121 | - **Vertex Shader** Set path to Vertex Shader (use absolute paths) 122 | - **Lock Room** Lock room from edits 123 | 124 | ### Multiplayer Server 125 | - **Default Server** Use the default server specified in Janus 126 | - **Server** URL to the server 127 | - **Port** Port of the server 128 | 129 | ### Debug 130 | - **JanusVR** enable debug mode 131 | 
-------------------------------------------------------------------------------- /docs/userpref.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Spyduck/FireVR/535fbbe47bef72c8302da7a1c6334760896e5ce4/docs/userpref.png -------------------------------------------------------------------------------- /html.py: -------------------------------------------------------------------------------- 1 | from io import StringIO 2 | from functools import cmp_to_key 3 | from collections import OrderedDict 4 | 5 | class Tag: 6 | def __init__(self, tag, attr=[], single=False): 7 | self.tag = tag 8 | self.attr = attr 9 | self.sub = [] 10 | self.single = single 11 | 12 | def write(self, w, nice=True, level=0, indent=" ", loop=0): 13 | if nice: 14 | w(indent*level) 15 | 16 | w("<%s" % self.tag) 17 | 18 | def cmpfunc(x,y): 19 | if x[0]=="id": 20 | return -1 21 | if y[0]=="id": 22 | return 1 23 | #TODO order strings here? 24 | return 0 25 | 26 | for k,v in sorted(self.attr, key=cmp_to_key(cmpfunc)): 27 | w(" %s=\"%s\"" % (k, str(v))) 28 | 29 | if len(self.sub)==0 and not self.single: 30 | w(" />") 31 | else: 32 | if self.tag=="Object": 33 | w(" ") 34 | w(">") 35 | # self.sub must not be indented, as Text objects are sensitive to this under some conditions (tried on JanusVR 54.1 under Wine 1.9.23) and will result in bells. 36 | # Maybe that's a bug in JanusVR, maybe that's a bug in Wine, maybe that's a bug here, in any case, this works around it. 
37 | for i,s in enumerate(self.sub): 38 | if isinstance(s, str): 39 | w(s) 40 | else: 41 | #if loop" % self.tag) 51 | if nice: 52 | w("\n") 53 | 54 | 55 | def __call__(self, tag): 56 | #print("Adding %s to %s" % (tag.tag, self.tag)) 57 | self.sub.append(tag) 58 | 59 | def __contains__(self, tag): 60 | return tag in self.sub 61 | 62 | def __repr__(self): 63 | s = StringIO() 64 | self.write(s.write) 65 | s.seek(0) 66 | return s.read() 67 | -------------------------------------------------------------------------------- /icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Spyduck/FireVR/535fbbe47bef72c8302da7a1c6334760896e5ce4/icon.png -------------------------------------------------------------------------------- /ipfs.py: -------------------------------------------------------------------------------- 1 | from subprocess import Popen, call, check_output, STDOUT, PIPE, CalledProcessError 2 | from tempfile import TemporaryFile, NamedTemporaryFile, tempdir 3 | import base64 4 | import os 5 | 6 | DEVNULL = open(os.devnull, "wb") 7 | 8 | CALL_OPTIONS = {"stderr" : DEVNULL} 9 | CALL_OPTIONS = {"stdout" : DEVNULL} 10 | 11 | def init(): 12 | call(["ipfs", "init"], stdout=DEVNULL, stderr=DEVNULL) 13 | 14 | def daemon(): 15 | try: 16 | Popen(["ipfs", "daemon"], stdout=DEVNULL, stderr=DEVNULL) 17 | except CalledProcessError: 18 | pass 19 | 20 | def start(): 21 | init() 22 | daemon() 23 | 24 | def add(path, pin=True): 25 | if type(path)==str: 26 | output = check_output(["ipfs", "add", path]).decode("ascii") 27 | else: 28 | try: 29 | output = check_output(["ipfs", "add"], stdin=path).decode("ascii") 30 | except CalledProcessError as err: 31 | output = err.output 32 | """ 33 | if pin: 34 | try: 35 | check_output(["ipfs", "pin", output.split(" ")[1]]) 36 | except CalledProcessError as err: 37 | pass 38 | """ 39 | return output.split(" ")[1] 40 | 41 | def addRecursive(path): 42 | out = check_output(["ipfs", 
"add", "-r", path]).decode("utf-8") 43 | return [line.split(" ")[1] for line in out.split("\n") if len(line)>1] 44 | 45 | def cat(path): 46 | check_output(["ipfs", "cat", path]) 47 | 48 | def get(path): 49 | try: 50 | output = check_output(["ipfs", "get", path], stderr=DEVNULL) 51 | except CalledProcessError as err: 52 | output = err.output 53 | return output.split(" ")[-1].strip() 54 | 55 | def publish(path): 56 | return check_output(["ipfs", "name", "publish", path]) 57 | 58 | def resolve(path): 59 | return check_output(["ipfs", "name", "resolve", path]) 60 | 61 | def ls(path): 62 | return check_output(["ipfs", "ls", path]) 63 | 64 | def refs(path): 65 | return check_output(["ipfs", "refs", path]) 66 | 67 | def save(bytes): 68 | with TemporaryFile("wb+") as f: 69 | f.seek(0) 70 | f.write(bytes) 71 | f.flush() 72 | f.seek(0) 73 | return add(f) 74 | 75 | def load(path): 76 | if os.path.isfile(path): 77 | with open(path, "rb+") as t: 78 | t.seek(0)#why is this necessary? 79 | d = t.read() 80 | 81 | else: 82 | #with NamedTemporaryFile("rb+") as f: 83 | # err = get(path, f) 84 | # print("ERR",err) 85 | # f.seek(0) 86 | # d = f.read() 87 | err = get(path)#tempdir 88 | with open(path,"rb+") as f:#~/.go-ipfs/datastore/"+ 89 | d = f.read() 90 | 91 | return d 92 | 93 | #start() 94 | -------------------------------------------------------------------------------- /modules/bs4/AUTHORS.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Spyduck/FireVR/535fbbe47bef72c8302da7a1c6334760896e5ce4/modules/bs4/AUTHORS.txt -------------------------------------------------------------------------------- /modules/bs4/LICENSE: -------------------------------------------------------------------------------- 1 | Beautiful Soup is made available under the MIT license: 2 | 3 | Copyright (c) 2004-2016 Leonard Richardson 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software 
and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | 25 | Beautiful Soup incorporates code from the html5lib library, which is 26 | also made available under the MIT license. Copyright (c) 2006-2013 27 | James Graham and other contributors 28 | -------------------------------------------------------------------------------- /modules/bs4/README.txt: -------------------------------------------------------------------------------- 1 | = Introduction = 2 | 3 | >>> from bs4 import BeautifulSoup 4 | >>> soup = BeautifulSoup("

SomebadHTML") 5 | >>> print soup.prettify() 6 | 7 | 8 |

9 | Some 10 | 11 | bad 12 | 13 | HTML 14 | 15 | 16 |

17 | 18 | 19 | >>> soup.find(text="bad") 20 | u'bad' 21 | 22 | >>> soup.i 23 | HTML 24 | 25 | >>> soup = BeautifulSoup("SomebadXML", "xml") 26 | >>> print soup.prettify() 27 | 28 | 29 | Some 30 | 31 | bad 32 | 33 | XML 34 | 35 | 36 | 37 | = Full documentation = 38 | 39 | The bs4/doc/ directory contains full documentation in Sphinx 40 | format. Run "make html" in that directory to create HTML 41 | documentation. 42 | 43 | = Running the unit tests = 44 | 45 | Beautiful Soup supports unit test discovery from the project root directory: 46 | 47 | $ nosetests 48 | 49 | $ python -m unittest discover -s bs4 # Python 2.7 and up 50 | 51 | If you checked out the source tree, you should see a script in the 52 | home directory called test-all-versions. This script will run the unit 53 | tests under Python 2.7, then create a temporary Python 3 conversion of 54 | the source and run the unit tests again under Python 3. 55 | 56 | = Links = 57 | 58 | Homepage: http://www.crummy.com/software/BeautifulSoup/bs4/ 59 | Documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/ 60 | http://readthedocs.org/docs/beautiful-soup-4/ 61 | Discussion group: http://groups.google.com/group/beautifulsoup/ 62 | Development: https://code.launchpad.net/beautifulsoup/ 63 | Bug tracker: https://bugs.launchpad.net/beautifulsoup/ 64 | -------------------------------------------------------------------------------- /modules/bs4/builder/__init__.py: -------------------------------------------------------------------------------- 1 | # Use of this source code is governed by a BSD-style license that can be 2 | # found in the LICENSE file. 
3 | 4 | from collections import defaultdict 5 | import itertools 6 | import sys 7 | from bs4.element import ( 8 | CharsetMetaAttributeValue, 9 | ContentMetaAttributeValue, 10 | HTMLAwareEntitySubstitution, 11 | whitespace_re 12 | ) 13 | 14 | __all__ = [ 15 | 'HTMLTreeBuilder', 16 | 'SAXTreeBuilder', 17 | 'TreeBuilder', 18 | 'TreeBuilderRegistry', 19 | ] 20 | 21 | # Some useful features for a TreeBuilder to have. 22 | FAST = 'fast' 23 | PERMISSIVE = 'permissive' 24 | STRICT = 'strict' 25 | XML = 'xml' 26 | HTML = 'html' 27 | HTML_5 = 'html5' 28 | 29 | 30 | class TreeBuilderRegistry(object): 31 | 32 | def __init__(self): 33 | self.builders_for_feature = defaultdict(list) 34 | self.builders = [] 35 | 36 | def register(self, treebuilder_class): 37 | """Register a treebuilder based on its advertised features.""" 38 | for feature in treebuilder_class.features: 39 | self.builders_for_feature[feature].insert(0, treebuilder_class) 40 | self.builders.insert(0, treebuilder_class) 41 | 42 | def lookup(self, *features): 43 | if len(self.builders) == 0: 44 | # There are no builders at all. 45 | return None 46 | 47 | if len(features) == 0: 48 | # They didn't ask for any features. Give them the most 49 | # recently registered builder. 50 | return self.builders[0] 51 | 52 | # Go down the list of features in order, and eliminate any builders 53 | # that don't match every feature. 54 | features = list(features) 55 | features.reverse() 56 | candidates = None 57 | candidate_set = None 58 | while len(features) > 0: 59 | feature = features.pop() 60 | we_have_the_feature = self.builders_for_feature.get(feature, []) 61 | if len(we_have_the_feature) > 0: 62 | if candidates is None: 63 | candidates = we_have_the_feature 64 | candidate_set = set(candidates) 65 | else: 66 | # Eliminate any candidates that don't have this feature. 67 | candidate_set = candidate_set.intersection( 68 | set(we_have_the_feature)) 69 | 70 | # The only valid candidates are the ones in candidate_set. 
71 | # Go through the original list of candidates and pick the first one 72 | # that's in candidate_set. 73 | if candidate_set is None: 74 | return None 75 | for candidate in candidates: 76 | if candidate in candidate_set: 77 | return candidate 78 | return None 79 | 80 | # The BeautifulSoup class will take feature lists from developers and use them 81 | # to look up builders in this registry. 82 | builder_registry = TreeBuilderRegistry() 83 | 84 | class TreeBuilder(object): 85 | """Turn a document into a Beautiful Soup object tree.""" 86 | 87 | NAME = "[Unknown tree builder]" 88 | ALTERNATE_NAMES = [] 89 | features = [] 90 | 91 | is_xml = False 92 | picklable = False 93 | preserve_whitespace_tags = set() 94 | empty_element_tags = None # A tag will be considered an empty-element 95 | # tag when and only when it has no contents. 96 | 97 | # A value for these tag/attribute combinations is a space- or 98 | # comma-separated list of CDATA, rather than a single CDATA. 99 | cdata_list_attributes = {} 100 | 101 | 102 | def __init__(self): 103 | self.soup = None 104 | 105 | def reset(self): 106 | pass 107 | 108 | def can_be_empty_element(self, tag_name): 109 | """Might a tag with this name be an empty-element tag? 110 | 111 | The final markup may or may not actually present this tag as 112 | self-closing. 113 | 114 | For instance: an HTMLBuilder does not consider a

tag to be 115 | an empty-element tag (it's not in 116 | HTMLBuilder.empty_element_tags). This means an empty

tag 117 | will be presented as "

", not "

". 118 | 119 | The default implementation has no opinion about which tags are 120 | empty-element tags, so a tag will be presented as an 121 | empty-element tag if and only if it has no contents. 122 | "" will become "", and "bar" will 123 | be left alone. 124 | """ 125 | if self.empty_element_tags is None: 126 | return True 127 | return tag_name in self.empty_element_tags 128 | 129 | def feed(self, markup): 130 | raise NotImplementedError() 131 | 132 | def prepare_markup(self, markup, user_specified_encoding=None, 133 | document_declared_encoding=None): 134 | return markup, None, None, False 135 | 136 | def test_fragment_to_document(self, fragment): 137 | """Wrap an HTML fragment to make it look like a document. 138 | 139 | Different parsers do this differently. For instance, lxml 140 | introduces an empty tag, and html5lib 141 | doesn't. Abstracting this away lets us write simple tests 142 | which run HTML fragments through the parser and compare the 143 | results against other HTML fragments. 144 | 145 | This method should not be used outside of tests. 146 | """ 147 | return fragment 148 | 149 | def set_up_substitutions(self, tag): 150 | return False 151 | 152 | def _replace_cdata_list_attribute_values(self, tag_name, attrs): 153 | """Replaces class="foo bar" with class=["foo", "bar"] 154 | 155 | Modifies its input in place. 156 | """ 157 | if not attrs: 158 | return attrs 159 | if self.cdata_list_attributes: 160 | universal = self.cdata_list_attributes.get('*', []) 161 | tag_specific = self.cdata_list_attributes.get( 162 | tag_name.lower(), None) 163 | for attr in list(attrs.keys()): 164 | if attr in universal or (tag_specific and attr in tag_specific): 165 | # We have a "class"-type attribute whose string 166 | # value is a whitespace-separated list of 167 | # values. Split it into a list. 
168 | value = attrs[attr] 169 | if isinstance(value, str): 170 | values = whitespace_re.split(value) 171 | else: 172 | # html5lib sometimes calls setAttributes twice 173 | # for the same tag when rearranging the parse 174 | # tree. On the second call the attribute value 175 | # here is already a list. If this happens, 176 | # leave the value alone rather than trying to 177 | # split it again. 178 | values = value 179 | attrs[attr] = values 180 | return attrs 181 | 182 | class SAXTreeBuilder(TreeBuilder): 183 | """A Beautiful Soup treebuilder that listens for SAX events.""" 184 | 185 | def feed(self, markup): 186 | raise NotImplementedError() 187 | 188 | def close(self): 189 | pass 190 | 191 | def startElement(self, name, attrs): 192 | attrs = dict((key[1], value) for key, value in list(attrs.items())) 193 | #print "Start %s, %r" % (name, attrs) 194 | self.soup.handle_starttag(name, attrs) 195 | 196 | def endElement(self, name): 197 | #print "End %s" % name 198 | self.soup.handle_endtag(name) 199 | 200 | def startElementNS(self, nsTuple, nodeName, attrs): 201 | # Throw away (ns, nodeName) for now. 202 | self.startElement(nodeName, attrs) 203 | 204 | def endElementNS(self, nsTuple, nodeName): 205 | # Throw away (ns, nodeName) for now. 206 | self.endElement(nodeName) 207 | #handler.endElementNS((ns, node.nodeName), node.nodeName) 208 | 209 | def startPrefixMapping(self, prefix, nodeValue): 210 | # Ignore the prefix for now. 211 | pass 212 | 213 | def endPrefixMapping(self, prefix): 214 | # Ignore the prefix for now. 215 | # handler.endPrefixMapping(prefix) 216 | pass 217 | 218 | def characters(self, content): 219 | self.soup.handle_data(content) 220 | 221 | def startDocument(self): 222 | pass 223 | 224 | def endDocument(self): 225 | pass 226 | 227 | 228 | class HTMLTreeBuilder(TreeBuilder): 229 | """This TreeBuilder knows facts about HTML. 230 | 231 | Such as which tags are empty-element tags. 
232 | """ 233 | 234 | preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags 235 | empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta', 236 | 'spacer', 'link', 'frame', 'base']) 237 | 238 | # The HTML standard defines these attributes as containing a 239 | # space-separated list of values, not a single value. That is, 240 | # class="foo bar" means that the 'class' attribute has two values, 241 | # 'foo' and 'bar', not the single value 'foo bar'. When we 242 | # encounter one of these attributes, we will parse its value into 243 | # a list of values if possible. Upon output, the list will be 244 | # converted back into a string. 245 | cdata_list_attributes = { 246 | "*" : ['class', 'accesskey', 'dropzone'], 247 | "a" : ['rel', 'rev'], 248 | "link" : ['rel', 'rev'], 249 | "td" : ["headers"], 250 | "th" : ["headers"], 251 | "td" : ["headers"], 252 | "form" : ["accept-charset"], 253 | "object" : ["archive"], 254 | 255 | # These are HTML5 specific, as are *.accesskey and *.dropzone above. 256 | "area" : ["rel"], 257 | "icon" : ["sizes"], 258 | "iframe" : ["sandbox"], 259 | "output" : ["for"], 260 | } 261 | 262 | def set_up_substitutions(self, tag): 263 | # We are only interested in tags 264 | if tag.name != 'meta': 265 | return False 266 | 267 | http_equiv = tag.get('http-equiv') 268 | content = tag.get('content') 269 | charset = tag.get('charset') 270 | 271 | # We are interested in tags that say what encoding the 272 | # document was originally in. This means HTML 5-style 273 | # tags that provide the "charset" attribute. It also means 274 | # HTML 4-style tags that provide the "content" 275 | # attribute and have "http-equiv" set to "content-type". 276 | # 277 | # In both cases we will replace the value of the appropriate 278 | # attribute with a standin object that can take on any 279 | # encoding. 
280 | meta_encoding = None 281 | if charset is not None: 282 | # HTML 5 style: 283 | # 284 | meta_encoding = charset 285 | tag['charset'] = CharsetMetaAttributeValue(charset) 286 | 287 | elif (content is not None and http_equiv is not None 288 | and http_equiv.lower() == 'content-type'): 289 | # HTML 4 style: 290 | # 291 | tag['content'] = ContentMetaAttributeValue(content) 292 | 293 | return (meta_encoding is not None) 294 | 295 | def register_treebuilders_from(module): 296 | """Copy TreeBuilders from the given module into this module.""" 297 | # I'm fairly sure this is not the best way to do this. 298 | this_module = sys.modules['bs4.builder'] 299 | for name in module.__all__: 300 | obj = getattr(module, name) 301 | 302 | if issubclass(obj, TreeBuilder): 303 | setattr(this_module, name, obj) 304 | this_module.__all__.append(name) 305 | # Register the builder while we're at it. 306 | this_module.builder_registry.register(obj) 307 | 308 | class ParserRejectedMarkup(Exception): 309 | pass 310 | 311 | # Builders are registered in reverse order of priority, so that custom 312 | # builder registrations will take precedence. In general, we want lxml 313 | # to take precedence over html5lib, because it's faster. And we only 314 | # want to use HTMLParser as a last result. 315 | from . import _htmlparser 316 | register_treebuilders_from(_htmlparser) 317 | try: 318 | from . import _html5lib 319 | register_treebuilders_from(_html5lib) 320 | except ImportError: 321 | # They don't have html5lib installed. 322 | pass 323 | try: 324 | from . import _lxml 325 | register_treebuilders_from(_lxml) 326 | except ImportError: 327 | # They don't have lxml installed. 
328 | pass 329 | -------------------------------------------------------------------------------- /modules/bs4/builder/__init__.py.bak: -------------------------------------------------------------------------------- 1 | # Use of this source code is governed by a BSD-style license that can be 2 | # found in the LICENSE file. 3 | 4 | from collections import defaultdict 5 | import itertools 6 | import sys 7 | from bs4.element import ( 8 | CharsetMetaAttributeValue, 9 | ContentMetaAttributeValue, 10 | HTMLAwareEntitySubstitution, 11 | whitespace_re 12 | ) 13 | 14 | __all__ = [ 15 | 'HTMLTreeBuilder', 16 | 'SAXTreeBuilder', 17 | 'TreeBuilder', 18 | 'TreeBuilderRegistry', 19 | ] 20 | 21 | # Some useful features for a TreeBuilder to have. 22 | FAST = 'fast' 23 | PERMISSIVE = 'permissive' 24 | STRICT = 'strict' 25 | XML = 'xml' 26 | HTML = 'html' 27 | HTML_5 = 'html5' 28 | 29 | 30 | class TreeBuilderRegistry(object): 31 | 32 | def __init__(self): 33 | self.builders_for_feature = defaultdict(list) 34 | self.builders = [] 35 | 36 | def register(self, treebuilder_class): 37 | """Register a treebuilder based on its advertised features.""" 38 | for feature in treebuilder_class.features: 39 | self.builders_for_feature[feature].insert(0, treebuilder_class) 40 | self.builders.insert(0, treebuilder_class) 41 | 42 | def lookup(self, *features): 43 | if len(self.builders) == 0: 44 | # There are no builders at all. 45 | return None 46 | 47 | if len(features) == 0: 48 | # They didn't ask for any features. Give them the most 49 | # recently registered builder. 50 | return self.builders[0] 51 | 52 | # Go down the list of features in order, and eliminate any builders 53 | # that don't match every feature. 
54 | features = list(features) 55 | features.reverse() 56 | candidates = None 57 | candidate_set = None 58 | while len(features) > 0: 59 | feature = features.pop() 60 | we_have_the_feature = self.builders_for_feature.get(feature, []) 61 | if len(we_have_the_feature) > 0: 62 | if candidates is None: 63 | candidates = we_have_the_feature 64 | candidate_set = set(candidates) 65 | else: 66 | # Eliminate any candidates that don't have this feature. 67 | candidate_set = candidate_set.intersection( 68 | set(we_have_the_feature)) 69 | 70 | # The only valid candidates are the ones in candidate_set. 71 | # Go through the original list of candidates and pick the first one 72 | # that's in candidate_set. 73 | if candidate_set is None: 74 | return None 75 | for candidate in candidates: 76 | if candidate in candidate_set: 77 | return candidate 78 | return None 79 | 80 | # The BeautifulSoup class will take feature lists from developers and use them 81 | # to look up builders in this registry. 82 | builder_registry = TreeBuilderRegistry() 83 | 84 | class TreeBuilder(object): 85 | """Turn a document into a Beautiful Soup object tree.""" 86 | 87 | NAME = "[Unknown tree builder]" 88 | ALTERNATE_NAMES = [] 89 | features = [] 90 | 91 | is_xml = False 92 | picklable = False 93 | preserve_whitespace_tags = set() 94 | empty_element_tags = None # A tag will be considered an empty-element 95 | # tag when and only when it has no contents. 96 | 97 | # A value for these tag/attribute combinations is a space- or 98 | # comma-separated list of CDATA, rather than a single CDATA. 99 | cdata_list_attributes = {} 100 | 101 | 102 | def __init__(self): 103 | self.soup = None 104 | 105 | def reset(self): 106 | pass 107 | 108 | def can_be_empty_element(self, tag_name): 109 | """Might a tag with this name be an empty-element tag? 110 | 111 | The final markup may or may not actually present this tag as 112 | self-closing. 113 | 114 | For instance: an HTMLBuilder does not consider a

tag to be 115 | an empty-element tag (it's not in 116 | HTMLBuilder.empty_element_tags). This means an empty

tag 117 | will be presented as "

", not "

". 118 | 119 | The default implementation has no opinion about which tags are 120 | empty-element tags, so a tag will be presented as an 121 | empty-element tag if and only if it has no contents. 122 | "" will become "", and "bar" will 123 | be left alone. 124 | """ 125 | if self.empty_element_tags is None: 126 | return True 127 | return tag_name in self.empty_element_tags 128 | 129 | def feed(self, markup): 130 | raise NotImplementedError() 131 | 132 | def prepare_markup(self, markup, user_specified_encoding=None, 133 | document_declared_encoding=None): 134 | return markup, None, None, False 135 | 136 | def test_fragment_to_document(self, fragment): 137 | """Wrap an HTML fragment to make it look like a document. 138 | 139 | Different parsers do this differently. For instance, lxml 140 | introduces an empty tag, and html5lib 141 | doesn't. Abstracting this away lets us write simple tests 142 | which run HTML fragments through the parser and compare the 143 | results against other HTML fragments. 144 | 145 | This method should not be used outside of tests. 146 | """ 147 | return fragment 148 | 149 | def set_up_substitutions(self, tag): 150 | return False 151 | 152 | def _replace_cdata_list_attribute_values(self, tag_name, attrs): 153 | """Replaces class="foo bar" with class=["foo", "bar"] 154 | 155 | Modifies its input in place. 156 | """ 157 | if not attrs: 158 | return attrs 159 | if self.cdata_list_attributes: 160 | universal = self.cdata_list_attributes.get('*', []) 161 | tag_specific = self.cdata_list_attributes.get( 162 | tag_name.lower(), None) 163 | for attr in attrs.keys(): 164 | if attr in universal or (tag_specific and attr in tag_specific): 165 | # We have a "class"-type attribute whose string 166 | # value is a whitespace-separated list of 167 | # values. Split it into a list. 
168 | value = attrs[attr] 169 | if isinstance(value, basestring): 170 | values = whitespace_re.split(value) 171 | else: 172 | # html5lib sometimes calls setAttributes twice 173 | # for the same tag when rearranging the parse 174 | # tree. On the second call the attribute value 175 | # here is already a list. If this happens, 176 | # leave the value alone rather than trying to 177 | # split it again. 178 | values = value 179 | attrs[attr] = values 180 | return attrs 181 | 182 | class SAXTreeBuilder(TreeBuilder): 183 | """A Beautiful Soup treebuilder that listens for SAX events.""" 184 | 185 | def feed(self, markup): 186 | raise NotImplementedError() 187 | 188 | def close(self): 189 | pass 190 | 191 | def startElement(self, name, attrs): 192 | attrs = dict((key[1], value) for key, value in list(attrs.items())) 193 | #print "Start %s, %r" % (name, attrs) 194 | self.soup.handle_starttag(name, attrs) 195 | 196 | def endElement(self, name): 197 | #print "End %s" % name 198 | self.soup.handle_endtag(name) 199 | 200 | def startElementNS(self, nsTuple, nodeName, attrs): 201 | # Throw away (ns, nodeName) for now. 202 | self.startElement(nodeName, attrs) 203 | 204 | def endElementNS(self, nsTuple, nodeName): 205 | # Throw away (ns, nodeName) for now. 206 | self.endElement(nodeName) 207 | #handler.endElementNS((ns, node.nodeName), node.nodeName) 208 | 209 | def startPrefixMapping(self, prefix, nodeValue): 210 | # Ignore the prefix for now. 211 | pass 212 | 213 | def endPrefixMapping(self, prefix): 214 | # Ignore the prefix for now. 215 | # handler.endPrefixMapping(prefix) 216 | pass 217 | 218 | def characters(self, content): 219 | self.soup.handle_data(content) 220 | 221 | def startDocument(self): 222 | pass 223 | 224 | def endDocument(self): 225 | pass 226 | 227 | 228 | class HTMLTreeBuilder(TreeBuilder): 229 | """This TreeBuilder knows facts about HTML. 230 | 231 | Such as which tags are empty-element tags. 
232 | """ 233 | 234 | preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags 235 | empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta', 236 | 'spacer', 'link', 'frame', 'base']) 237 | 238 | # The HTML standard defines these attributes as containing a 239 | # space-separated list of values, not a single value. That is, 240 | # class="foo bar" means that the 'class' attribute has two values, 241 | # 'foo' and 'bar', not the single value 'foo bar'. When we 242 | # encounter one of these attributes, we will parse its value into 243 | # a list of values if possible. Upon output, the list will be 244 | # converted back into a string. 245 | cdata_list_attributes = { 246 | "*" : ['class', 'accesskey', 'dropzone'], 247 | "a" : ['rel', 'rev'], 248 | "link" : ['rel', 'rev'], 249 | "td" : ["headers"], 250 | "th" : ["headers"], 251 | "td" : ["headers"], 252 | "form" : ["accept-charset"], 253 | "object" : ["archive"], 254 | 255 | # These are HTML5 specific, as are *.accesskey and *.dropzone above. 256 | "area" : ["rel"], 257 | "icon" : ["sizes"], 258 | "iframe" : ["sandbox"], 259 | "output" : ["for"], 260 | } 261 | 262 | def set_up_substitutions(self, tag): 263 | # We are only interested in tags 264 | if tag.name != 'meta': 265 | return False 266 | 267 | http_equiv = tag.get('http-equiv') 268 | content = tag.get('content') 269 | charset = tag.get('charset') 270 | 271 | # We are interested in tags that say what encoding the 272 | # document was originally in. This means HTML 5-style 273 | # tags that provide the "charset" attribute. It also means 274 | # HTML 4-style tags that provide the "content" 275 | # attribute and have "http-equiv" set to "content-type". 276 | # 277 | # In both cases we will replace the value of the appropriate 278 | # attribute with a standin object that can take on any 279 | # encoding. 
280 | meta_encoding = None 281 | if charset is not None: 282 | # HTML 5 style: 283 | # 284 | meta_encoding = charset 285 | tag['charset'] = CharsetMetaAttributeValue(charset) 286 | 287 | elif (content is not None and http_equiv is not None 288 | and http_equiv.lower() == 'content-type'): 289 | # HTML 4 style: 290 | # 291 | tag['content'] = ContentMetaAttributeValue(content) 292 | 293 | return (meta_encoding is not None) 294 | 295 | def register_treebuilders_from(module): 296 | """Copy TreeBuilders from the given module into this module.""" 297 | # I'm fairly sure this is not the best way to do this. 298 | this_module = sys.modules['bs4.builder'] 299 | for name in module.__all__: 300 | obj = getattr(module, name) 301 | 302 | if issubclass(obj, TreeBuilder): 303 | setattr(this_module, name, obj) 304 | this_module.__all__.append(name) 305 | # Register the builder while we're at it. 306 | this_module.builder_registry.register(obj) 307 | 308 | class ParserRejectedMarkup(Exception): 309 | pass 310 | 311 | # Builders are registered in reverse order of priority, so that custom 312 | # builder registrations will take precedence. In general, we want lxml 313 | # to take precedence over html5lib, because it's faster. And we only 314 | # want to use HTMLParser as a last result. 315 | from . import _htmlparser 316 | register_treebuilders_from(_htmlparser) 317 | try: 318 | from . import _html5lib 319 | register_treebuilders_from(_html5lib) 320 | except ImportError: 321 | # They don't have html5lib installed. 322 | pass 323 | try: 324 | from . import _lxml 325 | register_treebuilders_from(_lxml) 326 | except ImportError: 327 | # They don't have lxml installed. 328 | pass 329 | -------------------------------------------------------------------------------- /modules/bs4/builder/_html5lib.py: -------------------------------------------------------------------------------- 1 | # Use of this source code is governed by a BSD-style license that can be 2 | # found in the LICENSE file. 
3 | 4 | __all__ = [ 5 | 'HTML5TreeBuilder', 6 | ] 7 | 8 | import warnings 9 | import re 10 | from bs4.builder import ( 11 | PERMISSIVE, 12 | HTML, 13 | HTML_5, 14 | HTMLTreeBuilder, 15 | ) 16 | from bs4.element import ( 17 | NamespacedAttribute, 18 | whitespace_re, 19 | ) 20 | import html5lib 21 | from html5lib.constants import ( 22 | namespaces, 23 | prefixes, 24 | ) 25 | from bs4.element import ( 26 | Comment, 27 | Doctype, 28 | NavigableString, 29 | Tag, 30 | ) 31 | 32 | try: 33 | # Pre-0.99999999 34 | from html5lib.treebuilders import _base as treebuilder_base 35 | new_html5lib = False 36 | except ImportError as e: 37 | # 0.99999999 and up 38 | from html5lib.treebuilders import base as treebuilder_base 39 | new_html5lib = True 40 | 41 | class HTML5TreeBuilder(HTMLTreeBuilder): 42 | """Use html5lib to build a tree.""" 43 | 44 | NAME = "html5lib" 45 | 46 | features = [NAME, PERMISSIVE, HTML_5, HTML] 47 | 48 | def prepare_markup(self, markup, user_specified_encoding, 49 | document_declared_encoding=None, exclude_encodings=None): 50 | # Store the user-specified encoding for use later on. 51 | self.user_specified_encoding = user_specified_encoding 52 | 53 | # document_declared_encoding and exclude_encodings aren't used 54 | # ATM because the html5lib TreeBuilder doesn't use 55 | # UnicodeDammit. 56 | if exclude_encodings: 57 | warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.") 58 | yield (markup, None, None, False) 59 | 60 | # These methods are defined by Beautiful Soup. 61 | def feed(self, markup): 62 | if self.soup.parse_only is not None: 63 | warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. 
The entire document will be parsed.") 64 | parser = html5lib.HTMLParser(tree=self.create_treebuilder) 65 | 66 | extra_kwargs = dict() 67 | if not isinstance(markup, str): 68 | if new_html5lib: 69 | extra_kwargs['override_encoding'] = self.user_specified_encoding 70 | else: 71 | extra_kwargs['encoding'] = self.user_specified_encoding 72 | doc = parser.parse(markup, **extra_kwargs) 73 | 74 | # Set the character encoding detected by the tokenizer. 75 | if isinstance(markup, str): 76 | # We need to special-case this because html5lib sets 77 | # charEncoding to UTF-8 if it gets Unicode input. 78 | doc.original_encoding = None 79 | else: 80 | original_encoding = parser.tokenizer.stream.charEncoding[0] 81 | if not isinstance(original_encoding, str): 82 | # In 0.99999999 and up, the encoding is an html5lib 83 | # Encoding object. We want to use a string for compatibility 84 | # with other tree builders. 85 | original_encoding = original_encoding.name 86 | doc.original_encoding = original_encoding 87 | 88 | def create_treebuilder(self, namespaceHTMLElements): 89 | self.underlying_builder = TreeBuilderForHtml5lib( 90 | namespaceHTMLElements, self.soup) 91 | return self.underlying_builder 92 | 93 | def test_fragment_to_document(self, fragment): 94 | """See `TreeBuilder`.""" 95 | return '%s' % fragment 96 | 97 | 98 | class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder): 99 | 100 | def __init__(self, namespaceHTMLElements, soup=None): 101 | if soup: 102 | self.soup = soup 103 | else: 104 | from bs4 import BeautifulSoup 105 | self.soup = BeautifulSoup("", "html.parser") 106 | super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements) 107 | 108 | def documentClass(self): 109 | self.soup.reset() 110 | return Element(self.soup, self.soup, None) 111 | 112 | def insertDoctype(self, token): 113 | name = token["name"] 114 | publicId = token["publicId"] 115 | systemId = token["systemId"] 116 | 117 | doctype = Doctype.for_name_and_ids(name, publicId, systemId) 118 | 
self.soup.object_was_parsed(doctype) 119 | 120 | def elementClass(self, name, namespace): 121 | tag = self.soup.new_tag(name, namespace) 122 | return Element(tag, self.soup, namespace) 123 | 124 | def commentClass(self, data): 125 | return TextNode(Comment(data), self.soup) 126 | 127 | def fragmentClass(self): 128 | from bs4 import BeautifulSoup 129 | self.soup = BeautifulSoup("", "html.parser") 130 | self.soup.name = "[document_fragment]" 131 | return Element(self.soup, self.soup, None) 132 | 133 | def appendChild(self, node): 134 | # XXX This code is not covered by the BS4 tests. 135 | self.soup.append(node.element) 136 | 137 | def getDocument(self): 138 | return self.soup 139 | 140 | def getFragment(self): 141 | return treebuilder_base.TreeBuilder.getFragment(self).element 142 | 143 | def testSerializer(self, element): 144 | from bs4 import BeautifulSoup 145 | rv = [] 146 | doctype_re = re.compile(r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$') 147 | 148 | def serializeElement(element, indent=0): 149 | if isinstance(element, BeautifulSoup): 150 | pass 151 | if isinstance(element, Doctype): 152 | m = doctype_re.match(element) 153 | if m: 154 | name = m.group(1) 155 | if m.lastindex > 1: 156 | publicId = m.group(2) or "" 157 | systemId = m.group(3) or m.group(4) or "" 158 | rv.append("""|%s""" % 159 | (' ' * indent, name, publicId, systemId)) 160 | else: 161 | rv.append("|%s" % (' ' * indent, name)) 162 | else: 163 | rv.append("|%s" % (' ' * indent,)) 164 | elif isinstance(element, Comment): 165 | rv.append("|%s" % (' ' * indent, element)) 166 | elif isinstance(element, NavigableString): 167 | rv.append("|%s\"%s\"" % (' ' * indent, element)) 168 | else: 169 | if element.namespace: 170 | name = "%s %s" % (prefixes[element.namespace], 171 | element.name) 172 | else: 173 | name = element.name 174 | rv.append("|%s<%s>" % (' ' * indent, name)) 175 | if element.attrs: 176 | attributes = [] 177 | for name, value in list(element.attrs.items()): 178 | if 
isinstance(name, NamespacedAttribute): 179 | name = "%s %s" % (prefixes[name.namespace], name.name) 180 | if isinstance(value, list): 181 | value = " ".join(value) 182 | attributes.append((name, value)) 183 | 184 | for name, value in sorted(attributes): 185 | rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value)) 186 | indent += 2 187 | for child in element.children: 188 | serializeElement(child, indent) 189 | serializeElement(element, 0) 190 | 191 | return "\n".join(rv) 192 | 193 | class AttrList(object): 194 | def __init__(self, element): 195 | self.element = element 196 | self.attrs = dict(self.element.attrs) 197 | def __iter__(self): 198 | return list(self.attrs.items()).__iter__() 199 | def __setitem__(self, name, value): 200 | # If this attribute is a multi-valued attribute for this element, 201 | # turn its value into a list. 202 | list_attr = HTML5TreeBuilder.cdata_list_attributes 203 | if (name in list_attr['*'] 204 | or (self.element.name in list_attr 205 | and name in list_attr[self.element.name])): 206 | # A node that is being cloned may have already undergone 207 | # this procedure. 
208 | if not isinstance(value, list): 209 | value = whitespace_re.split(value) 210 | self.element[name] = value 211 | def items(self): 212 | return list(self.attrs.items()) 213 | def keys(self): 214 | return list(self.attrs.keys()) 215 | def __len__(self): 216 | return len(self.attrs) 217 | def __getitem__(self, name): 218 | return self.attrs[name] 219 | def __contains__(self, name): 220 | return name in list(self.attrs.keys()) 221 | 222 | 223 | class Element(treebuilder_base.Node): 224 | def __init__(self, element, soup, namespace): 225 | treebuilder_base.Node.__init__(self, element.name) 226 | self.element = element 227 | self.soup = soup 228 | self.namespace = namespace 229 | 230 | def appendChild(self, node): 231 | string_child = child = None 232 | if isinstance(node, str): 233 | # Some other piece of code decided to pass in a string 234 | # instead of creating a TextElement object to contain the 235 | # string. 236 | string_child = child = node 237 | elif isinstance(node, Tag): 238 | # Some other piece of code decided to pass in a Tag 239 | # instead of creating an Element object to contain the 240 | # Tag. 241 | child = node 242 | elif node.element.__class__ == NavigableString: 243 | string_child = child = node.element 244 | node.parent = self 245 | else: 246 | child = node.element 247 | node.parent = self 248 | 249 | if not isinstance(child, str) and child.parent is not None: 250 | node.element.extract() 251 | 252 | if (string_child and self.element.contents 253 | and self.element.contents[-1].__class__ == NavigableString): 254 | # We are appending a string onto another string. 255 | # TODO This has O(n^2) performance, for input like 256 | # "aaa..." 
257 | old_element = self.element.contents[-1] 258 | new_element = self.soup.new_string(old_element + string_child) 259 | old_element.replace_with(new_element) 260 | self.soup._most_recent_element = new_element 261 | else: 262 | if isinstance(node, str): 263 | # Create a brand new NavigableString from this string. 264 | child = self.soup.new_string(node) 265 | 266 | # Tell Beautiful Soup to act as if it parsed this element 267 | # immediately after the parent's last descendant. (Or 268 | # immediately after the parent, if it has no children.) 269 | if self.element.contents: 270 | most_recent_element = self.element._last_descendant(False) 271 | elif self.element.next_element is not None: 272 | # Something from further ahead in the parse tree is 273 | # being inserted into this earlier element. This is 274 | # very annoying because it means an expensive search 275 | # for the last element in the tree. 276 | most_recent_element = self.soup._last_descendant() 277 | else: 278 | most_recent_element = self.element 279 | 280 | self.soup.object_was_parsed( 281 | child, parent=self.element, 282 | most_recent_element=most_recent_element) 283 | 284 | def getAttributes(self): 285 | if isinstance(self.element, Comment): 286 | return {} 287 | return AttrList(self.element) 288 | 289 | def setAttributes(self, attributes): 290 | 291 | if attributes is not None and len(attributes) > 0: 292 | 293 | converted_attributes = [] 294 | for name, value in list(attributes.items()): 295 | if isinstance(name, tuple): 296 | new_name = NamespacedAttribute(*name) 297 | del attributes[name] 298 | attributes[new_name] = value 299 | 300 | self.soup.builder._replace_cdata_list_attribute_values( 301 | self.name, attributes) 302 | for name, value in list(attributes.items()): 303 | self.element[name] = value 304 | 305 | # The attributes may contain variables that need substitution. 306 | # Call set_up_substitutions manually. 
307 | # 308 | # The Tag constructor called this method when the Tag was created, 309 | # but we just set/changed the attributes, so call it again. 310 | self.soup.builder.set_up_substitutions(self.element) 311 | attributes = property(getAttributes, setAttributes) 312 | 313 | def insertText(self, data, insertBefore=None): 314 | text = TextNode(self.soup.new_string(data), self.soup) 315 | if insertBefore: 316 | self.insertBefore(text, insertBefore) 317 | else: 318 | self.appendChild(text) 319 | 320 | def insertBefore(self, node, refNode): 321 | index = self.element.index(refNode.element) 322 | if (node.element.__class__ == NavigableString and self.element.contents 323 | and self.element.contents[index-1].__class__ == NavigableString): 324 | # (See comments in appendChild) 325 | old_node = self.element.contents[index-1] 326 | new_str = self.soup.new_string(old_node + node.element) 327 | old_node.replace_with(new_str) 328 | else: 329 | self.element.insert(index, node.element) 330 | node.parent = self 331 | 332 | def removeChild(self, node): 333 | node.element.extract() 334 | 335 | def reparentChildren(self, new_parent): 336 | """Move all of this tag's children into another tag.""" 337 | # print "MOVE", self.element.contents 338 | # print "FROM", self.element 339 | # print "TO", new_parent.element 340 | 341 | element = self.element 342 | new_parent_element = new_parent.element 343 | # Determine what this tag's next_element will be once all the children 344 | # are removed. 345 | final_next_element = element.next_sibling 346 | 347 | new_parents_last_descendant = new_parent_element._last_descendant(False, False) 348 | if len(new_parent_element.contents) > 0: 349 | # The new parent already contains children. We will be 350 | # appending this tag's children to the end. 
351 | new_parents_last_child = new_parent_element.contents[-1] 352 | new_parents_last_descendant_next_element = new_parents_last_descendant.next_element 353 | else: 354 | # The new parent contains no children. 355 | new_parents_last_child = None 356 | new_parents_last_descendant_next_element = new_parent_element.next_element 357 | 358 | to_append = element.contents 359 | if len(to_append) > 0: 360 | # Set the first child's previous_element and previous_sibling 361 | # to elements within the new parent 362 | first_child = to_append[0] 363 | if new_parents_last_descendant: 364 | first_child.previous_element = new_parents_last_descendant 365 | else: 366 | first_child.previous_element = new_parent_element 367 | first_child.previous_sibling = new_parents_last_child 368 | if new_parents_last_descendant: 369 | new_parents_last_descendant.next_element = first_child 370 | else: 371 | new_parent_element.next_element = first_child 372 | if new_parents_last_child: 373 | new_parents_last_child.next_sibling = first_child 374 | 375 | # Find the very last element being moved. It is now the 376 | # parent's last descendant. It has no .next_sibling and 377 | # its .next_element is whatever the previous last 378 | # descendant had. 379 | last_childs_last_descendant = to_append[-1]._last_descendant(False, True) 380 | 381 | last_childs_last_descendant.next_element = new_parents_last_descendant_next_element 382 | if new_parents_last_descendant_next_element: 383 | # TODO: This code has no test coverage and I'm not sure 384 | # how to get html5lib to go through this path, but it's 385 | # just the other side of the previous line. 386 | new_parents_last_descendant_next_element.previous_element = last_childs_last_descendant 387 | last_childs_last_descendant.next_sibling = None 388 | 389 | for child in to_append: 390 | child.parent = new_parent_element 391 | new_parent_element.contents.append(child) 392 | 393 | # Now that this element has no children, change its .next_element. 
394 | element.contents = [] 395 | element.next_element = final_next_element 396 | 397 | # print "DONE WITH MOVE" 398 | # print "FROM", self.element 399 | # print "TO", new_parent_element 400 | 401 | def cloneNode(self): 402 | tag = self.soup.new_tag(self.element.name, self.namespace) 403 | node = Element(tag, self.soup, self.namespace) 404 | for key,value in self.attributes: 405 | node.attributes[key] = value 406 | return node 407 | 408 | def hasContent(self): 409 | return self.element.contents 410 | 411 | def getNameTuple(self): 412 | if self.namespace == None: 413 | return namespaces["html"], self.name 414 | else: 415 | return self.namespace, self.name 416 | 417 | nameTuple = property(getNameTuple) 418 | 419 | class TextNode(Element): 420 | def __init__(self, element, soup): 421 | treebuilder_base.Node.__init__(self, None) 422 | self.element = element 423 | self.soup = soup 424 | 425 | def cloneNode(self): 426 | raise NotImplementedError 427 | -------------------------------------------------------------------------------- /modules/bs4/builder/_html5lib.py.bak: -------------------------------------------------------------------------------- 1 | # Use of this source code is governed by a BSD-style license that can be 2 | # found in the LICENSE file. 
3 | 4 | __all__ = [ 5 | 'HTML5TreeBuilder', 6 | ] 7 | 8 | import warnings 9 | import re 10 | from bs4.builder import ( 11 | PERMISSIVE, 12 | HTML, 13 | HTML_5, 14 | HTMLTreeBuilder, 15 | ) 16 | from bs4.element import ( 17 | NamespacedAttribute, 18 | whitespace_re, 19 | ) 20 | import html5lib 21 | from html5lib.constants import ( 22 | namespaces, 23 | prefixes, 24 | ) 25 | from bs4.element import ( 26 | Comment, 27 | Doctype, 28 | NavigableString, 29 | Tag, 30 | ) 31 | 32 | try: 33 | # Pre-0.99999999 34 | from html5lib.treebuilders import _base as treebuilder_base 35 | new_html5lib = False 36 | except ImportError, e: 37 | # 0.99999999 and up 38 | from html5lib.treebuilders import base as treebuilder_base 39 | new_html5lib = True 40 | 41 | class HTML5TreeBuilder(HTMLTreeBuilder): 42 | """Use html5lib to build a tree.""" 43 | 44 | NAME = "html5lib" 45 | 46 | features = [NAME, PERMISSIVE, HTML_5, HTML] 47 | 48 | def prepare_markup(self, markup, user_specified_encoding, 49 | document_declared_encoding=None, exclude_encodings=None): 50 | # Store the user-specified encoding for use later on. 51 | self.user_specified_encoding = user_specified_encoding 52 | 53 | # document_declared_encoding and exclude_encodings aren't used 54 | # ATM because the html5lib TreeBuilder doesn't use 55 | # UnicodeDammit. 56 | if exclude_encodings: 57 | warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.") 58 | yield (markup, None, None, False) 59 | 60 | # These methods are defined by Beautiful Soup. 61 | def feed(self, markup): 62 | if self.soup.parse_only is not None: 63 | warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. 
The entire document will be parsed.") 64 | parser = html5lib.HTMLParser(tree=self.create_treebuilder) 65 | 66 | extra_kwargs = dict() 67 | if not isinstance(markup, unicode): 68 | if new_html5lib: 69 | extra_kwargs['override_encoding'] = self.user_specified_encoding 70 | else: 71 | extra_kwargs['encoding'] = self.user_specified_encoding 72 | doc = parser.parse(markup, **extra_kwargs) 73 | 74 | # Set the character encoding detected by the tokenizer. 75 | if isinstance(markup, unicode): 76 | # We need to special-case this because html5lib sets 77 | # charEncoding to UTF-8 if it gets Unicode input. 78 | doc.original_encoding = None 79 | else: 80 | original_encoding = parser.tokenizer.stream.charEncoding[0] 81 | if not isinstance(original_encoding, basestring): 82 | # In 0.99999999 and up, the encoding is an html5lib 83 | # Encoding object. We want to use a string for compatibility 84 | # with other tree builders. 85 | original_encoding = original_encoding.name 86 | doc.original_encoding = original_encoding 87 | 88 | def create_treebuilder(self, namespaceHTMLElements): 89 | self.underlying_builder = TreeBuilderForHtml5lib( 90 | namespaceHTMLElements, self.soup) 91 | return self.underlying_builder 92 | 93 | def test_fragment_to_document(self, fragment): 94 | """See `TreeBuilder`.""" 95 | return u'%s' % fragment 96 | 97 | 98 | class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder): 99 | 100 | def __init__(self, namespaceHTMLElements, soup=None): 101 | if soup: 102 | self.soup = soup 103 | else: 104 | from bs4 import BeautifulSoup 105 | self.soup = BeautifulSoup("", "html.parser") 106 | super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements) 107 | 108 | def documentClass(self): 109 | self.soup.reset() 110 | return Element(self.soup, self.soup, None) 111 | 112 | def insertDoctype(self, token): 113 | name = token["name"] 114 | publicId = token["publicId"] 115 | systemId = token["systemId"] 116 | 117 | doctype = Doctype.for_name_and_ids(name, publicId, 
systemId) 118 | self.soup.object_was_parsed(doctype) 119 | 120 | def elementClass(self, name, namespace): 121 | tag = self.soup.new_tag(name, namespace) 122 | return Element(tag, self.soup, namespace) 123 | 124 | def commentClass(self, data): 125 | return TextNode(Comment(data), self.soup) 126 | 127 | def fragmentClass(self): 128 | from bs4 import BeautifulSoup 129 | self.soup = BeautifulSoup("", "html.parser") 130 | self.soup.name = "[document_fragment]" 131 | return Element(self.soup, self.soup, None) 132 | 133 | def appendChild(self, node): 134 | # XXX This code is not covered by the BS4 tests. 135 | self.soup.append(node.element) 136 | 137 | def getDocument(self): 138 | return self.soup 139 | 140 | def getFragment(self): 141 | return treebuilder_base.TreeBuilder.getFragment(self).element 142 | 143 | def testSerializer(self, element): 144 | from bs4 import BeautifulSoup 145 | rv = [] 146 | doctype_re = re.compile(r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$') 147 | 148 | def serializeElement(element, indent=0): 149 | if isinstance(element, BeautifulSoup): 150 | pass 151 | if isinstance(element, Doctype): 152 | m = doctype_re.match(element) 153 | if m: 154 | name = m.group(1) 155 | if m.lastindex > 1: 156 | publicId = m.group(2) or "" 157 | systemId = m.group(3) or m.group(4) or "" 158 | rv.append("""|%s""" % 159 | (' ' * indent, name, publicId, systemId)) 160 | else: 161 | rv.append("|%s" % (' ' * indent, name)) 162 | else: 163 | rv.append("|%s" % (' ' * indent,)) 164 | elif isinstance(element, Comment): 165 | rv.append("|%s" % (' ' * indent, element)) 166 | elif isinstance(element, NavigableString): 167 | rv.append("|%s\"%s\"" % (' ' * indent, element)) 168 | else: 169 | if element.namespace: 170 | name = "%s %s" % (prefixes[element.namespace], 171 | element.name) 172 | else: 173 | name = element.name 174 | rv.append("|%s<%s>" % (' ' * indent, name)) 175 | if element.attrs: 176 | attributes = [] 177 | for name, value in element.attrs.items(): 178 
| if isinstance(name, NamespacedAttribute): 179 | name = "%s %s" % (prefixes[name.namespace], name.name) 180 | if isinstance(value, list): 181 | value = " ".join(value) 182 | attributes.append((name, value)) 183 | 184 | for name, value in sorted(attributes): 185 | rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value)) 186 | indent += 2 187 | for child in element.children: 188 | serializeElement(child, indent) 189 | serializeElement(element, 0) 190 | 191 | return "\n".join(rv) 192 | 193 | class AttrList(object): 194 | def __init__(self, element): 195 | self.element = element 196 | self.attrs = dict(self.element.attrs) 197 | def __iter__(self): 198 | return list(self.attrs.items()).__iter__() 199 | def __setitem__(self, name, value): 200 | # If this attribute is a multi-valued attribute for this element, 201 | # turn its value into a list. 202 | list_attr = HTML5TreeBuilder.cdata_list_attributes 203 | if (name in list_attr['*'] 204 | or (self.element.name in list_attr 205 | and name in list_attr[self.element.name])): 206 | # A node that is being cloned may have already undergone 207 | # this procedure. 
208 | if not isinstance(value, list): 209 | value = whitespace_re.split(value) 210 | self.element[name] = value 211 | def items(self): 212 | return list(self.attrs.items()) 213 | def keys(self): 214 | return list(self.attrs.keys()) 215 | def __len__(self): 216 | return len(self.attrs) 217 | def __getitem__(self, name): 218 | return self.attrs[name] 219 | def __contains__(self, name): 220 | return name in list(self.attrs.keys()) 221 | 222 | 223 | class Element(treebuilder_base.Node): 224 | def __init__(self, element, soup, namespace): 225 | treebuilder_base.Node.__init__(self, element.name) 226 | self.element = element 227 | self.soup = soup 228 | self.namespace = namespace 229 | 230 | def appendChild(self, node): 231 | string_child = child = None 232 | if isinstance(node, basestring): 233 | # Some other piece of code decided to pass in a string 234 | # instead of creating a TextElement object to contain the 235 | # string. 236 | string_child = child = node 237 | elif isinstance(node, Tag): 238 | # Some other piece of code decided to pass in a Tag 239 | # instead of creating an Element object to contain the 240 | # Tag. 241 | child = node 242 | elif node.element.__class__ == NavigableString: 243 | string_child = child = node.element 244 | node.parent = self 245 | else: 246 | child = node.element 247 | node.parent = self 248 | 249 | if not isinstance(child, basestring) and child.parent is not None: 250 | node.element.extract() 251 | 252 | if (string_child and self.element.contents 253 | and self.element.contents[-1].__class__ == NavigableString): 254 | # We are appending a string onto another string. 255 | # TODO This has O(n^2) performance, for input like 256 | # "aaa..." 
257 | old_element = self.element.contents[-1] 258 | new_element = self.soup.new_string(old_element + string_child) 259 | old_element.replace_with(new_element) 260 | self.soup._most_recent_element = new_element 261 | else: 262 | if isinstance(node, basestring): 263 | # Create a brand new NavigableString from this string. 264 | child = self.soup.new_string(node) 265 | 266 | # Tell Beautiful Soup to act as if it parsed this element 267 | # immediately after the parent's last descendant. (Or 268 | # immediately after the parent, if it has no children.) 269 | if self.element.contents: 270 | most_recent_element = self.element._last_descendant(False) 271 | elif self.element.next_element is not None: 272 | # Something from further ahead in the parse tree is 273 | # being inserted into this earlier element. This is 274 | # very annoying because it means an expensive search 275 | # for the last element in the tree. 276 | most_recent_element = self.soup._last_descendant() 277 | else: 278 | most_recent_element = self.element 279 | 280 | self.soup.object_was_parsed( 281 | child, parent=self.element, 282 | most_recent_element=most_recent_element) 283 | 284 | def getAttributes(self): 285 | if isinstance(self.element, Comment): 286 | return {} 287 | return AttrList(self.element) 288 | 289 | def setAttributes(self, attributes): 290 | 291 | if attributes is not None and len(attributes) > 0: 292 | 293 | converted_attributes = [] 294 | for name, value in list(attributes.items()): 295 | if isinstance(name, tuple): 296 | new_name = NamespacedAttribute(*name) 297 | del attributes[name] 298 | attributes[new_name] = value 299 | 300 | self.soup.builder._replace_cdata_list_attribute_values( 301 | self.name, attributes) 302 | for name, value in attributes.items(): 303 | self.element[name] = value 304 | 305 | # The attributes may contain variables that need substitution. 306 | # Call set_up_substitutions manually. 
307 | # 308 | # The Tag constructor called this method when the Tag was created, 309 | # but we just set/changed the attributes, so call it again. 310 | self.soup.builder.set_up_substitutions(self.element) 311 | attributes = property(getAttributes, setAttributes) 312 | 313 | def insertText(self, data, insertBefore=None): 314 | text = TextNode(self.soup.new_string(data), self.soup) 315 | if insertBefore: 316 | self.insertBefore(text, insertBefore) 317 | else: 318 | self.appendChild(text) 319 | 320 | def insertBefore(self, node, refNode): 321 | index = self.element.index(refNode.element) 322 | if (node.element.__class__ == NavigableString and self.element.contents 323 | and self.element.contents[index-1].__class__ == NavigableString): 324 | # (See comments in appendChild) 325 | old_node = self.element.contents[index-1] 326 | new_str = self.soup.new_string(old_node + node.element) 327 | old_node.replace_with(new_str) 328 | else: 329 | self.element.insert(index, node.element) 330 | node.parent = self 331 | 332 | def removeChild(self, node): 333 | node.element.extract() 334 | 335 | def reparentChildren(self, new_parent): 336 | """Move all of this tag's children into another tag.""" 337 | # print "MOVE", self.element.contents 338 | # print "FROM", self.element 339 | # print "TO", new_parent.element 340 | 341 | element = self.element 342 | new_parent_element = new_parent.element 343 | # Determine what this tag's next_element will be once all the children 344 | # are removed. 345 | final_next_element = element.next_sibling 346 | 347 | new_parents_last_descendant = new_parent_element._last_descendant(False, False) 348 | if len(new_parent_element.contents) > 0: 349 | # The new parent already contains children. We will be 350 | # appending this tag's children to the end. 
351 | new_parents_last_child = new_parent_element.contents[-1] 352 | new_parents_last_descendant_next_element = new_parents_last_descendant.next_element 353 | else: 354 | # The new parent contains no children. 355 | new_parents_last_child = None 356 | new_parents_last_descendant_next_element = new_parent_element.next_element 357 | 358 | to_append = element.contents 359 | if len(to_append) > 0: 360 | # Set the first child's previous_element and previous_sibling 361 | # to elements within the new parent 362 | first_child = to_append[0] 363 | if new_parents_last_descendant: 364 | first_child.previous_element = new_parents_last_descendant 365 | else: 366 | first_child.previous_element = new_parent_element 367 | first_child.previous_sibling = new_parents_last_child 368 | if new_parents_last_descendant: 369 | new_parents_last_descendant.next_element = first_child 370 | else: 371 | new_parent_element.next_element = first_child 372 | if new_parents_last_child: 373 | new_parents_last_child.next_sibling = first_child 374 | 375 | # Find the very last element being moved. It is now the 376 | # parent's last descendant. It has no .next_sibling and 377 | # its .next_element is whatever the previous last 378 | # descendant had. 379 | last_childs_last_descendant = to_append[-1]._last_descendant(False, True) 380 | 381 | last_childs_last_descendant.next_element = new_parents_last_descendant_next_element 382 | if new_parents_last_descendant_next_element: 383 | # TODO: This code has no test coverage and I'm not sure 384 | # how to get html5lib to go through this path, but it's 385 | # just the other side of the previous line. 386 | new_parents_last_descendant_next_element.previous_element = last_childs_last_descendant 387 | last_childs_last_descendant.next_sibling = None 388 | 389 | for child in to_append: 390 | child.parent = new_parent_element 391 | new_parent_element.contents.append(child) 392 | 393 | # Now that this element has no children, change its .next_element. 
394 | element.contents = [] 395 | element.next_element = final_next_element 396 | 397 | # print "DONE WITH MOVE" 398 | # print "FROM", self.element 399 | # print "TO", new_parent_element 400 | 401 | def cloneNode(self): 402 | tag = self.soup.new_tag(self.element.name, self.namespace) 403 | node = Element(tag, self.soup, self.namespace) 404 | for key,value in self.attributes: 405 | node.attributes[key] = value 406 | return node 407 | 408 | def hasContent(self): 409 | return self.element.contents 410 | 411 | def getNameTuple(self): 412 | if self.namespace == None: 413 | return namespaces["html"], self.name 414 | else: 415 | return self.namespace, self.name 416 | 417 | nameTuple = property(getNameTuple) 418 | 419 | class TextNode(Element): 420 | def __init__(self, element, soup): 421 | treebuilder_base.Node.__init__(self, None) 422 | self.element = element 423 | self.soup = soup 424 | 425 | def cloneNode(self): 426 | raise NotImplementedError 427 | -------------------------------------------------------------------------------- /modules/bs4/builder/_htmlparser.py: -------------------------------------------------------------------------------- 1 | """Use the HTMLParser library to parse HTML files that aren't too bad.""" 2 | 3 | # Use of this source code is governed by a BSD-style license that can be 4 | # found in the LICENSE file. 5 | 6 | __all__ = [ 7 | 'HTMLParserTreeBuilder', 8 | ] 9 | 10 | from html.parser import HTMLParser 11 | 12 | try: 13 | from html.parser import HTMLParseError 14 | except ImportError as e: 15 | # HTMLParseError is removed in Python 3.5. Since it can never be 16 | # thrown in 3.5, we can just define our own class as a placeholder. 17 | class HTMLParseError(Exception): 18 | pass 19 | 20 | import sys 21 | import warnings 22 | 23 | # Starting in Python 3.2, the HTMLParser constructor takes a 'strict' 24 | # argument, which we'd like to set to False. 
Unfortunately, 25 | # http://bugs.python.org/issue13273 makes strict=True a better bet 26 | # before Python 3.2.3. 27 | # 28 | # At the end of this file, we monkeypatch HTMLParser so that 29 | # strict=True works well on Python 3.2.2. 30 | major, minor, release = sys.version_info[:3] 31 | CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3 32 | CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3 33 | CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4 34 | 35 | 36 | from bs4.element import ( 37 | CData, 38 | Comment, 39 | Declaration, 40 | Doctype, 41 | ProcessingInstruction, 42 | ) 43 | from bs4.dammit import EntitySubstitution, UnicodeDammit 44 | 45 | from bs4.builder import ( 46 | HTML, 47 | HTMLTreeBuilder, 48 | STRICT, 49 | ) 50 | 51 | 52 | HTMLPARSER = 'html.parser' 53 | 54 | class BeautifulSoupHTMLParser(HTMLParser): 55 | def handle_starttag(self, name, attrs): 56 | # XXX namespace 57 | attr_dict = {} 58 | for key, value in attrs: 59 | # Change None attribute values to the empty string 60 | # for consistency with the other tree builders. 61 | if value is None: 62 | value = '' 63 | attr_dict[key] = value 64 | attrvalue = '""' 65 | self.soup.handle_starttag(name, None, None, attr_dict) 66 | 67 | def handle_endtag(self, name): 68 | self.soup.handle_endtag(name) 69 | 70 | def handle_data(self, data): 71 | self.soup.handle_data(data) 72 | 73 | def handle_charref(self, name): 74 | # XXX workaround for a bug in HTMLParser. Remove this once 75 | # it's fixed in all supported versions. 
76 | # http://bugs.python.org/issue13633 77 | if name.startswith('x'): 78 | real_name = int(name.lstrip('x'), 16) 79 | elif name.startswith('X'): 80 | real_name = int(name.lstrip('X'), 16) 81 | else: 82 | real_name = int(name) 83 | 84 | try: 85 | data = chr(real_name) 86 | except (ValueError, OverflowError) as e: 87 | data = "\N{REPLACEMENT CHARACTER}" 88 | 89 | self.handle_data(data) 90 | 91 | def handle_entityref(self, name): 92 | character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name) 93 | if character is not None: 94 | data = character 95 | else: 96 | data = "&%s;" % name 97 | self.handle_data(data) 98 | 99 | def handle_comment(self, data): 100 | self.soup.endData() 101 | self.soup.handle_data(data) 102 | self.soup.endData(Comment) 103 | 104 | def handle_decl(self, data): 105 | self.soup.endData() 106 | if data.startswith("DOCTYPE "): 107 | data = data[len("DOCTYPE "):] 108 | elif data == 'DOCTYPE': 109 | # i.e. "" 110 | data = '' 111 | self.soup.handle_data(data) 112 | self.soup.endData(Doctype) 113 | 114 | def unknown_decl(self, data): 115 | if data.upper().startswith('CDATA['): 116 | cls = CData 117 | data = data[len('CDATA['):] 118 | else: 119 | cls = Declaration 120 | self.soup.endData() 121 | self.soup.handle_data(data) 122 | self.soup.endData(cls) 123 | 124 | def handle_pi(self, data): 125 | self.soup.endData() 126 | self.soup.handle_data(data) 127 | self.soup.endData(ProcessingInstruction) 128 | 129 | 130 | class HTMLParserTreeBuilder(HTMLTreeBuilder): 131 | 132 | is_xml = False 133 | picklable = True 134 | NAME = HTMLPARSER 135 | features = [NAME, HTML, STRICT] 136 | 137 | def __init__(self, *args, **kwargs): 138 | if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED: 139 | kwargs['strict'] = False 140 | if CONSTRUCTOR_TAKES_CONVERT_CHARREFS: 141 | kwargs['convert_charrefs'] = False 142 | self.parser_args = (args, kwargs) 143 | 144 | def prepare_markup(self, markup, user_specified_encoding=None, 145 | 
document_declared_encoding=None, exclude_encodings=None): 146 | """ 147 | :return: A 4-tuple (markup, original encoding, encoding 148 | declared within markup, whether any characters had to be 149 | replaced with REPLACEMENT CHARACTER). 150 | """ 151 | if isinstance(markup, str): 152 | yield (markup, None, None, False) 153 | return 154 | 155 | try_encodings = [user_specified_encoding, document_declared_encoding] 156 | dammit = UnicodeDammit(markup, try_encodings, is_html=True, 157 | exclude_encodings=exclude_encodings) 158 | yield (dammit.markup, dammit.original_encoding, 159 | dammit.declared_html_encoding, 160 | dammit.contains_replacement_characters) 161 | 162 | def feed(self, markup): 163 | args, kwargs = self.parser_args 164 | parser = BeautifulSoupHTMLParser(*args, **kwargs) 165 | parser.soup = self.soup 166 | try: 167 | parser.feed(markup) 168 | except HTMLParseError as e: 169 | warnings.warn(RuntimeWarning( 170 | "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) 171 | raise e 172 | 173 | # Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some 174 | # 3.2.3 code. This ensures they don't treat markup like

as a 175 | # string. 176 | # 177 | # XXX This code can be removed once most Python 3 users are on 3.2.3. 178 | if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT: 179 | import re 180 | attrfind_tolerant = re.compile( 181 | r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*' 182 | r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?') 183 | HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant 184 | 185 | locatestarttagend = re.compile(r""" 186 | <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name 187 | (?:\s+ # whitespace before attribute name 188 | (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name 189 | (?:\s*=\s* # value indicator 190 | (?:'[^']*' # LITA-enclosed value 191 | |\"[^\"]*\" # LIT-enclosed value 192 | |[^'\">\s]+ # bare value 193 | ) 194 | )? 195 | ) 196 | )* 197 | \s* # trailing whitespace 198 | """, re.VERBOSE) 199 | BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend 200 | 201 | from html.parser import tagfind, attrfind 202 | 203 | def parse_starttag(self, i): 204 | self.__starttag_text = None 205 | endpos = self.check_for_whole_start_tag(i) 206 | if endpos < 0: 207 | return endpos 208 | rawdata = self.rawdata 209 | self.__starttag_text = rawdata[i:endpos] 210 | 211 | # Now parse the data between i+1 and j into a tag and attrs 212 | attrs = [] 213 | match = tagfind.match(rawdata, i+1) 214 | assert match, 'unexpected call to parse_starttag()' 215 | k = match.end() 216 | self.lasttag = tag = rawdata[i+1:k].lower() 217 | while k < endpos: 218 | if self.strict: 219 | m = attrfind.match(rawdata, k) 220 | else: 221 | m = attrfind_tolerant.match(rawdata, k) 222 | if not m: 223 | break 224 | attrname, rest, attrvalue = m.group(1, 2, 3) 225 | if not rest: 226 | attrvalue = None 227 | elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ 228 | attrvalue[:1] == '"' == attrvalue[-1:]: 229 | attrvalue = attrvalue[1:-1] 230 | if attrvalue: 231 | attrvalue = self.unescape(attrvalue) 232 | attrs.append((attrname.lower(), attrvalue)) 233 | k = m.end() 234 | 235 | end = 
rawdata[k:endpos].strip() 236 | if end not in (">", "/>"): 237 | lineno, offset = self.getpos() 238 | if "\n" in self.__starttag_text: 239 | lineno = lineno + self.__starttag_text.count("\n") 240 | offset = len(self.__starttag_text) \ 241 | - self.__starttag_text.rfind("\n") 242 | else: 243 | offset = offset + len(self.__starttag_text) 244 | if self.strict: 245 | self.error("junk characters in start tag: %r" 246 | % (rawdata[k:endpos][:20],)) 247 | self.handle_data(rawdata[i:endpos]) 248 | return endpos 249 | if end.endswith('/>'): 250 | # XHTML-style empty tag: 251 | self.handle_startendtag(tag, attrs) 252 | else: 253 | self.handle_starttag(tag, attrs) 254 | if tag in self.CDATA_CONTENT_ELEMENTS: 255 | self.set_cdata_mode(tag) 256 | return endpos 257 | 258 | def set_cdata_mode(self, elem): 259 | self.cdata_elem = elem.lower() 260 | self.interesting = re.compile(r'' % self.cdata_elem, re.I) 261 | 262 | BeautifulSoupHTMLParser.parse_starttag = parse_starttag 263 | BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode 264 | 265 | CONSTRUCTOR_TAKES_STRICT = True 266 | -------------------------------------------------------------------------------- /modules/bs4/builder/_htmlparser.py.bak: -------------------------------------------------------------------------------- 1 | """Use the HTMLParser library to parse HTML files that aren't too bad.""" 2 | 3 | # Use of this source code is governed by a BSD-style license that can be 4 | # found in the LICENSE file. 5 | 6 | __all__ = [ 7 | 'HTMLParserTreeBuilder', 8 | ] 9 | 10 | from HTMLParser import HTMLParser 11 | 12 | try: 13 | from HTMLParser import HTMLParseError 14 | except ImportError, e: 15 | # HTMLParseError is removed in Python 3.5. Since it can never be 16 | # thrown in 3.5, we can just define our own class as a placeholder. 
17 | class HTMLParseError(Exception): 18 | pass 19 | 20 | import sys 21 | import warnings 22 | 23 | # Starting in Python 3.2, the HTMLParser constructor takes a 'strict' 24 | # argument, which we'd like to set to False. Unfortunately, 25 | # http://bugs.python.org/issue13273 makes strict=True a better bet 26 | # before Python 3.2.3. 27 | # 28 | # At the end of this file, we monkeypatch HTMLParser so that 29 | # strict=True works well on Python 3.2.2. 30 | major, minor, release = sys.version_info[:3] 31 | CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3 32 | CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3 33 | CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4 34 | 35 | 36 | from bs4.element import ( 37 | CData, 38 | Comment, 39 | Declaration, 40 | Doctype, 41 | ProcessingInstruction, 42 | ) 43 | from bs4.dammit import EntitySubstitution, UnicodeDammit 44 | 45 | from bs4.builder import ( 46 | HTML, 47 | HTMLTreeBuilder, 48 | STRICT, 49 | ) 50 | 51 | 52 | HTMLPARSER = 'html.parser' 53 | 54 | class BeautifulSoupHTMLParser(HTMLParser): 55 | def handle_starttag(self, name, attrs): 56 | # XXX namespace 57 | attr_dict = {} 58 | for key, value in attrs: 59 | # Change None attribute values to the empty string 60 | # for consistency with the other tree builders. 61 | if value is None: 62 | value = '' 63 | attr_dict[key] = value 64 | attrvalue = '""' 65 | self.soup.handle_starttag(name, None, None, attr_dict) 66 | 67 | def handle_endtag(self, name): 68 | self.soup.handle_endtag(name) 69 | 70 | def handle_data(self, data): 71 | self.soup.handle_data(data) 72 | 73 | def handle_charref(self, name): 74 | # XXX workaround for a bug in HTMLParser. Remove this once 75 | # it's fixed in all supported versions. 
76 | # http://bugs.python.org/issue13633 77 | if name.startswith('x'): 78 | real_name = int(name.lstrip('x'), 16) 79 | elif name.startswith('X'): 80 | real_name = int(name.lstrip('X'), 16) 81 | else: 82 | real_name = int(name) 83 | 84 | try: 85 | data = unichr(real_name) 86 | except (ValueError, OverflowError), e: 87 | data = u"\N{REPLACEMENT CHARACTER}" 88 | 89 | self.handle_data(data) 90 | 91 | def handle_entityref(self, name): 92 | character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name) 93 | if character is not None: 94 | data = character 95 | else: 96 | data = "&%s;" % name 97 | self.handle_data(data) 98 | 99 | def handle_comment(self, data): 100 | self.soup.endData() 101 | self.soup.handle_data(data) 102 | self.soup.endData(Comment) 103 | 104 | def handle_decl(self, data): 105 | self.soup.endData() 106 | if data.startswith("DOCTYPE "): 107 | data = data[len("DOCTYPE "):] 108 | elif data == 'DOCTYPE': 109 | # i.e. "" 110 | data = '' 111 | self.soup.handle_data(data) 112 | self.soup.endData(Doctype) 113 | 114 | def unknown_decl(self, data): 115 | if data.upper().startswith('CDATA['): 116 | cls = CData 117 | data = data[len('CDATA['):] 118 | else: 119 | cls = Declaration 120 | self.soup.endData() 121 | self.soup.handle_data(data) 122 | self.soup.endData(cls) 123 | 124 | def handle_pi(self, data): 125 | self.soup.endData() 126 | self.soup.handle_data(data) 127 | self.soup.endData(ProcessingInstruction) 128 | 129 | 130 | class HTMLParserTreeBuilder(HTMLTreeBuilder): 131 | 132 | is_xml = False 133 | picklable = True 134 | NAME = HTMLPARSER 135 | features = [NAME, HTML, STRICT] 136 | 137 | def __init__(self, *args, **kwargs): 138 | if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED: 139 | kwargs['strict'] = False 140 | if CONSTRUCTOR_TAKES_CONVERT_CHARREFS: 141 | kwargs['convert_charrefs'] = False 142 | self.parser_args = (args, kwargs) 143 | 144 | def prepare_markup(self, markup, user_specified_encoding=None, 145 | 
document_declared_encoding=None, exclude_encodings=None): 146 | """ 147 | :return: A 4-tuple (markup, original encoding, encoding 148 | declared within markup, whether any characters had to be 149 | replaced with REPLACEMENT CHARACTER). 150 | """ 151 | if isinstance(markup, unicode): 152 | yield (markup, None, None, False) 153 | return 154 | 155 | try_encodings = [user_specified_encoding, document_declared_encoding] 156 | dammit = UnicodeDammit(markup, try_encodings, is_html=True, 157 | exclude_encodings=exclude_encodings) 158 | yield (dammit.markup, dammit.original_encoding, 159 | dammit.declared_html_encoding, 160 | dammit.contains_replacement_characters) 161 | 162 | def feed(self, markup): 163 | args, kwargs = self.parser_args 164 | parser = BeautifulSoupHTMLParser(*args, **kwargs) 165 | parser.soup = self.soup 166 | try: 167 | parser.feed(markup) 168 | except HTMLParseError, e: 169 | warnings.warn(RuntimeWarning( 170 | "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) 171 | raise e 172 | 173 | # Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some 174 | # 3.2.3 code. This ensures they don't treat markup like

as a 175 | # string. 176 | # 177 | # XXX This code can be removed once most Python 3 users are on 3.2.3. 178 | if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT: 179 | import re 180 | attrfind_tolerant = re.compile( 181 | r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*' 182 | r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?') 183 | HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant 184 | 185 | locatestarttagend = re.compile(r""" 186 | <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name 187 | (?:\s+ # whitespace before attribute name 188 | (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name 189 | (?:\s*=\s* # value indicator 190 | (?:'[^']*' # LITA-enclosed value 191 | |\"[^\"]*\" # LIT-enclosed value 192 | |[^'\">\s]+ # bare value 193 | ) 194 | )? 195 | ) 196 | )* 197 | \s* # trailing whitespace 198 | """, re.VERBOSE) 199 | BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend 200 | 201 | from html.parser import tagfind, attrfind 202 | 203 | def parse_starttag(self, i): 204 | self.__starttag_text = None 205 | endpos = self.check_for_whole_start_tag(i) 206 | if endpos < 0: 207 | return endpos 208 | rawdata = self.rawdata 209 | self.__starttag_text = rawdata[i:endpos] 210 | 211 | # Now parse the data between i+1 and j into a tag and attrs 212 | attrs = [] 213 | match = tagfind.match(rawdata, i+1) 214 | assert match, 'unexpected call to parse_starttag()' 215 | k = match.end() 216 | self.lasttag = tag = rawdata[i+1:k].lower() 217 | while k < endpos: 218 | if self.strict: 219 | m = attrfind.match(rawdata, k) 220 | else: 221 | m = attrfind_tolerant.match(rawdata, k) 222 | if not m: 223 | break 224 | attrname, rest, attrvalue = m.group(1, 2, 3) 225 | if not rest: 226 | attrvalue = None 227 | elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ 228 | attrvalue[:1] == '"' == attrvalue[-1:]: 229 | attrvalue = attrvalue[1:-1] 230 | if attrvalue: 231 | attrvalue = self.unescape(attrvalue) 232 | attrs.append((attrname.lower(), attrvalue)) 233 | k = m.end() 234 | 235 | end = 
def set_cdata_mode(self, elem):
    """Enter CDATA mode for `elem` (e.g. a <script> or <style> tag).

    While in CDATA mode the only markup of interest is the matching
    close tag, so `self.interesting` is set to a pattern matching it
    (case-insensitively, with optional whitespace inside the tag).

    The pattern literal in this copy had been corrupted to r'' -- which
    makes the % formatting raise TypeError -- so it is restored here.
    """
    self.cdata_elem = elem.lower()
    self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
def default_parser(self, encoding):
    """Return the default lxml parser, or the one set at construction.

    May return either a ready parser object or a class/factory that
    parser_for() will instantiate.
    """
    if self._default_parser is not None:
        return self._default_parser
    return etree.XMLParser(
        target=self, strip_cdata=False, recover=True, encoding=encoding)

def parser_for(self, encoding):
    """Return a parser instance configured for `encoding`.

    If default_parser() returned a class or factory rather than a
    parser object, instantiate it with this builder as the target.
    """
    parser = self.default_parser(encoding)
    # isinstance(parser, collections.Callable) was the old spelling;
    # the collections ABC aliases were removed in Python 3.10, so the
    # callable() builtin is the supported equivalent.
    if callable(parser):
        parser = parser(target=self, strip_cdata=False, encoding=encoding)
    return parser
70 | self._default_parser = parser 71 | if empty_element_tags is not None: 72 | self.empty_element_tags = set(empty_element_tags) 73 | self.soup = None 74 | self.nsmaps = [self.DEFAULT_NSMAPS] 75 | 76 | def _getNsTag(self, tag): 77 | # Split the namespace URL out of a fully-qualified lxml tag 78 | # name. Copied from lxml's src/lxml/sax.py. 79 | if tag[0] == '{': 80 | return tuple(tag[1:].split('}', 1)) 81 | else: 82 | return (None, tag) 83 | 84 | def prepare_markup(self, markup, user_specified_encoding=None, 85 | exclude_encodings=None, 86 | document_declared_encoding=None): 87 | """ 88 | :yield: A series of 4-tuples. 89 | (markup, encoding, declared encoding, 90 | has undergone character replacement) 91 | 92 | Each 4-tuple represents a strategy for parsing the document. 93 | """ 94 | # Instead of using UnicodeDammit to convert the bytestring to 95 | # Unicode using different encodings, use EncodingDetector to 96 | # iterate over the encodings, and tell lxml to try to parse 97 | # the document as each one in turn. 98 | is_html = not self.is_xml 99 | if is_html: 100 | self.processing_instruction_class = ProcessingInstruction 101 | else: 102 | self.processing_instruction_class = XMLProcessingInstruction 103 | 104 | if isinstance(markup, str): 105 | # We were given Unicode. Maybe lxml can parse Unicode on 106 | # this system? 107 | yield markup, None, document_declared_encoding, False 108 | 109 | if isinstance(markup, str): 110 | # No, apparently not. Convert the Unicode to UTF-8 and 111 | # tell lxml to parse it as UTF-8. 
def feed(self, markup):
    """Push `markup` through the underlying lxml parser, chunk by chunk.

    Accepts bytes, text, or an already file-like object; raises
    ParserRejectedMarkup if lxml cannot handle the document.
    """
    # Wrap raw bytes/text in a file-like object so everything below
    # only has to call read().
    if isinstance(markup, bytes):
        source = BytesIO(markup)
    elif isinstance(markup, str):
        source = StringIO(markup)
    else:
        source = markup

    # The parser is only initialized by its first feed() call, so the
    # first chunk is fed unconditionally -- even for an empty document.
    chunk = source.read(self.CHUNK_SIZE)
    try:
        self.parser = self.parser_for(self.soup.original_encoding)
        self.parser.feed(chunk)
        while chunk:
            # Feed the remainder one CHUNK_SIZE piece at a time.
            chunk = source.read(self.CHUNK_SIZE)
            if chunk:
                self.parser.feed(chunk)
        self.parser.close()
    except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
        raise ParserRejectedMarkup(str(e))
161 | attrs = attrs.copy() 162 | for prefix, namespace in list(nsmap.items()): 163 | attribute = NamespacedAttribute( 164 | "xmlns", prefix, "http://www.w3.org/2000/xmlns/") 165 | attrs[attribute] = namespace 166 | 167 | # Namespaces are in play. Find any attributes that came in 168 | # from lxml with namespaces attached to their names, and 169 | # turn then into NamespacedAttribute objects. 170 | new_attrs = {} 171 | for attr, value in list(attrs.items()): 172 | namespace, attr = self._getNsTag(attr) 173 | if namespace is None: 174 | new_attrs[attr] = value 175 | else: 176 | nsprefix = self._prefix_for_namespace(namespace) 177 | attr = NamespacedAttribute(nsprefix, attr, namespace) 178 | new_attrs[attr] = value 179 | attrs = new_attrs 180 | 181 | namespace, name = self._getNsTag(name) 182 | nsprefix = self._prefix_for_namespace(namespace) 183 | self.soup.handle_starttag(name, namespace, nsprefix, attrs) 184 | 185 | def _prefix_for_namespace(self, namespace): 186 | """Find the currently active prefix for the given namespace.""" 187 | if namespace is None: 188 | return None 189 | for inverted_nsmap in reversed(self.nsmaps): 190 | if inverted_nsmap is not None and namespace in inverted_nsmap: 191 | return inverted_nsmap[namespace] 192 | return None 193 | 194 | def end(self, name): 195 | self.soup.endData() 196 | completed_tag = self.soup.tagStack[-1] 197 | namespace, name = self._getNsTag(name) 198 | nsprefix = None 199 | if namespace is not None: 200 | for inverted_nsmap in reversed(self.nsmaps): 201 | if inverted_nsmap is not None and namespace in inverted_nsmap: 202 | nsprefix = inverted_nsmap[namespace] 203 | break 204 | self.soup.handle_endtag(name, nsprefix) 205 | if len(self.nsmaps) > 1: 206 | # This tag, or one of its parents, introduced a namespace 207 | # mapping, so pop it off the stack. 
208 | self.nsmaps.pop() 209 | 210 | def pi(self, target, data): 211 | self.soup.endData() 212 | self.soup.handle_data(target + ' ' + data) 213 | self.soup.endData(self.processing_instruction_class) 214 | 215 | def data(self, content): 216 | self.soup.handle_data(content) 217 | 218 | def doctype(self, name, pubid, system): 219 | self.soup.endData() 220 | doctype = Doctype.for_name_and_ids(name, pubid, system) 221 | self.soup.object_was_parsed(doctype) 222 | 223 | def comment(self, content): 224 | "Handle comments as Comment objects." 225 | self.soup.endData() 226 | self.soup.handle_data(content) 227 | self.soup.endData(Comment) 228 | 229 | def test_fragment_to_document(self, fragment): 230 | """See `TreeBuilder`.""" 231 | return '\n%s' % fragment 232 | 233 | 234 | class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): 235 | 236 | NAME = LXML 237 | ALTERNATE_NAMES = ["lxml-html"] 238 | 239 | features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE] 240 | is_xml = False 241 | processing_instruction_class = ProcessingInstruction 242 | 243 | def default_parser(self, encoding): 244 | return etree.HTMLParser 245 | 246 | def feed(self, markup): 247 | encoding = self.soup.original_encoding 248 | try: 249 | self.parser = self.parser_for(encoding) 250 | self.parser.feed(markup) 251 | self.parser.close() 252 | except (UnicodeDecodeError, LookupError, etree.ParserError) as e: 253 | raise ParserRejectedMarkup(str(e)) 254 | 255 | 256 | def test_fragment_to_document(self, fragment): 257 | """See `TreeBuilder`.""" 258 | return '%s' % fragment 259 | -------------------------------------------------------------------------------- /modules/bs4/builder/_lxml.py.bak: -------------------------------------------------------------------------------- 1 | # Use of this source code is governed by a BSD-style license that can be 2 | # found in the LICENSE file. 
3 | __all__ = [ 4 | 'LXMLTreeBuilderForXML', 5 | 'LXMLTreeBuilder', 6 | ] 7 | 8 | from io import BytesIO 9 | from StringIO import StringIO 10 | import collections 11 | from lxml import etree 12 | from bs4.element import ( 13 | Comment, 14 | Doctype, 15 | NamespacedAttribute, 16 | ProcessingInstruction, 17 | XMLProcessingInstruction, 18 | ) 19 | from bs4.builder import ( 20 | FAST, 21 | HTML, 22 | HTMLTreeBuilder, 23 | PERMISSIVE, 24 | ParserRejectedMarkup, 25 | TreeBuilder, 26 | XML) 27 | from bs4.dammit import EncodingDetector 28 | 29 | LXML = 'lxml' 30 | 31 | class LXMLTreeBuilderForXML(TreeBuilder): 32 | DEFAULT_PARSER_CLASS = etree.XMLParser 33 | 34 | is_xml = True 35 | processing_instruction_class = XMLProcessingInstruction 36 | 37 | NAME = "lxml-xml" 38 | ALTERNATE_NAMES = ["xml"] 39 | 40 | # Well, it's permissive by XML parser standards. 41 | features = [NAME, LXML, XML, FAST, PERMISSIVE] 42 | 43 | CHUNK_SIZE = 512 44 | 45 | # This namespace mapping is specified in the XML Namespace 46 | # standard. 47 | DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"} 48 | 49 | def default_parser(self, encoding): 50 | # This can either return a parser object or a class, which 51 | # will be instantiated with default arguments. 52 | if self._default_parser is not None: 53 | return self._default_parser 54 | return etree.XMLParser( 55 | target=self, strip_cdata=False, recover=True, encoding=encoding) 56 | 57 | def parser_for(self, encoding): 58 | # Use the default parser. 59 | parser = self.default_parser(encoding) 60 | 61 | if isinstance(parser, collections.Callable): 62 | # Instantiate the parser with default arguments 63 | parser = parser(target=self, strip_cdata=False, encoding=encoding) 64 | return parser 65 | 66 | def __init__(self, parser=None, empty_element_tags=None): 67 | # TODO: Issue a warning if parser is present but not a 68 | # callable, since that means there's no way to create new 69 | # parsers for different encodings. 
70 | self._default_parser = parser 71 | if empty_element_tags is not None: 72 | self.empty_element_tags = set(empty_element_tags) 73 | self.soup = None 74 | self.nsmaps = [self.DEFAULT_NSMAPS] 75 | 76 | def _getNsTag(self, tag): 77 | # Split the namespace URL out of a fully-qualified lxml tag 78 | # name. Copied from lxml's src/lxml/sax.py. 79 | if tag[0] == '{': 80 | return tuple(tag[1:].split('}', 1)) 81 | else: 82 | return (None, tag) 83 | 84 | def prepare_markup(self, markup, user_specified_encoding=None, 85 | exclude_encodings=None, 86 | document_declared_encoding=None): 87 | """ 88 | :yield: A series of 4-tuples. 89 | (markup, encoding, declared encoding, 90 | has undergone character replacement) 91 | 92 | Each 4-tuple represents a strategy for parsing the document. 93 | """ 94 | # Instead of using UnicodeDammit to convert the bytestring to 95 | # Unicode using different encodings, use EncodingDetector to 96 | # iterate over the encodings, and tell lxml to try to parse 97 | # the document as each one in turn. 98 | is_html = not self.is_xml 99 | if is_html: 100 | self.processing_instruction_class = ProcessingInstruction 101 | else: 102 | self.processing_instruction_class = XMLProcessingInstruction 103 | 104 | if isinstance(markup, unicode): 105 | # We were given Unicode. Maybe lxml can parse Unicode on 106 | # this system? 107 | yield markup, None, document_declared_encoding, False 108 | 109 | if isinstance(markup, unicode): 110 | # No, apparently not. Convert the Unicode to UTF-8 and 111 | # tell lxml to parse it as UTF-8. 
112 | yield (markup.encode("utf8"), "utf8", 113 | document_declared_encoding, False) 114 | 115 | try_encodings = [user_specified_encoding, document_declared_encoding] 116 | detector = EncodingDetector( 117 | markup, try_encodings, is_html, exclude_encodings) 118 | for encoding in detector.encodings: 119 | yield (detector.markup, encoding, document_declared_encoding, False) 120 | 121 | def feed(self, markup): 122 | if isinstance(markup, bytes): 123 | markup = BytesIO(markup) 124 | elif isinstance(markup, unicode): 125 | markup = StringIO(markup) 126 | 127 | # Call feed() at least once, even if the markup is empty, 128 | # or the parser won't be initialized. 129 | data = markup.read(self.CHUNK_SIZE) 130 | try: 131 | self.parser = self.parser_for(self.soup.original_encoding) 132 | self.parser.feed(data) 133 | while len(data) != 0: 134 | # Now call feed() on the rest of the data, chunk by chunk. 135 | data = markup.read(self.CHUNK_SIZE) 136 | if len(data) != 0: 137 | self.parser.feed(data) 138 | self.parser.close() 139 | except (UnicodeDecodeError, LookupError, etree.ParserError), e: 140 | raise ParserRejectedMarkup(str(e)) 141 | 142 | def close(self): 143 | self.nsmaps = [self.DEFAULT_NSMAPS] 144 | 145 | def start(self, name, attrs, nsmap={}): 146 | # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy. 147 | attrs = dict(attrs) 148 | nsprefix = None 149 | # Invert each namespace map as it comes in. 150 | if len(self.nsmaps) > 1: 151 | # There are no new namespaces for this tag, but 152 | # non-default namespaces are in play, so we need a 153 | # separate tag stack to know when they end. 154 | self.nsmaps.append(None) 155 | elif len(nsmap) > 0: 156 | # A new namespace mapping has come into play. 157 | inverted_nsmap = dict((value, key) for key, value in nsmap.items()) 158 | self.nsmaps.append(inverted_nsmap) 159 | # Also treat the namespace mapping as a set of attributes on the 160 | # tag, so we can recreate it later. 
161 | attrs = attrs.copy() 162 | for prefix, namespace in nsmap.items(): 163 | attribute = NamespacedAttribute( 164 | "xmlns", prefix, "http://www.w3.org/2000/xmlns/") 165 | attrs[attribute] = namespace 166 | 167 | # Namespaces are in play. Find any attributes that came in 168 | # from lxml with namespaces attached to their names, and 169 | # turn then into NamespacedAttribute objects. 170 | new_attrs = {} 171 | for attr, value in attrs.items(): 172 | namespace, attr = self._getNsTag(attr) 173 | if namespace is None: 174 | new_attrs[attr] = value 175 | else: 176 | nsprefix = self._prefix_for_namespace(namespace) 177 | attr = NamespacedAttribute(nsprefix, attr, namespace) 178 | new_attrs[attr] = value 179 | attrs = new_attrs 180 | 181 | namespace, name = self._getNsTag(name) 182 | nsprefix = self._prefix_for_namespace(namespace) 183 | self.soup.handle_starttag(name, namespace, nsprefix, attrs) 184 | 185 | def _prefix_for_namespace(self, namespace): 186 | """Find the currently active prefix for the given namespace.""" 187 | if namespace is None: 188 | return None 189 | for inverted_nsmap in reversed(self.nsmaps): 190 | if inverted_nsmap is not None and namespace in inverted_nsmap: 191 | return inverted_nsmap[namespace] 192 | return None 193 | 194 | def end(self, name): 195 | self.soup.endData() 196 | completed_tag = self.soup.tagStack[-1] 197 | namespace, name = self._getNsTag(name) 198 | nsprefix = None 199 | if namespace is not None: 200 | for inverted_nsmap in reversed(self.nsmaps): 201 | if inverted_nsmap is not None and namespace in inverted_nsmap: 202 | nsprefix = inverted_nsmap[namespace] 203 | break 204 | self.soup.handle_endtag(name, nsprefix) 205 | if len(self.nsmaps) > 1: 206 | # This tag, or one of its parents, introduced a namespace 207 | # mapping, so pop it off the stack. 
208 | self.nsmaps.pop() 209 | 210 | def pi(self, target, data): 211 | self.soup.endData() 212 | self.soup.handle_data(target + ' ' + data) 213 | self.soup.endData(self.processing_instruction_class) 214 | 215 | def data(self, content): 216 | self.soup.handle_data(content) 217 | 218 | def doctype(self, name, pubid, system): 219 | self.soup.endData() 220 | doctype = Doctype.for_name_and_ids(name, pubid, system) 221 | self.soup.object_was_parsed(doctype) 222 | 223 | def comment(self, content): 224 | "Handle comments as Comment objects." 225 | self.soup.endData() 226 | self.soup.handle_data(content) 227 | self.soup.endData(Comment) 228 | 229 | def test_fragment_to_document(self, fragment): 230 | """See `TreeBuilder`.""" 231 | return u'\n%s' % fragment 232 | 233 | 234 | class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): 235 | 236 | NAME = LXML 237 | ALTERNATE_NAMES = ["lxml-html"] 238 | 239 | features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE] 240 | is_xml = False 241 | processing_instruction_class = ProcessingInstruction 242 | 243 | def default_parser(self, encoding): 244 | return etree.HTMLParser 245 | 246 | def feed(self, markup): 247 | encoding = self.soup.original_encoding 248 | try: 249 | self.parser = self.parser_for(encoding) 250 | self.parser.feed(markup) 251 | self.parser.close() 252 | except (UnicodeDecodeError, LookupError, etree.ParserError), e: 253 | raise ParserRejectedMarkup(str(e)) 254 | 255 | 256 | def test_fragment_to_document(self, fragment): 257 | """See `TreeBuilder`.""" 258 | return u'%s' % fragment 259 | -------------------------------------------------------------------------------- /modules/bs4/diagnose.py: -------------------------------------------------------------------------------- 1 | """Diagnostic functions, mainly for use when doing tech support.""" 2 | 3 | # Use of this source code is governed by a BSD-style license that can be 4 | # found in the LICENSE file. 
def diagnose(data):
    """Diagnostic suite for isolating common problems.

    Prints which parsers are available, then tries to parse `data`
    with each one, printing either the resulting tree or the parse
    traceback.

    :param data: Markup as a string, a filename, or an open filehandle.
    """
    print("Diagnostic running on Beautiful Soup %s" % __version__)
    print("Python version %s" % sys.version)

    basic_parsers = ["html.parser", "html5lib", "lxml"]
    # Iterate over a copy: removing items from the list being iterated
    # silently skips the element that follows each removal.
    for name in list(basic_parsers):
        for builder in builder_registry.builders:
            if name in builder.features:
                break
        else:
            basic_parsers.remove(name)
            print((
                "I noticed that %s is not installed. Installing it may help." %
                name))

    if 'lxml' in basic_parsers:
        basic_parsers.append(["lxml", "xml"])
        try:
            from lxml import etree
            print("Found lxml version %s" % ".".join(map(str, etree.LXML_VERSION)))
        except ImportError as e:
            print(
                "lxml is not installed or couldn't be imported.")

    if 'html5lib' in basic_parsers:
        try:
            import html5lib
            print("Found html5lib version %s" % html5lib.__version__)
        except ImportError as e:
            print(
                "html5lib is not installed or couldn't be imported.")

    # Accept a filehandle, a filename, or raw markup; URLs are rejected
    # with an explanatory message.
    if hasattr(data, 'read'):
        data = data.read()
    elif os.path.exists(data):
        print('"%s" looks like a filename. Reading data from the file.' % data)
        with open(data) as fp:
            data = fp.read()
    elif data.startswith("http:") or data.startswith("https:"):
        print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)
        print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
        return
    print()

    for parser in basic_parsers:
        print("Trying to parse your markup with %s" % parser)
        success = False
        try:
            soup = BeautifulSoup(data, parser)
            success = True
        except Exception as e:
            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        if success:
            print("Here's what %s did with the markup:" % parser)
            print(soup.prettify())

        print("-" * 80)
129 | 130 | This lets you see how HTMLParser parses a document when no 131 | Beautiful Soup code is running. 132 | """ 133 | parser = AnnouncingParser() 134 | parser.feed(data) 135 | 136 | _vowels = "aeiou" 137 | _consonants = "bcdfghjklmnpqrstvwxyz" 138 | 139 | def rword(length=5): 140 | "Generate a random word-like string." 141 | s = '' 142 | for i in range(length): 143 | if i % 2 == 0: 144 | t = _consonants 145 | else: 146 | t = _vowels 147 | s += random.choice(t) 148 | return s 149 | 150 | def rsentence(length=4): 151 | "Generate a random sentence-like string." 152 | return " ".join(rword(random.randint(4,9)) for i in range(length)) 153 | 154 | def rdoc(num_elements=1000): 155 | """Randomly generate an invalid HTML document.""" 156 | tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table'] 157 | elements = [] 158 | for i in range(num_elements): 159 | choice = random.randint(0,3) 160 | if choice == 0: 161 | # New tag. 162 | tag_name = random.choice(tag_names) 163 | elements.append("<%s>" % tag_name) 164 | elif choice == 1: 165 | elements.append(rsentence(random.randint(1,4))) 166 | elif choice == 2: 167 | # Close a tag. 168 | tag_name = random.choice(tag_names) 169 | elements.append("" % tag_name) 170 | return "" + "\n".join(elements) + "" 171 | 172 | def benchmark_parsers(num_elements=100000): 173 | """Very basic head-to-head performance benchmark.""" 174 | print("Comparative parser benchmark on Beautiful Soup %s" % __version__) 175 | data = rdoc(num_elements) 176 | print("Generated a large invalid HTML document (%d bytes)." % len(data)) 177 | 178 | for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]: 179 | success = False 180 | try: 181 | a = time.time() 182 | soup = BeautifulSoup(data, parser) 183 | b = time.time() 184 | success = True 185 | except Exception as e: 186 | print("%s could not parse the markup." % parser) 187 | traceback.print_exc() 188 | if success: 189 | print("BS4+%s parsed the markup in %.2fs." 
def profile(num_elements=100000, parser="lxml"):
    """Profile BeautifulSoup parsing of a large random document.

    Writes cProfile data to a temporary file, then prints the 50 most
    expensive bs4/html5lib entries sorted by cumulative time.
    """
    tmp = tempfile.NamedTemporaryFile()
    profile_path = tmp.name

    markup = rdoc(num_elements)
    # The statement string below refers to the names "data" and
    # "parser", so those keys must appear in the namespace dict.
    namespace = dict(bs4=bs4, data=markup, parser=parser)
    cProfile.runctx('bs4.BeautifulSoup(data, parser)', namespace, namespace,
                    profile_path)

    stats = pstats.Stats(profile_path)
    stats.sort_stats("cumulative")
    stats.print_stats('_html5lib|bs4', 50)
__license__ = "MIT"

# NOTE(review): this vendored module was written for Python 2 (print
# statements, StringIO/HTMLParser module names).  The addon embedding it
# runs under Python 3 (Blender), and the sibling *.py.bak files indicate a
# 2to3 pass was intended, so the module is ported to Python 3 here.
import cProfile
from io import StringIO
from html.parser import HTMLParser
import bs4
from bs4 import BeautifulSoup, __version__
from bs4.builder import builder_registry

import os
import pstats
import random
import tempfile
import time
import traceback
import sys
# (duplicate "import cProfile" removed; it appeared twice in the original)


def diagnose(data):
    """Diagnostic suite for isolating common problems.

    :param data: Markup as a string, an open filehandle, or a filename.
    """
    print("Diagnostic running on Beautiful Soup %s" % __version__)
    print("Python version %s" % sys.version)

    basic_parsers = ["html.parser", "html5lib", "lxml"]
    # Iterate over a copy: the original removed entries from the list while
    # iterating over it, which silently skips the element that follows each
    # removal.
    for name in list(basic_parsers):
        for builder in builder_registry.builders:
            if name in builder.features:
                break
        else:
            basic_parsers.remove(name)
            print(
                "I noticed that %s is not installed. Installing it may help." %
                name)

    if 'lxml' in basic_parsers:
        basic_parsers.append(["lxml", "xml"])
        try:
            from lxml import etree
            print("Found lxml version %s" % ".".join(map(str, etree.LXML_VERSION)))
        except ImportError:
            print("lxml is not installed or couldn't be imported.")

    if 'html5lib' in basic_parsers:
        try:
            import html5lib
            print("Found html5lib version %s" % html5lib.__version__)
        except ImportError:
            print("html5lib is not installed or couldn't be imported.")

    # Accept a filehandle, a filename, or raw markup; URLs are rejected
    # because Beautiful Soup is not an HTTP client.
    if hasattr(data, 'read'):
        data = data.read()
    elif os.path.exists(data):
        print('"%s" looks like a filename. Reading data from the file.' % data)
        with open(data) as fp:
            data = fp.read()
    elif data.startswith("http:") or data.startswith("https:"):
        print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)
        print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
        return
    print()

    # Try every available parser and show what each one built.
    for parser in basic_parsers:
        print("Trying to parse your markup with %s" % parser)
        success = False
        try:
            soup = BeautifulSoup(data, parser)
            success = True
        except Exception:
            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        if success:
            print("Here's what %s did with the markup:" % parser)
            print(soup.prettify())

        print("-" * 80)


def lxml_trace(data, html=True, **kwargs):
    """Print out the lxml events that occur during parsing.

    This lets you see how lxml parses a document when no Beautiful
    Soup code is running.
    """
    from lxml import etree
    for event, element in etree.iterparse(StringIO(data), html=html, **kwargs):
        print("%s, %4s, %s" % (event, element.tag, element.text))


class AnnouncingParser(HTMLParser):
    """Announces HTMLParser parse events, without doing anything else."""

    def _p(self, s):
        print(s)

    def handle_starttag(self, name, attrs):
        self._p("%s START" % name)

    def handle_endtag(self, name):
        self._p("%s END" % name)

    def handle_data(self, data):
        self._p("%s DATA" % data)

    def handle_charref(self, name):
        self._p("%s CHARREF" % name)

    def handle_entityref(self, name):
        self._p("%s ENTITYREF" % name)

    def handle_comment(self, data):
        self._p("%s COMMENT" % data)

    def handle_decl(self, data):
        self._p("%s DECL" % data)

    def unknown_decl(self, data):
        self._p("%s UNKNOWN-DECL" % data)

    def handle_pi(self, data):
        self._p("%s PI" % data)


def htmlparser_trace(data):
    """Print out the HTMLParser events that occur during parsing.

    This lets you see how HTMLParser parses a document when no
    Beautiful Soup code is running.
    """
    parser = AnnouncingParser()
    parser.feed(data)


_vowels = "aeiou"
_consonants = "bcdfghjklmnpqrstvwxyz"


def rword(length=5):
    """Generate a random word-like string (alternating consonant/vowel)."""
    s = ''
    for i in range(length):
        if i % 2 == 0:
            t = _consonants
        else:
            t = _vowels
        s += random.choice(t)
    return s


def rsentence(length=4):
    """Generate a random sentence-like string."""
    return " ".join(rword(random.randint(4, 9)) for i in range(length))


def rdoc(num_elements=1000):
    """Randomly generate an invalid HTML document."""
    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
    elements = []
    for i in range(num_elements):
        choice = random.randint(0, 3)
        if choice == 0:
            # New tag.
            tag_name = random.choice(tag_names)
            elements.append("<%s>" % tag_name)
        elif choice == 1:
            elements.append(rsentence(random.randint(1, 4)))
        elif choice == 2:
            # Close a tag.  NOTE(review): the close-tag and <html> wrapper
            # literals were stripped when this file was dumped through an
            # HTML renderer; restored here from the upstream bs4 source.
            tag_name = random.choice(tag_names)
            elements.append("</%s>" % tag_name)
    return "<html>" + "\n".join(elements) + "</html>"


def benchmark_parsers(num_elements=100000):
    """Very basic head-to-head performance benchmark."""
    print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
    data = rdoc(num_elements)
    print("Generated a large invalid HTML document (%d bytes)." % len(data))

    for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
        success = False
        try:
            a = time.time()
            soup = BeautifulSoup(data, parser)
            b = time.time()
            success = True
        except Exception:
            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        if success:
            print("BS4+%s parsed the markup in %.2fs." % (parser, b - a))

    from lxml import etree
    a = time.time()
    etree.HTML(data)
    b = time.time()
    print("Raw lxml parsed the markup in %.2fs." % (b - a))

    import html5lib
    parser = html5lib.HTMLParser()
    a = time.time()
    parser.parse(data)
    b = time.time()
    print("Raw html5lib parsed the markup in %.2fs." % (b - a))


def profile(num_elements=100000, parser="lxml"):
    """Profile a BeautifulSoup parse of a random document and print stats."""
    filehandle = tempfile.NamedTemporaryFile()
    filename = filehandle.name

    data = rdoc(num_elements)
    vars = dict(bs4=bs4, data=data, parser=parser)
    cProfile.runctx('bs4.BeautifulSoup(data, parser)', vars, vars, filename)

    stats = pstats.Stats(filename)
    # stats.strip_dirs()
    stats.sort_stats("cumulative")
    stats.print_stats('_html5lib|bs4', 50)


if __name__ == '__main__':
    diagnose(sys.stdin.read())

# ---------------------------------------------------------------------------
# /modules/bs4/tests/__init__.py
"The beautifulsoup tests."
2 | -------------------------------------------------------------------------------- /modules/bs4/tests/test_builder_registry.py: -------------------------------------------------------------------------------- 1 | """Tests of the builder registry.""" 2 | 3 | import unittest 4 | import warnings 5 | 6 | from bs4 import BeautifulSoup 7 | from bs4.builder import ( 8 | builder_registry as registry, 9 | HTMLParserTreeBuilder, 10 | TreeBuilderRegistry, 11 | ) 12 | 13 | try: 14 | from bs4.builder import HTML5TreeBuilder 15 | HTML5LIB_PRESENT = True 16 | except ImportError: 17 | HTML5LIB_PRESENT = False 18 | 19 | try: 20 | from bs4.builder import ( 21 | LXMLTreeBuilderForXML, 22 | LXMLTreeBuilder, 23 | ) 24 | LXML_PRESENT = True 25 | except ImportError: 26 | LXML_PRESENT = False 27 | 28 | 29 | class BuiltInRegistryTest(unittest.TestCase): 30 | """Test the built-in registry with the default builders registered.""" 31 | 32 | def test_combination(self): 33 | if LXML_PRESENT: 34 | self.assertEqual(registry.lookup('fast', 'html'), 35 | LXMLTreeBuilder) 36 | 37 | if LXML_PRESENT: 38 | self.assertEqual(registry.lookup('permissive', 'xml'), 39 | LXMLTreeBuilderForXML) 40 | self.assertEqual(registry.lookup('strict', 'html'), 41 | HTMLParserTreeBuilder) 42 | if HTML5LIB_PRESENT: 43 | self.assertEqual(registry.lookup('html5lib', 'html'), 44 | HTML5TreeBuilder) 45 | 46 | def test_lookup_by_markup_type(self): 47 | if LXML_PRESENT: 48 | self.assertEqual(registry.lookup('html'), LXMLTreeBuilder) 49 | self.assertEqual(registry.lookup('xml'), LXMLTreeBuilderForXML) 50 | else: 51 | self.assertEqual(registry.lookup('xml'), None) 52 | if HTML5LIB_PRESENT: 53 | self.assertEqual(registry.lookup('html'), HTML5TreeBuilder) 54 | else: 55 | self.assertEqual(registry.lookup('html'), HTMLParserTreeBuilder) 56 | 57 | def test_named_library(self): 58 | if LXML_PRESENT: 59 | self.assertEqual(registry.lookup('lxml', 'xml'), 60 | LXMLTreeBuilderForXML) 61 | self.assertEqual(registry.lookup('lxml', 
'html'), 62 | LXMLTreeBuilder) 63 | if HTML5LIB_PRESENT: 64 | self.assertEqual(registry.lookup('html5lib'), 65 | HTML5TreeBuilder) 66 | 67 | self.assertEqual(registry.lookup('html.parser'), 68 | HTMLParserTreeBuilder) 69 | 70 | def test_beautifulsoup_constructor_does_lookup(self): 71 | 72 | with warnings.catch_warnings(record=True) as w: 73 | # This will create a warning about not explicitly 74 | # specifying a parser, but we'll ignore it. 75 | 76 | # You can pass in a string. 77 | BeautifulSoup("", features="html") 78 | # Or a list of strings. 79 | BeautifulSoup("", features=["html", "fast"]) 80 | 81 | # You'll get an exception if BS can't find an appropriate 82 | # builder. 83 | self.assertRaises(ValueError, BeautifulSoup, 84 | "", features="no-such-feature") 85 | 86 | class RegistryTest(unittest.TestCase): 87 | """Test the TreeBuilderRegistry class in general.""" 88 | 89 | def setUp(self): 90 | self.registry = TreeBuilderRegistry() 91 | 92 | def builder_for_features(self, *feature_list): 93 | cls = type('Builder_' + '_'.join(feature_list), 94 | (object,), {'features' : feature_list}) 95 | 96 | self.registry.register(cls) 97 | return cls 98 | 99 | def test_register_with_no_features(self): 100 | builder = self.builder_for_features() 101 | 102 | # Since the builder advertises no features, you can't find it 103 | # by looking up features. 104 | self.assertEqual(self.registry.lookup('foo'), None) 105 | 106 | # But you can find it by doing a lookup with no features, if 107 | # this happens to be the only registered builder. 
108 | self.assertEqual(self.registry.lookup(), builder) 109 | 110 | def test_register_with_features_makes_lookup_succeed(self): 111 | builder = self.builder_for_features('foo', 'bar') 112 | self.assertEqual(self.registry.lookup('foo'), builder) 113 | self.assertEqual(self.registry.lookup('bar'), builder) 114 | 115 | def test_lookup_fails_when_no_builder_implements_feature(self): 116 | builder = self.builder_for_features('foo', 'bar') 117 | self.assertEqual(self.registry.lookup('baz'), None) 118 | 119 | def test_lookup_gets_most_recent_registration_when_no_feature_specified(self): 120 | builder1 = self.builder_for_features('foo') 121 | builder2 = self.builder_for_features('bar') 122 | self.assertEqual(self.registry.lookup(), builder2) 123 | 124 | def test_lookup_fails_when_no_tree_builders_registered(self): 125 | self.assertEqual(self.registry.lookup(), None) 126 | 127 | def test_lookup_gets_most_recent_builder_supporting_all_features(self): 128 | has_one = self.builder_for_features('foo') 129 | has_the_other = self.builder_for_features('bar') 130 | has_both_early = self.builder_for_features('foo', 'bar', 'baz') 131 | has_both_late = self.builder_for_features('foo', 'bar', 'quux') 132 | lacks_one = self.builder_for_features('bar') 133 | has_the_other = self.builder_for_features('foo') 134 | 135 | # There are two builders featuring 'foo' and 'bar', but 136 | # the one that also features 'quux' was registered later. 137 | self.assertEqual(self.registry.lookup('foo', 'bar'), 138 | has_both_late) 139 | 140 | # There is only one builder featuring 'foo', 'bar', and 'baz'. 
141 | self.assertEqual(self.registry.lookup('foo', 'bar', 'baz'), 142 | has_both_early) 143 | 144 | def test_lookup_fails_when_cannot_reconcile_requested_features(self): 145 | builder1 = self.builder_for_features('foo', 'bar') 146 | builder2 = self.builder_for_features('foo', 'baz') 147 | self.assertEqual(self.registry.lookup('bar', 'baz'), None) 148 | -------------------------------------------------------------------------------- /modules/bs4/tests/test_docs.py: -------------------------------------------------------------------------------- 1 | "Test harness for doctests." 2 | 3 | # pylint: disable-msg=E0611,W0142 4 | 5 | __metaclass__ = type 6 | __all__ = [ 7 | 'additional_tests', 8 | ] 9 | 10 | import atexit 11 | import doctest 12 | import os 13 | #from pkg_resources import ( 14 | # resource_filename, resource_exists, resource_listdir, cleanup_resources) 15 | import unittest 16 | 17 | DOCTEST_FLAGS = ( 18 | doctest.ELLIPSIS | 19 | doctest.NORMALIZE_WHITESPACE | 20 | doctest.REPORT_NDIFF) 21 | 22 | 23 | # def additional_tests(): 24 | # "Run the doc tests (README.txt and docs/*, if any exist)" 25 | # doctest_files = [ 26 | # os.path.abspath(resource_filename('bs4', 'README.txt'))] 27 | # if resource_exists('bs4', 'docs'): 28 | # for name in resource_listdir('bs4', 'docs'): 29 | # if name.endswith('.txt'): 30 | # doctest_files.append( 31 | # os.path.abspath( 32 | # resource_filename('bs4', 'docs/%s' % name))) 33 | # kwargs = dict(module_relative=False, optionflags=DOCTEST_FLAGS) 34 | # atexit.register(cleanup_resources) 35 | # return unittest.TestSuite(( 36 | # doctest.DocFileSuite(*doctest_files, **kwargs))) 37 | -------------------------------------------------------------------------------- /modules/bs4/tests/test_html5lib.py: -------------------------------------------------------------------------------- 1 | """Tests to ensure that the html5lib tree builder generates good trees.""" 2 | 3 | import warnings 4 | 5 | try: 6 | from bs4.builder import 
HTML5TreeBuilder 7 | HTML5LIB_PRESENT = True 8 | except ImportError as e: 9 | HTML5LIB_PRESENT = False 10 | from bs4.element import SoupStrainer 11 | from bs4.testing import ( 12 | HTML5TreeBuilderSmokeTest, 13 | SoupTest, 14 | skipIf, 15 | ) 16 | 17 | @skipIf( 18 | not HTML5LIB_PRESENT, 19 | "html5lib seems not to be present, not testing its tree builder.") 20 | class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest): 21 | """See ``HTML5TreeBuilderSmokeTest``.""" 22 | 23 | @property 24 | def default_builder(self): 25 | return HTML5TreeBuilder() 26 | 27 | def test_soupstrainer(self): 28 | # The html5lib tree builder does not support SoupStrainers. 29 | strainer = SoupStrainer("b") 30 | markup = "

A bold statement.

" 31 | with warnings.catch_warnings(record=True) as w: 32 | soup = self.soup(markup, parse_only=strainer) 33 | self.assertEqual( 34 | soup.decode(), self.document_for(markup)) 35 | 36 | self.assertTrue( 37 | "the html5lib tree builder doesn't support parse_only" in 38 | str(w[0].message)) 39 | 40 | def test_correctly_nested_tables(self): 41 | """html5lib inserts tags where other parsers don't.""" 42 | markup = ('' 43 | '' 44 | "') 48 | 49 | self.assertSoupEquals( 50 | markup, 51 | '
Here's another table:" 45 | '' 46 | '' 47 | '
foo
Here\'s another table:' 52 | '
foo
' 53 | '
') 54 | 55 | self.assertSoupEquals( 56 | "" 57 | "" 58 | "
Foo
Bar
Baz
") 59 | 60 | def test_xml_declaration_followed_by_doctype(self): 61 | markup = ''' 62 | 63 | 64 | 65 | 66 | 67 |

foo

68 | 69 | ''' 70 | soup = self.soup(markup) 71 | # Verify that we can reach the

tag; this means the tree is connected. 72 | self.assertEqual(b"

foo

", soup.p.encode()) 73 | 74 | def test_reparented_markup(self): 75 | markup = '

foo

\n

bar

' 76 | soup = self.soup(markup) 77 | self.assertEqual("

foo

\n

bar

", soup.body.decode()) 78 | self.assertEqual(2, len(soup.find_all('p'))) 79 | 80 | 81 | def test_reparented_markup_ends_with_whitespace(self): 82 | markup = '

foo

\n

bar

\n' 83 | soup = self.soup(markup) 84 | self.assertEqual("

foo

\n

bar

\n", soup.body.decode()) 85 | self.assertEqual(2, len(soup.find_all('p'))) 86 | 87 | def test_reparented_markup_containing_identical_whitespace_nodes(self): 88 | """Verify that we keep the two whitespace nodes in this 89 | document distinct when reparenting the adjacent tags. 90 | """ 91 | markup = '
' 92 | soup = self.soup(markup) 93 | space1, space2 = soup.find_all(string=' ') 94 | tbody1, tbody2 = soup.find_all('tbody') 95 | assert space1.next_element is tbody1 96 | assert tbody2.next_element is space2 97 | 98 | def test_reparented_markup_containing_children(self): 99 | markup = '' 100 | soup = self.soup(markup) 101 | noscript = soup.noscript 102 | self.assertEqual("target", noscript.next_element) 103 | target = soup.find(string='target') 104 | 105 | # The 'aftermath' string was duplicated; we want the second one. 106 | final_aftermath = soup.find_all(string='aftermath')[-1] 107 | 108 | # The