├── .rspec ├── lib ├── html2doc │ ├── version.rb │ ├── xml.rb │ ├── notes.rb │ ├── base.rb │ ├── lists.rb │ ├── math.rb │ ├── mime.rb │ └── wordstyle.css └── html2doc.rb ├── .gitattributes ├── spec ├── 19160-6.png ├── 19160-7.gif ├── 19160-8.jpg ├── examples │ ├── rice_images │ │ ├── rice_image1.gif │ │ ├── rice_image1.png │ │ ├── rice_image2.png │ │ ├── rice_image3_1.png │ │ ├── rice_image3_2.png │ │ └── rice_image3_3.png │ └── header.html ├── odf.svg ├── spec_helper.rb ├── header.html ├── header_img.html ├── wordstyle-custom-lists.css ├── wordstyle-nopagesize.css └── wordstyle-custom.css ├── Rakefile ├── .hound.yml ├── bin ├── setup ├── console ├── rspec └── html2doc ├── .gitignore ├── Gemfile ├── .rubocop.yml ├── .github └── workflows │ ├── rake.yml │ └── release.yml ├── html2doc.gemspec ├── LICENSE ├── CODE_OF_CONDUCT.md └── README.adoc /.rspec: -------------------------------------------------------------------------------- 1 | --format documentation 2 | --color 3 | --require spec_helper 4 | -------------------------------------------------------------------------------- /lib/html2doc/version.rb: -------------------------------------------------------------------------------- 1 | class Html2Doc 2 | VERSION = "1.10.1".freeze 3 | end 4 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | rfc2629*.* linguist-vendored 2 | mathml2omml*.* linguist-vendored 3 | -------------------------------------------------------------------------------- /spec/19160-6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metanorma/html2doc/HEAD/spec/19160-6.png -------------------------------------------------------------------------------- /spec/19160-7.gif: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/metanorma/html2doc/HEAD/spec/19160-7.gif -------------------------------------------------------------------------------- /spec/19160-8.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metanorma/html2doc/HEAD/spec/19160-8.jpg -------------------------------------------------------------------------------- /spec/examples/rice_images/rice_image1.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metanorma/html2doc/HEAD/spec/examples/rice_images/rice_image1.gif -------------------------------------------------------------------------------- /spec/examples/rice_images/rice_image1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metanorma/html2doc/HEAD/spec/examples/rice_images/rice_image1.png -------------------------------------------------------------------------------- /spec/examples/rice_images/rice_image2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metanorma/html2doc/HEAD/spec/examples/rice_images/rice_image2.png -------------------------------------------------------------------------------- /spec/examples/rice_images/rice_image3_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metanorma/html2doc/HEAD/spec/examples/rice_images/rice_image3_1.png -------------------------------------------------------------------------------- /spec/examples/rice_images/rice_image3_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metanorma/html2doc/HEAD/spec/examples/rice_images/rice_image3_2.png -------------------------------------------------------------------------------- /spec/examples/rice_images/rice_image3_3.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/metanorma/html2doc/HEAD/spec/examples/rice_images/rice_image3_3.png -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require "bundler/gem_tasks" 2 | require "rspec/core/rake_task" 3 | 4 | RSpec::Core::RakeTask.new(:spec) 5 | 6 | task default: :spec 7 | -------------------------------------------------------------------------------- /.hound.yml: -------------------------------------------------------------------------------- 1 | # Auto-generated by Cimas: Do not edit it manually! 2 | # See https://github.com/metanorma/cimas 3 | ruby: 4 | enabled: true 5 | config_file: .rubocop.yml 6 | -------------------------------------------------------------------------------- /bin/setup: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euo pipefail 3 | IFS=$'\n\t' 4 | set -vx 5 | 6 | bundle install 7 | 8 | # Do any other automated setup that you need to do here 9 | -------------------------------------------------------------------------------- /spec/odf.svg: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /.bundle/ 2 | /.yardoc 3 | /_yardoc/ 4 | /coverage/ 5 | /doc/ 6 | /pkg/ 7 | /spec/reports/ 8 | /tmp/ 9 | 10 | # rspec failure tracking 11 | .rspec_status 12 | 13 | .rubocop-https--* 14 | -------------------------------------------------------------------------------- /lib/html2doc.rb: -------------------------------------------------------------------------------- 1 | require_relative "html2doc/version" 2 | require_relative "html2doc/base" 3 | 
require_relative "html2doc/mime" 4 | require_relative "html2doc/notes" 5 | require_relative "html2doc/math" 6 | require_relative "html2doc/lists" 7 | require_relative "html2doc/xml" 8 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | Encoding.default_external = Encoding::UTF_8 2 | Encoding.default_internal = Encoding::UTF_8 3 | 4 | source "https://rubygems.org" 5 | git_source(:github) { |repo| "https://github.com/#{repo}" } 6 | 7 | gemspec 8 | 9 | eval_gemfile("Gemfile.devel") rescue nil 10 | -------------------------------------------------------------------------------- /.rubocop.yml: -------------------------------------------------------------------------------- 1 | # Auto-generated by Cimas: Do not edit it manually! 2 | # See https://github.com/metanorma/cimas 3 | inherit_from: 4 | - https://raw.githubusercontent.com/riboseinc/oss-guides/master/ci/rubocop.yml 5 | 6 | # local repo-specific modifications 7 | # ... 8 | 9 | AllCops: 10 | TargetRubyVersion: 2.5 11 | -------------------------------------------------------------------------------- /bin/console: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require "bundler/setup" 4 | require "html2doc" 5 | 6 | # You can add fixtures and/or initialization code here to make experimenting 7 | # with your gem easier. You can also use a different console, if you like. 8 | 9 | # (If you use this, don't forget to add pry to your Gemfile!) 10 | # require "pry" 11 | # Pry.start 12 | 13 | require "irb" 14 | IRB.start(__FILE__) 15 | -------------------------------------------------------------------------------- /.github/workflows/rake.yml: -------------------------------------------------------------------------------- 1 | # Auto-generated by Cimas: Do not edit it manually! 
2 | # See https://github.com/metanorma/cimas 3 | name: rake 4 | 5 | on: 6 | push: 7 | branches: [ master, main ] 8 | tags: [ v* ] 9 | pull_request: 10 | 11 | jobs: 12 | rake: 13 | uses: metanorma/ci/.github/workflows/generic-rake.yml@main 14 | secrets: 15 | pat_token: ${{ secrets.METANORMA_CI_PAT_TOKEN }} 16 | -------------------------------------------------------------------------------- /bin/rspec: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | # This file was generated by Bundler. 4 | # 5 | # The application 'rspec' is installed as part of a gem, and 6 | # this file is here to facilitate running it. 7 | # 8 | 9 | require "pathname" 10 | ENV["BUNDLE_GEMFILE"] ||= File.expand_path( 11 | "../../Gemfile", Pathname.new(__FILE__).realpath 12 | ) 13 | 14 | require "rubygems" 15 | require "bundler/setup" 16 | 17 | load Gem.bin_path("rspec-core", "rspec") 18 | 19 | -------------------------------------------------------------------------------- /spec/spec_helper.rb: -------------------------------------------------------------------------------- 1 | require "simplecov" 2 | SimpleCov.start do 3 | add_filter "/spec/" 4 | end 5 | 6 | require "bundler/setup" 7 | require "rspec/match_fuzzy" 8 | require "html2doc" 9 | require "rspec/matchers" 10 | require "equivalent-xml" 11 | 12 | RSpec.configure do |config| 13 | # Enable flags like --only-failures and --next-failure 14 | config.example_status_persistence_file_path = ".rspec_status" 15 | 16 | # Disable RSpec exposing methods globally on `Module` and `main` 17 | config.disable_monkey_patching! 
18 | 19 | config.expect_with :rspec do |c| 20 | c.syntax = :expect 21 | end 22 | end 23 | -------------------------------------------------------------------------------- /bin/html2doc: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require "html2doc" 4 | require "optparse" 5 | 6 | options = {} 7 | OptionParser.new do |opts| 8 | opts.banner = "Usage: bin/html2doc filename [options]" 9 | 10 | opts.on("--stylesheet FILE.CSS", "Use the provided stylesheet") do |v| 11 | options[:stylesheet] = v 12 | end 13 | opts.on("--header HEADER.HTML", "Use the provided stylesheet") do |v| 14 | options[:header] = v 15 | end 16 | end.parse! 17 | 18 | if ARGV.length < 1 19 | puts "Usage: bin/html2doc filename [options]" 20 | exit 21 | end 22 | 23 | Html2Doc.process( 24 | filename: ARGV[0].gsub(/\.html?$/, ""), 25 | stylesheet: options[:stylesheet], 26 | header: options[:header], 27 | ).process(File.read(ARGV[0], encoding: "utf-8")) 28 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | # Auto-generated by Cimas: Do not edit it manually! 2 | # See https://github.com/metanorma/cimas 3 | name: release 4 | 5 | on: 6 | workflow_dispatch: 7 | inputs: 8 | next_version: 9 | description: | 10 | Next release version. Possible values: x.y.z, major, minor, patch (or pre|rc|etc). 
11 | Also, you can pass 'skip' to skip 'git tag' and do 'gem push' for the current version 12 | required: true 13 | default: 'skip' 14 | repository_dispatch: 15 | types: [ do-release ] 16 | 17 | jobs: 18 | release: 19 | uses: metanorma/ci/.github/workflows/rubygems-release.yml@main 20 | with: 21 | next_version: ${{ github.event.inputs.next_version }} 22 | secrets: 23 | rubygems-api-key: ${{ secrets.METANORMA_CI_RUBYGEMS_API_KEY }} 24 | pat_token: ${{ secrets.METANORMA_CI_PAT_TOKEN }} 25 | 26 | -------------------------------------------------------------------------------- /html2doc.gemspec: -------------------------------------------------------------------------------- 1 | lib = File.expand_path("lib", __dir__) 2 | $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib) 3 | require "html2doc/version" 4 | 5 | Gem::Specification.new do |spec| 6 | spec.name = "html2doc" 7 | spec.version = Html2Doc::VERSION 8 | spec.authors = ["Ribose Inc."] 9 | spec.email = ["open.source@ribose.com"] 10 | 11 | spec.summary = "Convert HTML document to Microsoft Word document" 12 | spec.description = <<~DESCRIPTION 13 | Convert HTML document to Microsoft Word document. 14 | 15 | This gem is in active development. 
16 | DESCRIPTION 17 | 18 | spec.homepage = "https://github.com/metanorma/html2doc" 19 | spec.licenses = ["CC-BY-SA-3.0", "BSD-2-Clause"] 20 | 21 | spec.bindir = "bin" 22 | spec.require_paths = ["lib"] 23 | spec.files = `git ls-files -z`.split("\x0").reject do |f| 24 | f.match(%r{^(test|spec|features|bin|.github)/}) \ 25 | || f.match(%r{Rakefile|bin/rspec}) 26 | end 27 | spec.required_ruby_version = Gem::Requirement.new(">= 2.7.0") 28 | 29 | spec.add_dependency "base64" 30 | spec.add_dependency "htmlentities", "~> 4.3.4" 31 | spec.add_dependency "lutaml-model", "~> 0.7.0" 32 | spec.add_dependency "metanorma-utils", ">= 1.9.0" 33 | spec.add_dependency "mime-types" 34 | spec.add_dependency "nokogiri", "~> 1.18.3" 35 | spec.add_dependency "plane1converter", "~> 0.0.1" 36 | spec.add_dependency "plurimath", "~> 0.9.0" 37 | spec.add_dependency "thread_safe" 38 | spec.add_dependency "uuidtools" 39 | spec.add_dependency "unitsml" 40 | spec.add_dependency "vectory", "~> 0.8" 41 | 42 | spec.add_development_dependency "debug" 43 | spec.add_development_dependency "equivalent-xml", "~> 0.6" 44 | spec.add_development_dependency "guard", "~> 2.14" 45 | spec.add_development_dependency "guard-rspec", "~> 4.7" 46 | spec.add_development_dependency "rake", "~> 12.0" 47 | spec.add_development_dependency "rspec", "~> 3.6" 48 | spec.add_development_dependency "rspec-match_fuzzy", "~> 0.2.0" 49 | spec.add_development_dependency "rubocop", "~> 1" 50 | spec.add_development_dependency "rubocop-performance" 51 | spec.add_development_dependency "simplecov", "~> 0.15" 52 | spec.add_development_dependency "timecop", "~> 0.9" 53 | end 54 | -------------------------------------------------------------------------------- /lib/html2doc/xml.rb: -------------------------------------------------------------------------------- 1 | class Html2Doc 2 | NOKOHEAD = <<~HERE.freeze 3 | 5 | 6 | 7 | 8 | HERE 9 | 10 | def to_xhtml(xml) 11 | xml.gsub!(/<\?xml[^<>]*>/, "") 12 | unless /' + xml 15 | end 16 | xml = 
xml.gsub(/") 17 | .gsub(//, "") 18 | Nokogiri::XML.parse(xml) 19 | end 20 | 21 | DOCTYPE = <<~DOCTYPE.freeze 22 | 23 | DOCTYPE 24 | 25 | def from_xhtml(xml) 26 | xml.to_xml.sub(%{ xmlns="http://www.w3.org/1999/xhtml"}, "") 27 | .sub(DOCTYPE, "").gsub(%{ />}, "/>") 28 | .gsub(//, "/, "") 30 | .gsub("\n-->\n", "\n-->\n") 31 | end 32 | 33 | def msword_fix(doc) 34 | # brain damage in MSWord parser 35 | doc.gsub!(%r{}, 36 | "") 37 | doc.gsub!(%r{}, 38 | '') 39 | doc.gsub!(%r{
}, 40 | '
') 41 | doc.gsub!(%r{(") 42 | doc.gsub!(%r{}, "/>") 46 | doc.gsub!(%r{>}, "/>") 47 | doc.gsub!(%r{>}, "/>") 48 | doc.gsub!(%r{>}, "/>") 49 | doc.gsub!(%r{>}, "/>") 50 | doc.gsub!(%r{>}, "/>") 51 | doc.gsub!(%r{>}, "/>") 52 | doc.gsub!(%r{<(/)?m:(span|em)\b}, "<\\1\\2") 53 | doc.gsub!(%r{&tab;|&tab;}, 54 | '  ') 55 | doc.split(%r{(|)}).each_slice(4).map do |a| 56 | a.size > 2 and a[2] = a[2].gsub(/>\s+<") 57 | a 58 | end.join 59 | end 60 | 61 | PRINT_VIEW = <<~XML.freeze 62 | 63 | 64 | 65 | Print 66 | 100 67 | 68 | 69 | 70 | 71 | XML 72 | 73 | def namespace(root) 74 | { o: "urn:schemas-microsoft-com:office:office", 75 | w: "urn:schemas-microsoft-com:office:word", 76 | v: "urn:schemas-microsoft-com:vml", 77 | m: "http://schemas.microsoft.com/office/2004/12/omml" }.each { |k, v| root.add_namespace_definition(k.to_s, v) } 78 | end 79 | 80 | def rootnamespace(root) 81 | root.add_namespace(nil, "http://www.w3.org/TR/REC-html40") 82 | end 83 | end 84 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | This software is dual-licensed: 2 | 3 | 1. Distributed under a Creative Commons Attribution-ShareAlike 3.0 4 | Unported License http://creativecommons.org/licenses/by-sa/3.0/ 5 | 6 | 2. http://www.opensource.org/licenses/BSD-2-Clause 7 | 8 | All rights reserved. 9 | 10 | Redistribution and use in source and binary forms, with or without 11 | modification, are permitted provided that the following conditions are met: 12 | 13 | * Redistributions of source code must retain the above copyright notice, this 14 | list of conditions and the following disclaimer. 15 | 16 | * Redistributions in binary form must reproduce the above copyright notice, 17 | this list of conditions and the following disclaimer in the documentation 18 | and/or other materials provided with the distribution. 
19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | 31 | 32 | 33 | 34 | LICENSE FOR STYLESHEETS DERIVED FROM https://github.com/TEIC/Stylesheets 35 | 36 | This software is dual-licensed: 37 | 38 | 1. Distributed under a Creative Commons Attribution-ShareAlike 3.0 39 | Unported License http://creativecommons.org/licenses/by-sa/3.0/ 40 | 41 | 2. http://www.opensource.org/licenses/BSD-2-Clause 42 | 43 | All rights reserved. 44 | 45 | Redistribution and use in source and binary forms, with or without 46 | modification, are permitted provided that the following conditions are 47 | met: 48 | 49 | * Redistributions of source code must retain the above copyright 50 | notice, this list of conditions and the following disclaimer. 51 | 52 | * Redistributions in binary form must reproduce the above copyright 53 | notice, this list of conditions and the following disclaimer in the 54 | documentation and/or other materials provided with the distribution. 55 | 56 | This software is provided by the copyright holders and contributors 57 | "as is" and any express or implied warranties, including, but not 58 | limited to, the implied warranties of merchantability and fitness for 59 | a particular purpose are disclaimed. 
In no event shall the copyright 60 | holder or contributors be liable for any direct, indirect, incidental, 61 | special, exemplary, or consequential damages (including, but not 62 | limited to, procurement of substitute goods or services; loss of use, 63 | data, or profits; or business interruption) however caused and on any 64 | theory of liability, whether in contract, strict liability, or tort 65 | (including negligence or otherwise) arising in any way out of the use 66 | of this software, even if advised of the possibility of such damage. 67 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, gender identity and expression, level of experience, 9 | nationality, personal appearance, race, religion, or sexual identity and 10 | orientation. 
11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. 
Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at ronald.tse@ribose.com. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at [http://contributor-covenant.org/version/1/4][version] 72 | 73 | [homepage]: http://contributor-covenant.org 74 | [version]: http://contributor-covenant.org/version/1/4/ 75 | -------------------------------------------------------------------------------- /lib/html2doc/notes.rb: -------------------------------------------------------------------------------- 1 | require "uuidtools" 2 | 3 | class Html2Doc 4 | def footnotes(docxml) 5 | #i = 1 6 | indexes = {} 7 | @footnote_idx = 1 8 | fn = [] 9 | docxml.xpath("//a").each do |a| 10 | process_footnote_link(docxml, a, indexes, fn) or next 11 | #i += 1 12 | end 13 | process_footnote_texts(docxml, fn, indexes) 14 | end 15 | 16 | # Currently cannot deal with separate footnote containers in each chapter 17 | # We may eventually need to support that 18 | def process_footnote_texts(docxml, footnotes, indexes) 19 | body = docxml.at("//body") 20 | list = body.add_child("
") 21 | footnotes.each do |f| 22 | #require 'debug'; binding.b 23 | fn = list.first.add_child(footnote_container(docxml, indexes[f["id"]])) 24 | f.parent = fn.first 25 | f["id"] = "" 26 | footnote_div_to_p(f) 27 | end 28 | footnote_cleanup(docxml) 29 | end 30 | 31 | def footnote_div_to_p(elem) 32 | if %w{div aside}.include? elem.name 33 | if elem.at(".//p") 34 | elem.replace(elem.children) 35 | else 36 | elem.name = "p" 37 | elem["class"] = "MsoFootnoteText" 38 | end 39 | end 40 | end 41 | 42 | FN = ""\ 43 | "".freeze 44 | 45 | def footnote_container(docxml, idx) 46 | ref = docxml&.at("//a[@href='#_ftn#{idx}']")&.children&.to_xml(indent: 0) 47 | &.gsub(/>\n<") || FN 48 | <<~DIV 49 | 52 | DIV 53 | end 54 | 55 | def process_footnote_link(docxml, elem, indexes, footnote) 56 | footnote?(elem) or return false 57 | href = elem["href"].gsub(/^#/, "") 58 | #require "debug"; binding.b 59 | note = docxml.at("//*[@name = '#{href}' or @id = '#{href}']") 60 | note.nil? and return false 61 | unless indexes[href] 62 | indexes[href] = @footnote_idx 63 | @footnote_idx += 1 64 | end 65 | set_footnote_link_attrs(elem, indexes[href]) 66 | if elem.at("./span[@class = 'MsoFootnoteReference']") 67 | process_footnote_link1(elem) 68 | else elem.children = FN 69 | end 70 | footnote << transform_footnote_text(note) 71 | end 72 | 73 | def process_footnote_link1(elem) 74 | elem.children.each do |c| 75 | if c.name == "span" && c["class"] == "MsoFootnoteReference" 76 | c.replace(FN) 77 | else 78 | c.wrap("") 79 | end 80 | end 81 | end 82 | 83 | def transform_footnote_text(note) 84 | #note["id"] = "" 85 | note.xpath(".//div").each { |div| div.replace(div.children) } 86 | note.xpath(".//aside | .//p").each do |p| 87 | p.name = "p" 88 | p["class"] = "MsoFootnoteText" 89 | end 90 | note.remove 91 | end 92 | 93 | def footnote?(elem) 94 | elem["epub:type"]&.casecmp("footnote")&.zero? || 95 | elem["class"]&.casecmp("footnote")&.zero? 
96 | end 97 | 98 | def set_footnote_link_attrs(elem, idx) 99 | elem["style"] = "mso-footnote-id:ftn#{idx}" 100 | elem["href"] = "#_ftn#{idx}" 101 | elem["name"] = "_ftnref#{idx}" 102 | elem["title"] = "" 103 | end 104 | 105 | # We expect that the content of the footnote text received is one or 106 | # more text containers, p or aside or div (which we have already converted 107 | # to p). We do not expect any or links back to text; if they 108 | # are present in the HTML, they need to have been cleaned out before 109 | # passing to this gem 110 | def footnote_cleanup(docxml) 111 | docxml.xpath('//div[@style="mso-element:footnote"]/a') 112 | .each do |x| 113 | n = x.next_element 114 | n&.children&.first&.add_previous_sibling(x.remove) 115 | end 116 | docxml 117 | end 118 | end 119 | -------------------------------------------------------------------------------- /lib/html2doc/base.rb: -------------------------------------------------------------------------------- 1 | require "uuidtools" 2 | require "htmlentities" 3 | require "nokogiri" 4 | require "fileutils" 5 | 6 | class Html2Doc 7 | def initialize(hash) 8 | @filename = hash[:filename] 9 | @dir = hash[:dir] 10 | @dir1 = create_dir(@filename, @dir) 11 | @header_file = hash[:header_file] 12 | @asciimathdelims = hash[:asciimathdelims] 13 | @imagedir = hash[:imagedir] 14 | @debug = hash[:debug] 15 | @liststyles = hash[:liststyles] 16 | @stylesheet = read_stylesheet(hash[:stylesheet]) 17 | @c = HTMLEntities.new 18 | end 19 | 20 | def process(result) 21 | result = process_html(result) 22 | process_header(@header_file) 23 | generate_filelist(@filename, @dir1) 24 | File.open("#{@filename}.htm", "w:UTF-8") { |f| f.write(result) } 25 | mime_package result, @filename, @dir1 26 | rm_temp_files(@filename, @dir, @dir1) unless @debug 27 | end 28 | 29 | def process_header(headerfile) 30 | headerfile.nil? 
and return 31 | doc = File.read(headerfile, encoding: "utf-8") 32 | doc = header_image_cleanup(doc, @dir1, @filename, 33 | File.dirname(@filename)) 34 | File.open("#{@dir1}/header.html", "w:UTF-8") { |f| f.write(doc) } 35 | end 36 | 37 | def clear_dir(dir) 38 | Dir.foreach(dir) do |f| 39 | fn = File.join(dir, f) 40 | File.delete(fn) if f != "." && f != ".." 41 | end 42 | dir 43 | end 44 | 45 | def create_dir(filename, dir) 46 | dir and return clear_dir(dir) 47 | dir = "#{filename}_files" 48 | FileUtils.mkdir_p(dir) 49 | clear_dir(dir) 50 | end 51 | 52 | def process_html(result) 53 | docxml = to_xhtml(result) 54 | define_head(cleanup(docxml)) 55 | msword_fix(from_xhtml(docxml)) 56 | end 57 | 58 | def rm_temp_files(filename, dir, dir1) 59 | FileUtils.rm "#{filename}.htm" 60 | FileUtils.rm_f "#{dir1}/header.html" 61 | FileUtils.rm_r dir1 unless dir 62 | end 63 | 64 | def cleanup(docxml) 65 | locate_landscape(docxml) 66 | namespace(docxml.root) 67 | image_cleanup(docxml, @dir1, @imagedir) 68 | mathml_to_ooml(docxml) 69 | lists(docxml, @liststyles) 70 | footnotes(docxml) 71 | bookmarks(docxml) 72 | msonormal(docxml) 73 | docxml 74 | end 75 | 76 | def locate_landscape(_docxml) 77 | @landscape = @stylesheet.scan(/div\.\S+\s+\{\s*page:\s*[^;]+L;\s*\}/m) 78 | .map { |e| e.sub(/^div\.(\S+).*$/m, "\\1") } 79 | end 80 | 81 | def define_head1(docxml, _dir) 82 | docxml.xpath("//*[local-name() = 'head']").each do |h| 83 | h.children.first.add_previous_sibling <<~XML 84 | #{PRINT_VIEW} 85 | 86 | XML 87 | end 88 | end 89 | 90 | def filename_substitute(head, header_filename) 91 | return if header_filename.nil? 92 | 93 | head.xpath(".//*[local-name() = 'style']").each do |s| 94 | s1 = s.to_xml.gsub(/url\("[^"]+"\)/) do |m| 95 | /FILENAME/.match?(m) ? 
"url(cid:header.html)" : m 96 | end 97 | s.replace(s1) 98 | end 99 | end 100 | 101 | def stylesheet(_filename, _header_filename, _cssname) 102 | stylesheet = "#{@stylesheet}\n#{@newliststyledefs}" 103 | xml = Nokogiri::XML("") and xml_found = false 179 | /^\s*@page\s+#{klass}/.match?(l) and found = true 180 | found && /^\s*\{?size:/.match?(l) and ret += l 181 | found && /^\s*\{?margin:/.match?(l) and ret += l 182 | if found && /}/.match?(l) 183 | !ret.blank? && (!in_xml || xml_found) and return ret 184 | ret = "" 185 | found = false 186 | end 187 | end 188 | nil 189 | end 190 | 191 | def units_to_px(measure) 192 | m = /^(\S+)(pt|cm)/.match(measure) 193 | ret = case m[2] 194 | when "px" then (m[1].to_f * 0.75) 195 | when "pt" then m[1].to_f 196 | when "cm" then (m[1].to_f * 28.346456693) 197 | when "in" then (m[1].to_f * 72) 198 | end 199 | ret.to_i 200 | end 201 | 202 | # do not parse the header through Nokogiri, since it will contain 203 | # non-XML like 204 | def header_image_cleanup(doc, dir, filename, localdir) 205 | doc.split(%r{(]*>|]*>)}).each_slice(2).map do |a| 206 | header_image_cleanup1(a, dir, filename, localdir) 207 | end.join 208 | end 209 | 210 | def header_image_cleanup1(a, dir, _filename, localdir) 211 | if a.size == 2 && !(/ src="https?:/.match a[1]) && 212 | !(%r{ src="data:(image|application)/[^;]+;base64}.match a[1]) 213 | m = / src=['"](?[^"']+)['"]/.match a[1] 214 | m2 = /\.(?[a-zA-Z_0-9]+)$/.match m[:src] 215 | new_filename = "#{mkuuid}.#{m2[:suffix]}" 216 | FileUtils.cp localname(m[:src], localdir), File.join(dir, new_filename) 217 | a[1].sub!(%r{ src=['"](?[^"']+)['"]}, " src='cid:#{new_filename}'") 218 | end 219 | a.join 220 | end 221 | 222 | def generate_filelist(filename, dir) 223 | File.open(File.join(dir, "filelist.xml"), "w") do |f| 224 | f.write %{ 225 | } 226 | Dir.entries(dir).sort.each do |item| 227 | (item == "." || item == ".." 
|| /^\./.match(item)) and next 228 | f.write %{ \n} 229 | end 230 | f.write("\n") 231 | end 232 | end 233 | end 234 | -------------------------------------------------------------------------------- /spec/header.html: -------------------------------------------------------------------------------- 1 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 18 | 19 | 20 | 21 | 22 |
23 | 24 |

26 | 27 |


28 | 29 |

30 | 31 |
32 | 33 |
34 | 35 |

37 | 38 |


39 | 40 |

41 | 42 |
43 | 44 |
45 | 46 |

48 | 49 |


50 | 51 |

52 | 53 |
54 | 55 |
56 | 57 |

59 | 60 |


61 | 62 |

63 | 64 |
65 | 66 |
67 | 68 |

ISO/IEC CD 17301-1:2016(E)

70 | 71 |
72 | 73 |
74 | 75 |

© 77 | ISO/IEC 2016 – All rights reserved

79 | 80 |
81 | 82 |
83 | 84 |

2                                                                                                                                                                           © 96 | ISO/IEC 2016 – All rights reserved

97 | 98 |
99 | 100 |
101 | 102 |

ISO/IEC CD 17301-1:2016(E)

104 | 105 |
106 | 107 |
108 | 109 |

ISO/IEC CD 17301-1:2016(E)

111 | 112 |
113 | 114 |
115 | 116 |

ii                                                                                                                                                                           © 127 | ISO/IEC 2016 – All rights reserved

128 | 129 |
130 | 131 |
132 | 133 |

© ISO/IEC 2016 – All 135 | rights reserved                                                                                                                                                                          iii

144 | 145 |
146 | 147 |
148 | 149 |

2                                                                                                                                                                           © 161 | ISO/IEC 2016 – All rights reserved

162 | 163 |
164 | 165 |
166 | 167 |

© ISO/IEC 2016 – All 169 | rights reserved                                                                                                                                                                           3

179 | 180 |
181 | 182 | 183 | 184 | 185 | -------------------------------------------------------------------------------- /spec/examples/header.html: -------------------------------------------------------------------------------- 1 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 18 | 19 | 20 | 21 | 22 |
23 | 24 |

26 | 27 |


28 | 29 |

30 | 31 |
32 | 33 |
34 | 35 |

37 | 38 |


39 | 40 |

41 | 42 |
43 | 44 |
45 | 46 |

48 | 49 |


50 | 51 |

52 | 53 |
54 | 55 |
56 | 57 |

59 | 60 |


61 | 62 |

63 | 64 |
65 | 66 |
67 | 68 |

ISO/IEC CD 17301-1:2016(E)

70 | 71 |
72 | 73 |
74 | 75 |

© 77 | ISO/IEC 2016 – All rights reserved

79 | 80 |
81 | 82 |
83 | 84 |

2                                                                                                                                                                           © 96 | ISO/IEC 2016 – All rights reserved

97 | 98 |
99 | 100 |
101 | 102 |

ISO/IEC CD 17301-1:2016(E)

104 | 105 |
106 | 107 |
108 | 109 |

ISO/IEC CD 17301-1:2016(E)

111 | 112 |
113 | 114 |
115 | 116 |

ii                                                                                                                                                                           © 127 | ISO/IEC 2016 – All rights reserved

128 | 129 |
130 | 131 |
132 | 133 |

© ISO/IEC 2016 – All 135 | rights reserved                                                                                                                                                                          iii

144 | 145 |
146 | 147 |
148 | 149 |

2                                                                                                                                                                           © 161 | ISO/IEC 2016 – All rights reserved

162 | 163 |
164 | 165 |
166 | 167 |

© ISO/IEC 2016 – All 169 | rights reserved                                                                                                                                                                           3

179 | 180 |
181 | 182 | 183 | 184 | 185 | -------------------------------------------------------------------------------- /spec/header_img.html: -------------------------------------------------------------------------------- 1 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 18 | 19 | 20 | 21 | 22 |
23 | 24 |

26 | 27 |


28 | 29 |

30 | 31 |
32 | 33 |
34 | 35 |

37 | 38 |


39 | 40 |

41 | 42 |
43 | 44 |
45 | 46 |

48 | 49 |


50 | 51 |

52 | 53 |
54 | 55 |
56 | 57 |

59 | 60 |


61 | 62 |

63 | 64 |
65 | 66 |
67 | 68 |

ISO/IEC CD 17301-1:2016(E)

70 | 71 | 72 | 73 |
74 | 75 |
76 | 77 |

© 79 | ISO/IEC 2016 – All rights reserved

81 | 82 |
83 | 84 |
85 | 86 |

2                                                                                                                                                                           © 98 | ISO/IEC 2016 – All rights reserved

99 | 100 |
101 | 102 |
103 | 104 |

ISO/IEC CD 17301-1:2016(E)

106 | 107 |
108 | 109 |
110 | 111 |

ISO/IEC CD 17301-1:2016(E)

113 | 114 |
115 | 116 |
117 | 118 |

ii                                                                                                                                                                           © 129 | ISO/IEC 2016 – All rights reserved

130 | 131 |
132 | 133 |
134 | 135 |

© ISO/IEC 2016 – All 137 | rights reserved                                                                                                                                                                          iii

146 | 147 |
148 | 149 |
150 | 151 |

2                                                                                                                                                                           © 163 | ISO/IEC 2016 – All rights reserved

164 | 165 |
166 | 167 |
168 | 169 |

© ISO/IEC 2016 – All 171 | rights reserved                                                                                                                                                                           3

181 | 182 |
183 | 184 | 185 | 186 | 187 | -------------------------------------------------------------------------------- /spec/wordstyle-custom-lists.css: -------------------------------------------------------------------------------- 1 | @list l2 2 | {mso-list-id:_; 3 | mso-list-template-ids:_;} 4 | @list l2:level1 5 | {mso-level-style-link:"Heading 1"; 6 | mso-level-text:%1; 7 | mso-level-tab-stop:21.6pt; 8 | mso-level-number-position:left; 9 | margin-left:21.6pt; 10 | text-indent:-21.6pt; 11 | mso-bidi-font-family:"Cambria"; 12 | mso-ansi-font-weight:bold; 13 | mso-ansi-font-style:normal;} 14 | @list l2:level2 15 | {mso-level-start-at:3; 16 | mso-level-style-link:"Heading 2"; 17 | mso-level-text:"%1\.%2"; 18 | mso-level-tab-stop:18.0pt; 19 | mso-level-number-position:left; 20 | margin-left:0cm; 21 | text-indent:0cm; 22 | mso-bidi-font-family:"Cambria"; 23 | mso-ansi-font-weight:bold; 24 | mso-ansi-font-style:normal;} 25 | @list l2:level3 26 | {mso-level-style-link:"Heading 3"; 27 | mso-level-text:"%1\.%2\.%3"; 28 | mso-level-tab-stop:36.0pt; 29 | mso-level-number-position:left; 30 | margin-left:0cm; 31 | text-indent:0cm; 32 | mso-bidi-font-family:"Cambria"; 33 | mso-ansi-font-weight:bold; 34 | mso-ansi-font-style:normal;} 35 | @list l2:level4 36 | {mso-level-style-link:"Heading 4"; 37 | mso-level-text:"%1\.%2\.%3\.%4"; 38 | mso-level-tab-stop:54.0pt; 39 | mso-level-number-position:left; 40 | margin-left:0cm; 41 | text-indent:0cm; 42 | mso-bidi-font-family:"Cambria"; 43 | mso-ansi-font-weight:bold; 44 | mso-ansi-font-style:normal;} 45 | @list l2:level5 46 | {mso-level-style-link:"Heading 5"; 47 | mso-level-text:"%1\.%2\.%3\.%4\.%5"; 48 | mso-level-tab-stop:54.0pt; 49 | mso-level-number-position:left; 50 | margin-left:0cm; 51 | text-indent:0cm; 52 | mso-bidi-font-family:"Cambria"; 53 | mso-ansi-font-weight:bold; 54 | mso-ansi-font-style:normal;} 55 | @list l2:level6 56 | {mso-level-style-link:"Heading 6"; 57 | mso-level-text:"%1\.%2\.%3\.%4\.%5\.%6"; 58 | 
mso-level-tab-stop:72.0pt; 59 | mso-level-number-position:left; 60 | margin-left:0cm; 61 | text-indent:0cm; 62 | mso-bidi-font-family:"Cambria"; 63 | mso-ansi-font-weight:bold; 64 | mso-ansi-font-style:normal;} 65 | @list l2:level7 66 | {mso-level-text:"%1\.%2\.%3\.%4\.%5\.%6\.%7"; 67 | mso-level-tab-stop:72.0pt; 68 | mso-level-number-position:left; 69 | margin-left:0cm; 70 | text-indent:0cm; 71 | mso-bidi-font-family:"Cambria";} 72 | @list l2:level8 73 | {mso-level-text:"%1\.%2\.%3\.%4\.%5\.%6\.%7\.%8"; 74 | mso-level-tab-stop:90.0pt; 75 | mso-level-number-position:left; 76 | margin-left:0cm; 77 | text-indent:0cm; 78 | mso-bidi-font-family:"Cambria";} 79 | @list l2:level9 80 | {mso-level-text:"%1\.%2\.%3\.%4\.%5\.%6\.%7\.%8\.%9"; 81 | mso-level-tab-stop:90.0pt; 82 | mso-level-number-position:left; 83 | margin-left:0cm; 84 | text-indent:0cm; 85 | mso-bidi-font-family:"Cambria";} 86 | @list l3 87 | {mso-list-id:_; 88 | mso-list-template-ids:_;} 89 | @list l3:level1 90 | {mso-level-style-link:"Heading 1"; 91 | mso-level-text:%1; 92 | mso-level-tab-stop:21.6pt; 93 | mso-level-number-position:left; 94 | margin-left:21.6pt; 95 | text-indent:-21.6pt; 96 | mso-bidi-font-family:"Cambria"; 97 | mso-ansi-font-weight:bold; 98 | mso-ansi-font-style:normal;} 99 | @list l3:level2 100 | {mso-level-style-link:"Heading 2"; 101 | mso-level-text:"%1\.%2"; 102 | mso-level-tab-stop:18.0pt; 103 | mso-level-number-position:left; 104 | margin-left:0cm; 105 | text-indent:0cm; 106 | mso-bidi-font-family:"Cambria"; 107 | mso-ansi-font-weight:bold; 108 | mso-ansi-font-style:normal;} 109 | @list l3:level3 110 | {mso-level-style-link:"Heading 3"; 111 | mso-level-text:"%1\.%2\.%3"; 112 | mso-level-tab-stop:36.0pt; 113 | mso-level-number-position:left; 114 | margin-left:0cm; 115 | text-indent:0cm; 116 | mso-bidi-font-family:"Cambria"; 117 | mso-ansi-font-weight:bold; 118 | mso-ansi-font-style:normal;} 119 | @list l3:level4 120 | {mso-level-start-at:5; 121 | mso-level-style-link:"Heading 4"; 122 | 
mso-level-text:"%1\.%2\.%3\.%4"; 123 | mso-level-tab-stop:54.0pt; 124 | mso-level-number-position:left; 125 | margin-left:0cm; 126 | text-indent:0cm; 127 | mso-bidi-font-family:"Cambria"; 128 | mso-ansi-font-weight:bold; 129 | mso-ansi-font-style:normal;} 130 | @list l3:level5 131 | {mso-level-style-link:"Heading 5"; 132 | mso-level-text:"%1\.%2\.%3\.%4\.%5"; 133 | mso-level-tab-stop:54.0pt; 134 | mso-level-number-position:left; 135 | margin-left:0cm; 136 | text-indent:0cm; 137 | mso-bidi-font-family:"Cambria"; 138 | mso-ansi-font-weight:bold; 139 | mso-ansi-font-style:normal;} 140 | @list l3:level6 141 | {mso-level-style-link:"Heading 6"; 142 | mso-level-text:"%1\.%2\.%3\.%4\.%5\.%6"; 143 | mso-level-tab-stop:72.0pt; 144 | mso-level-number-position:left; 145 | margin-left:0cm; 146 | text-indent:0cm; 147 | mso-bidi-font-family:"Cambria"; 148 | mso-ansi-font-weight:bold; 149 | mso-ansi-font-style:normal;} 150 | @list l3:level7 151 | {mso-level-text:"%1\.%2\.%3\.%4\.%5\.%6\.%7"; 152 | mso-level-tab-stop:72.0pt; 153 | mso-level-number-position:left; 154 | margin-left:0cm; 155 | text-indent:0cm; 156 | mso-bidi-font-family:"Cambria";} 157 | @list l3:level8 158 | {mso-level-text:"%1\.%2\.%3\.%4\.%5\.%6\.%7\.%8"; 159 | mso-level-tab-stop:90.0pt; 160 | mso-level-number-position:left; 161 | margin-left:0cm; 162 | text-indent:0cm; 163 | mso-bidi-font-family:"Cambria";} 164 | @list l3:level9 165 | {mso-level-text:"%1\.%2\.%3\.%4\.%5\.%6\.%7\.%8\.%9"; 166 | mso-level-tab-stop:90.0pt; 167 | mso-level-number-position:left; 168 | margin-left:0cm; 169 | text-indent:0cm; 170 | mso-bidi-font-family:"Cambria";} 171 | @list l4 172 | {mso-list-id:_; 173 | mso-list-template-ids:_;} 174 | @list l4:level1 175 | {mso-level-style-link:"Heading 1"; 176 | mso-level-text:%1; 177 | mso-level-tab-stop:21.6pt; 178 | mso-level-number-position:left; 179 | margin-left:21.6pt; 180 | text-indent:-21.6pt; 181 | mso-bidi-font-family:"Cambria"; 182 | mso-ansi-font-weight:bold; 183 | 
mso-ansi-font-style:normal;} 184 | @list l4:level2 185 | {mso-level-style-link:"Heading 2"; 186 | mso-level-text:"%1\.%2"; 187 | mso-level-tab-stop:18.0pt; 188 | mso-level-number-position:left; 189 | margin-left:0cm; 190 | text-indent:0cm; 191 | mso-bidi-font-family:"Cambria"; 192 | mso-ansi-font-weight:bold; 193 | mso-ansi-font-style:normal;} 194 | @list l4:level3 195 | {mso-level-style-link:"Heading 3"; 196 | mso-level-text:"%1\.%2\.%3"; 197 | mso-level-tab-stop:36.0pt; 198 | mso-level-number-position:left; 199 | margin-left:0cm; 200 | text-indent:0cm; 201 | mso-bidi-font-family:"Cambria"; 202 | mso-ansi-font-weight:bold; 203 | mso-ansi-font-style:normal;} 204 | @list l4:level4 205 | {mso-level-style-link:"Heading 4"; 206 | mso-level-text:"%1\.%2\.%3\.%4"; 207 | mso-level-tab-stop:54.0pt; 208 | mso-level-number-position:left; 209 | margin-left:0cm; 210 | text-indent:0cm; 211 | mso-bidi-font-family:"Cambria"; 212 | mso-ansi-font-weight:bold; 213 | mso-ansi-font-style:normal;} 214 | @list l4:level5 215 | {mso-level-start-at:7; 216 | mso-level-style-link:"Heading 5"; 217 | mso-level-text:"%1\.%2\.%3\.%4\.%5"; 218 | mso-level-tab-stop:54.0pt; 219 | mso-level-number-position:left; 220 | margin-left:0cm; 221 | text-indent:0cm; 222 | mso-bidi-font-family:"Cambria"; 223 | mso-ansi-font-weight:bold; 224 | mso-ansi-font-style:normal;} 225 | @list l4:level6 226 | {mso-level-style-link:"Heading 6"; 227 | mso-level-text:"%1\.%2\.%3\.%4\.%5\.%6"; 228 | mso-level-tab-stop:72.0pt; 229 | mso-level-number-position:left; 230 | margin-left:0cm; 231 | text-indent:0cm; 232 | mso-bidi-font-family:"Cambria"; 233 | mso-ansi-font-weight:bold; 234 | mso-ansi-font-style:normal;} 235 | @list l4:level7 236 | {mso-level-text:"%1\.%2\.%3\.%4\.%5\.%6\.%7"; 237 | mso-level-tab-stop:72.0pt; 238 | mso-level-number-position:left; 239 | margin-left:0cm; 240 | text-indent:0cm; 241 | mso-bidi-font-family:"Cambria";} 242 | @list l4:level8 243 | {mso-level-text:"%1\.%2\.%3\.%4\.%5\.%6\.%7\.%8"; 244 | 
mso-level-tab-stop:90.0pt; 245 | mso-level-number-position:left; 246 | margin-left:0cm; 247 | text-indent:0cm; 248 | mso-bidi-font-family:"Cambria";} 249 | @list l4:level9 250 | {mso-level-text:"%1\.%2\.%3\.%4\.%5\.%6\.%7\.%8\.%9"; 251 | mso-level-tab-stop:90.0pt; 252 | mso-level-number-position:left; 253 | margin-left:0cm; 254 | text-indent:0cm; 255 | mso-bidi-font-family:"Cambria";} 256 | @list l5 257 | {mso-list-id:_; 258 | mso-list-template-ids:_;} 259 | @list l5:level1 260 | {mso-level-start-at:2; 261 | mso-level-style-link:"Heading 1"; 262 | mso-level-text:%1; 263 | mso-level-tab-stop:21.6pt; 264 | mso-level-number-position:left; 265 | margin-left:21.6pt; 266 | text-indent:-21.6pt; 267 | mso-bidi-font-family:"Cambria"; 268 | mso-ansi-font-weight:bold; 269 | mso-ansi-font-style:normal;} 270 | @list l5:level2 271 | {mso-level-style-link:"Heading 2"; 272 | mso-level-text:"%1\.%2"; 273 | mso-level-tab-stop:18.0pt; 274 | mso-level-number-position:left; 275 | margin-left:0cm; 276 | text-indent:0cm; 277 | mso-bidi-font-family:"Cambria"; 278 | mso-ansi-font-weight:bold; 279 | mso-ansi-font-style:normal;} 280 | @list l5:level3 281 | {mso-level-style-link:"Heading 3"; 282 | mso-level-text:"%1\.%2\.%3"; 283 | mso-level-tab-stop:36.0pt; 284 | mso-level-number-position:left; 285 | margin-left:0cm; 286 | text-indent:0cm; 287 | mso-bidi-font-family:"Cambria"; 288 | mso-ansi-font-weight:bold; 289 | mso-ansi-font-style:normal;} 290 | @list l5:level4 291 | {mso-level-style-link:"Heading 4"; 292 | mso-level-text:"%1\.%2\.%3\.%4"; 293 | mso-level-tab-stop:54.0pt; 294 | mso-level-number-position:left; 295 | margin-left:0cm; 296 | text-indent:0cm; 297 | mso-bidi-font-family:"Cambria"; 298 | mso-ansi-font-weight:bold; 299 | mso-ansi-font-style:normal;} 300 | @list l5:level5 301 | {mso-level-style-link:"Heading 5"; 302 | mso-level-text:"%1\.%2\.%3\.%4\.%5"; 303 | mso-level-tab-stop:54.0pt; 304 | mso-level-number-position:left; 305 | margin-left:0cm; 306 | text-indent:0cm; 307 | 
mso-bidi-font-family:"Cambria"; 308 | mso-ansi-font-weight:bold; 309 | mso-ansi-font-style:normal;} 310 | @list l5:level6 311 | {mso-level-style-link:"Heading 6"; 312 | mso-level-text:"%1\.%2\.%3\.%4\.%5\.%6"; 313 | mso-level-tab-stop:72.0pt; 314 | mso-level-number-position:left; 315 | margin-left:0cm; 316 | text-indent:0cm; 317 | mso-bidi-font-family:"Cambria"; 318 | mso-ansi-font-weight:bold; 319 | mso-ansi-font-style:normal;} 320 | @list l5:level7 321 | {mso-level-text:"%1\.%2\.%3\.%4\.%5\.%6\.%7"; 322 | mso-level-tab-stop:72.0pt; 323 | mso-level-number-position:left; 324 | margin-left:0cm; 325 | text-indent:0cm; 326 | mso-bidi-font-family:"Cambria";} 327 | @list l5:level8 328 | {mso-level-text:"%1\.%2\.%3\.%4\.%5\.%6\.%7\.%8"; 329 | mso-level-tab-stop:90.0pt; 330 | mso-level-number-position:left; 331 | margin-left:0cm; 332 | text-indent:0cm; 333 | mso-bidi-font-family:"Cambria";} 334 | @list l5:level9 335 | {mso-level-text:"%1\.%2\.%3\.%4\.%5\.%6\.%7\.%8\.%9"; 336 | mso-level-tab-stop:90.0pt; 337 | mso-level-number-position:left; 338 | margin-left:0cm; 339 | text-indent:0cm; 340 | mso-bidi-font-family:"Cambria";} 341 | -------------------------------------------------------------------------------- /README.adoc: -------------------------------------------------------------------------------- 1 | = Html2Doc 2 | 3 | https://github.com/metanorma/html2doc/workflows/main/badge.svg 4 | 5 | image:https://img.shields.io/gem/v/html2doc.svg["Gem Version", link="https://rubygems.org/gems/html2doc"] 6 | image:https://github.com/metanorma/html2doc/workflows/rake/badge.svg["Build Status", link="https://github.com/metanorma/html2doc/actions?workflow=rake"] 7 | // image:https://codeclimate.com/github/metanorma/html2doc/badges/gpa.svg["Code Climate", link="https://codeclimate.com/github/metanorma/html2doc"] 8 | image:https://img.shields.io/github/issues-pr-raw/metanorma/html2doc.svg["Pull Requests", link="https://github.com/metanorma/html2doc/pulls"] 9 | 
image:https://img.shields.io/github/commits-since/metanorma/html2doc/latest.svg["Commits since latest",link="https://github.com/metanorma/html2doc/releases"] 10 | 11 | == Purpose 12 | 13 | Gem to convert an HTML document into Word document (.doc) format. This is intended for automated generation of Microsoft Word documents, given HTML documents, which are much more readily crafted. 14 | 15 | == Origin 16 | 17 | This gem originated out of https://github.com/metanorma/metanorma-iso, which creates a Word document from an automatically generated HTML document (created in turn by processing Asciidoc). 18 | 19 | This work is driven by the Word document generation procedure documented in http://sebsauvage.net/wiki/doku.php?id=word_document_generation. For more on the approach taken, and on alternative approaches, see https://github.com/metanorma/html2doc/wiki/Why-not-docx%3F 20 | 21 | == Functions 22 | 23 | The gem currently does the following: 24 | 25 | * Convert any AsciiMath and MathML to Word's native mathematical formatting language, OOXML. Word supports copy-pasting MathML into Word and converting it into OOXML; however the conversion is not infallible (we have in the past found problems with `\sum`: Word claimed parameters were missing, and inserted dotted squares to indicate as much), and you may need to post-edit the OOXML. 26 | ** The gem does attempt to repair the MathML input, to bring it in line with Word's OOXML's expectations. If you find any issues with AsciiMath or MathML input, please raise an issue. 27 | * Identify any footnotes in the document (defined as hyperlinks with attributes `class = "Footnote"` or `epub:type = "footnote"`), and render them as Microsoft Word footnotes. 28 | ** The corresponding footnote content is any `div` or `aside` element with the same `@id` attribute as the footnote points to; e.g. `
3
`, pointing to `