├── .gitignore ├── .rspec ├── .travis.yml ├── Gemfile ├── Guardfile ├── LICENSE.txt ├── README.md ├── Rakefile ├── bin ├── console └── setup ├── circle.yml ├── googledoc_markdown.gemspec ├── lib ├── googledoc_markdown.rb └── googledoc_markdown │ ├── converter.rb │ ├── helpers.rb │ └── version.rb └── spec ├── fixtures ├── headings │ ├── input.html │ ├── output.html │ └── output.md ├── linked_space │ ├── input.html │ ├── output.html │ └── output.md └── simple │ ├── input.html │ ├── output.html │ └── output.md ├── googledoc_markdown └── converter_spec.rb ├── googledoc_markdown_helpers_spec.rb ├── googledoc_markdown_spec.rb └── spec_helper.rb /.gitignore: -------------------------------------------------------------------------------- 1 | .env 2 | .DS_Store 3 | /.bundle/ 4 | /.yardoc 5 | /Gemfile.lock 6 | /_yardoc/ 7 | /coverage/ 8 | /doc/ 9 | /pkg/ 10 | /spec/reports/ 11 | /tmp/ 12 | -------------------------------------------------------------------------------- /.rspec: -------------------------------------------------------------------------------- 1 | --format documentation 2 | --color 3 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: ruby 2 | rvm: 3 | - 2.3.0 4 | - 2.2 5 | - 2.1 6 | - 2.0 7 | before_install: gem install bundler -v 1.10.6 8 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | 3 | gemspec 4 | -------------------------------------------------------------------------------- /Guardfile: -------------------------------------------------------------------------------- 1 | guard :rspec, cmd: 'bundle exec rspec' do 2 | watch(%r{^spec/.+_spec\.rb$}) 3 | watch(%r{^lib/(.+)\.rb$}) { |m| "spec/#{m[1]}_spec.rb" } 4 | watch('spec/spec_helper.rb') { "spec" } 5 | 
end 6 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Ivar Vong 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # googledoc_markdown 2 | 3 | [![Circle CI](https://circleci.com/gh/ivarvong/googledoc_markdown.svg?style=svg)](https://circleci.com/gh/ivarvong/googledoc_markdown) 4 | [![Build Status](https://travis-ci.org/ivarvong/googledoc_markdown.svg?branch=master)](https://travis-ci.org/ivarvong/googledoc_markdown) 5 | [![Dependency Status](https://gemnasium.com/ivarvong/googledoc_markdown.svg)](https://gemnasium.com/ivarvong/googledoc_markdown) 6 | [![Code Climate](https://codeclimate.com/github/ivarvong/googledoc_markdown/badges/gpa.svg)](https://codeclimate.com/github/ivarvong/googledoc_markdown) 7 | 8 | ## Why? 9 | 10 | At [The Marshall Project](https://www.themarshallproject.org/), stories are edited in Google Docs. I wrote a quick tool to convert the HTML export from a Google Doc to Markdown. (Internally, our stories are stored as Markdown). Turns out, parsing CSS with regexes is not a great idea. This gem is the next iteration. 11 | 12 | Here's the strategy: 13 | 14 | 1. Inline the CSS for `font-weight: bold;` and `font-style: italic;` based on the `.c01` (etc) classes with the `roadie` gem. 15 | 2. Parse the inline styles into a hash of CSS properties with the `css_parser` gem. 16 | 3. Wrap the `<span>` with either a `<strong>` or `<em>` based on the CSS properties on it. A single `<span>` may get wrapped multiple times if the text is both bold and italic, for example. Then remove all the `<span>`s. 17 | 4. Pass this cleaned HTML to `kramdown` to yield markdown. 18 | 19 | ## Installation 20 | 21 | Add this line to your application's Gemfile: 22 | 23 | ```ruby 24 | gem 'googledoc_markdown', github: 'ivarvong/googledoc_markdown', tag: 'v0.1.1' 25 | ``` 26 | 27 | And then execute: 28 | 29 | $ bundle 30 | 31 | ## Usage 32 | 33 | This gem is not stable and probably shouldn't be used yet. 
[The spec](https://github.com/ivarvong/googledoc_markdown/blob/master/spec/googledoc_markdown/converter_spec.rb) might be useful. 34 | 35 | ```ruby 36 | require 'googledoc_markdown' 37 | 38 | converter = GoogledocMarkdown::Converter.new(html: your_google_doc_html) 39 | markdown = converter.to_markdown 40 | ``` 41 | 42 | ## Development 43 | 44 | After checking out the repo, run `bin/setup` to install dependencies. Then, run `guard` to run the tests. 45 | 46 | ## Contributing 47 | 48 | Bug reports and pull requests are welcome on GitHub at https://github.com/ivarvong/googledoc_markdown. 49 | 50 | ## License 51 | 52 | The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT). 53 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require "bundler/gem_tasks" 2 | require "rspec/core/rake_task" 3 | 4 | RSpec::Core::RakeTask.new(:spec) 5 | 6 | task :default => :spec 7 | -------------------------------------------------------------------------------- /bin/console: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require "bundler/setup" 4 | require "googledoc_markdown" 5 | 6 | # You can add fixtures and/or initialization code here to make experimenting 7 | # with your gem easier. You can also use a different console, if you like. 8 | 9 | # (If you use this, don't forget to add pry to your Gemfile!) 
10 | # require "pry" 11 | # Pry.start 12 | 13 | require "irb" 14 | IRB.start 15 | -------------------------------------------------------------------------------- /bin/setup: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | IFS=$'\n\t' 4 | 5 | bundle install 6 | 7 | # Do any other automated setup that you need to do here 8 | -------------------------------------------------------------------------------- /circle.yml: -------------------------------------------------------------------------------- 1 | machine: 2 | ruby: 3 | version: 2.3.0 4 | dependencies: 5 | pre: 6 | - gem install bundler -v 1.10.6 7 | test: 8 | override: 9 | - RAILS_ENV=test bundle exec rspec -r rspec_junit_formatter --format RspecJunitFormatter -o $CIRCLE_TEST_REPORTS/rspec/junit.xml 10 | -------------------------------------------------------------------------------- /googledoc_markdown.gemspec: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | lib = File.expand_path('../lib', __FILE__) 3 | $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib) 4 | require 'googledoc_markdown/version' 5 | 6 | Gem::Specification.new do |spec| 7 | spec.name = "googledoc_markdown" 8 | spec.version = GoogledocMarkdown::VERSION 9 | spec.authors = ["Ivar Vong"] 10 | spec.email = ["ivar@ivarvong.com"] 11 | 12 | spec.summary = %q{Convert Google Document HTML to Markdown} 13 | spec.homepage = "https://github.com/ivarvong/googledoc_markdown" 14 | spec.license = "MIT" 15 | 16 | spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) } 17 | spec.bindir = "exe" 18 | spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) } 19 | spec.require_paths = ["lib"] 20 | 21 | spec.add_runtime_dependency "roadie", "~> 3.1" 22 | spec.add_runtime_dependency "css_parser", "~> 1.3" 23 | spec.add_runtime_dependency "nokogiri", "~> 1.6" 24 | 
spec.add_runtime_dependency "kramdown", "~> 1.9" 25 | 26 | spec.add_development_dependency "bundler", "~> 1.10" 27 | spec.add_development_dependency "rake", "~> 10.0" 28 | spec.add_development_dependency "rspec" 29 | spec.add_development_dependency "rspec_junit_formatter", "0.2.2" 30 | spec.add_development_dependency "codeclimate-test-reporter" 31 | spec.add_development_dependency "guard-rspec" 32 | end 33 | -------------------------------------------------------------------------------- /lib/googledoc_markdown.rb: -------------------------------------------------------------------------------- 1 | require "googledoc_markdown/version" 2 | require "googledoc_markdown/converter" 3 | require "googledoc_markdown/helpers" 4 | -------------------------------------------------------------------------------- /lib/googledoc_markdown/converter.rb: -------------------------------------------------------------------------------- 1 | require 'roadie' 2 | require 'css_parser' 3 | require 'nokogiri' 4 | require 'kramdown' 5 | require 'cgi' 6 | require 'json' 7 | 8 | class GoogledocMarkdown::Converter 9 | 10 | def initialize html: nil 11 | @html = html.to_s 12 | end 13 | 14 | def to_html 15 | # public interface 16 | inlined = inline_styles(@html) 17 | body = body_for(inlined) 18 | doc = Nokogiri::HTML.fragment(body) 19 | 20 | doc.css('*').each do |el| 21 | 22 | rules = css_rules(el['style']) 23 | el.delete('style') 24 | el.delete('class') 25 | 26 | if rules['font-weight'] == 'bold' 27 | el.inner_html = rewrap(tag: 'strong', html: el.inner_html) 28 | end 29 | 30 | if rules['font-style'] == 'italic' 31 | el.inner_html = rewrap(tag: 'em', html: el.inner_html) 32 | end 33 | 34 | end 35 | 36 | doc.css('h1, h2, h3').each do |heading| 37 | heading.inner_html = heading.inner_text 38 | end 39 | 40 | doc.css('ol').each do |ol| 41 | ol.delete('start') 42 | end 43 | 44 | doc.css('span').each do |span| 45 | #span.swap(span.children) 46 | span.add_next_sibling(span.children.to_html) 47 | 
span.remove 48 | end 49 | 50 | # first, fix the url by extracting the 'q' query param 51 | doc.css('a').each do |a| 52 | a.delete('style') 53 | a.delete('class') 54 | a['href'] = parse_link(a['href']) 55 | end 56 | 57 | doc.css('a').each do |el| 58 | if is_empty?(el) 59 | el.add_next_sibling(el.children.to_html) 60 | el.remove 61 | end 62 | end 63 | 64 | doc.css('p, em, strong').each do |el| 65 | if el.children.all?{ |child| is_empty?(child) } 66 | el.remove 67 | end 68 | end 69 | 70 | doc.to_html.lstrip 71 | end 72 | 73 | def to_markdown 74 | # public interface 75 | options = { 76 | input: :html, 77 | remove_span_html_tags: true, # TODO: this may be a noop because it's on the wrong kramdown converter 78 | line_width: 90000, # TODO: prevent line wrapping in a nicer way than this 79 | } 80 | Kramdown::Document.new(to_html, options).to_kramdown 81 | end 82 | 83 | def is_empty?(node) 84 | /^[[:space:]]*$/.match(node.content) != nil 85 | end 86 | 87 | def rewrap(tag: nil, html: nil) 88 | leading, content, trailing = partition_whitespace(html) 89 | [leading, "<#{tag}>", content, "", trailing].join('') 90 | end 91 | 92 | def partition_whitespace(input) 93 | if input.strip == '' 94 | return ['', input, ''] 95 | end 96 | 97 | re = /\A(\s{0,})(\S|\S.*\S)(\s{0,})\z/ 98 | matches = re.match(input) 99 | leading, content, trailing = matches[1], matches[2], matches[3] 100 | return [leading, content, trailing] 101 | end 102 | 103 | def css_rules style_string 104 | declarations = {} 105 | rule_set = CssParser::RuleSet.new(nil, style_string) 106 | rule_set.each_declaration do |property, value, _| 107 | declarations[property] = value 108 | end 109 | declarations 110 | end 111 | 112 | def inline_styles html 113 | Roadie::Document.new(html).transform 114 | end 115 | 116 | def body_for html 117 | Nokogiri::HTML(html).css('body').inner_html.gsub(" ", ' ') 118 | end 119 | 120 | def parse_link href 121 | # un-Google-ify the link 122 | uri = URI.parse(href) rescue nil 123 | params = 
CGI.parse(uri.query) rescue nil 124 | params['q'].first 125 | rescue 126 | href 127 | end 128 | 129 | end 130 | -------------------------------------------------------------------------------- /lib/googledoc_markdown/helpers.rb: -------------------------------------------------------------------------------- 1 | class GoogledocMarkdown::Helpers 2 | 3 | def self.id_from_url(url) 4 | re = /\/document\/d\/(.+?)(\/|$)/ 5 | matches = url.match(re) 6 | return matches.nil? ? nil : matches[1] 7 | end 8 | 9 | end 10 | -------------------------------------------------------------------------------- /lib/googledoc_markdown/version.rb: -------------------------------------------------------------------------------- 1 | module GoogledocMarkdown 2 | VERSION = "0.1.1" 3 | end 4 | -------------------------------------------------------------------------------- /spec/fixtures/headings/input.html: -------------------------------------------------------------------------------- 1 |

Heading one

Heading two

Heading three

Normal text

  1. Numbered bullet point one
  2. And two
  3. And three.

More normal bold text.

  • Unordered bullet point
  • A second one

Final normal text.

2 | -------------------------------------------------------------------------------- /spec/fixtures/headings/output.html: -------------------------------------------------------------------------------- 1 |

2 | Heading one 3 |

4 | 5 |

6 | Heading two 7 |

8 | 9 |

10 | Heading three 11 |

12 | 13 |

Normal text

14 | 15 |
    16 |
  1. Numbered bullet point one
  2. 17 |
  3. And two
  4. 18 |
  5. And three.
  6. 19 |
20 | 21 |

More normal bold text.

22 | 23 |
    24 |
  • Unordered bullet point
  • 25 |
  • A second one
  • 26 |
27 | 28 |

Final normal text.

29 | -------------------------------------------------------------------------------- /spec/fixtures/headings/output.md: -------------------------------------------------------------------------------- 1 | # Heading one 2 | 3 | ## Heading two 4 | 5 | ### Heading three 6 | 7 | Normal text 8 | 9 | 1. Numbered bullet point one 10 | 2. And two 11 | 3. And three. 12 | 13 | More normal **bold text**. 14 | 15 | * Unordered bullet point 16 | * A second one 17 | 18 | Final normal text. 19 | 20 | -------------------------------------------------------------------------------- /spec/fixtures/linked_space/input.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 8 | 9 | 10 | 11 |

some text

12 |

 

13 |

more text tktk tktk not tktktk.

14 |

15 |

footer

16 | 17 | 18 | -------------------------------------------------------------------------------- /spec/fixtures/linked_space/output.html: -------------------------------------------------------------------------------- 1 |

some text

2 | 3 |

more text tktk tktk not tktktk.

4 | 5 |

footer

6 | -------------------------------------------------------------------------------- /spec/fixtures/linked_space/output.md: -------------------------------------------------------------------------------- 1 | some text 2 | 3 | more text [tktk][1] tktk *not* tktktk. 4 | 5 | footer 6 | 7 | 8 | 9 | [1]: http://www.ussc.gov 10 | -------------------------------------------------------------------------------- /spec/fixtures/simple/input.html: -------------------------------------------------------------------------------- 1 |

This is some text in a document, some bold, some ital, some both, and a link.

A second paragraph looks like this.

2 | -------------------------------------------------------------------------------- /spec/fixtures/simple/output.html: -------------------------------------------------------------------------------- 1 |

This is some text in a document, some bold, some ital, some both, and a link.

2 | 3 |

A second paragraph looks like this.

4 | -------------------------------------------------------------------------------- /spec/fixtures/simple/output.md: -------------------------------------------------------------------------------- 1 | This is some text in a document, some **bold**, some *ital*, some ***both***, and a [link][1]. 2 | 3 | A second paragraph looks like this. 4 | 5 | 6 | 7 | [1]: https://www.themarshallproject.org 8 | -------------------------------------------------------------------------------- /spec/googledoc_markdown/converter_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | describe GoogledocMarkdown::Converter do 4 | 5 | def load_fixture path 6 | File.read(File.join(File.dirname(__dir__), "../spec/fixtures/#{path}")) 7 | end 8 | 9 | def fixtures 10 | Dir.glob(File.join(File.dirname(__dir__), "../spec/fixtures/*")).map do |path| 11 | File.basename(path) 12 | end 13 | end 14 | 15 | it 'has fixtures' do 16 | expect(fixtures.count).to be > 0 17 | end 18 | 19 | it 'extracts google-wrapped links' do 20 | converter = GoogledocMarkdown::Converter.new 21 | expect(converter.parse_link('https://www.google.com/url?q=http://vice.com/&sa=D')).to eq('http://vice.com/') 22 | end 23 | 24 | it 'passes through non-google links' do 25 | converter = GoogledocMarkdown::Converter.new 26 | expect(converter.parse_link('not-a-link')).to eq('not-a-link') 27 | end 28 | 29 | it 'partitions whitespace in [leading, content, trailing]' do 30 | converter = GoogledocMarkdown::Converter.new 31 | 32 | four_spaces = ' ' 33 | empty_string = '' 34 | expect(converter.partition_whitespace(four_spaces)).to eq([empty_string, four_spaces, empty_string]) 35 | expect(converter.partition_whitespace(empty_string)).to eq([empty_string, empty_string, empty_string]) 36 | 37 | expect(converter.partition_whitespace(' my string ')).to eq([" ", "my string", " "]) 38 | expect(converter.partition_whitespace('a')).to eq(["", "a", ""]) 39 | 
expect(converter.partition_whitespace(' a ')).to eq([" ", "a", " "]) 40 | expect(converter.partition_whitespace('only trailing ')).to eq(["", "only trailing", " "]) 41 | expect(converter.partition_whitespace(' only leading')).to eq([" ", "only leading", ""]) 42 | end 43 | 44 | it 'converts font-weight:bold; tags to / **' do 45 | html = "bold" 46 | converter = GoogledocMarkdown::Converter.new(html: html) 47 | expect(converter.to_html).to eq("bold") 48 | expect(converter.to_markdown).to eq("**bold**") 49 | end 50 | 51 | it 'converts font-style:italic; tags to / *' do 52 | html = "italic" 53 | converter = GoogledocMarkdown::Converter.new(html: html) 54 | expect(converter.to_html).to eq("italic") 55 | expect(converter.to_markdown).to eq("*italic*") 56 | end 57 | 58 | it "removes with empty contents, preserves whitespace" do 59 | fixture = "

first second third fourth

" 60 | converter = GoogledocMarkdown::Converter.new(html: fixture) 61 | expect(converter.to_html).to eq("

first second third fourth

") 62 | end 63 | 64 | it "converts fixtures to html" do 65 | fixtures.each do |fixture| 66 | converter = GoogledocMarkdown::Converter.new(html: load_fixture("#{fixture}/input.html")) 67 | expect(converter.to_html).to eq(load_fixture("#{fixture}/output.html")) 68 | end 69 | end 70 | 71 | it "converts fixtures to markdown" do 72 | fixtures.each do |fixture| 73 | converter = GoogledocMarkdown::Converter.new(html: load_fixture("#{fixture}/input.html")) 74 | expect(converter.to_markdown).to eq(load_fixture("#{fixture}/output.md")) 75 | end 76 | end 77 | 78 | end 79 | -------------------------------------------------------------------------------- /spec/googledoc_markdown_helpers_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | describe GoogledocMarkdown::Helpers do 4 | 5 | it "can extract a doc id from a url" do 6 | id = "1ASI7Bfmbp5XHxeNnZeHQlqDeQ" 7 | url_v1 = "https://docs.google.com/document/d/#{id}/edit" 8 | url_v2 = "https://docs.google.com/document/d/#{id}" 9 | expect(GoogledocMarkdown::Helpers.id_from_url(url_v1)).to eq(id) 10 | expect(GoogledocMarkdown::Helpers.id_from_url(url_v2)).to eq(id) 11 | end 12 | 13 | it "returns nil for a non-google-drive url" do 14 | expect(GoogledocMarkdown::Helpers.id_from_url("https://nytimes.com")).to eq(nil) 15 | end 16 | 17 | end 18 | -------------------------------------------------------------------------------- /spec/googledoc_markdown_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | describe GoogledocMarkdown do 4 | 5 | it 'has a version number' do 6 | expect(GoogledocMarkdown::VERSION).not_to be nil 7 | end 8 | 9 | end 10 | -------------------------------------------------------------------------------- /spec/spec_helper.rb: -------------------------------------------------------------------------------- 1 | if ENV['CODECLIMATE_REPO_TOKEN'] 2 | require 
"codeclimate-test-reporter" 3 | puts "Starting CodeClimate::TestReporter" 4 | CodeClimate::TestReporter.start 5 | end 6 | 7 | $LOAD_PATH.unshift File.expand_path('../../lib', __FILE__) 8 | require 'googledoc_markdown' 9 | --------------------------------------------------------------------------------