├── .gitignore ├── Gemfile ├── LICENSE.txt ├── README.rdoc ├── Rakefile ├── lib ├── ruby_powerpoint.rb └── ruby_powerpoint │ ├── paragraph.rb │ ├── presentation.rb │ ├── slide.rb │ └── version.rb ├── ruby_powerpoint.gemspec └── spec ├── fixtures ├── invalid.xls ├── rime.pptx └── sample.pptx └── test_spec.rb /.gitignore: -------------------------------------------------------------------------------- 1 | *.gem 2 | *.rbc 3 | .bundle 4 | .config 5 | .yardoc 6 | Gemfile.lock 7 | InstalledFiles 8 | _yardoc 9 | coverage 10 | doc/ 11 | lib/bundler/man 12 | pkg 13 | rdoc 14 | spec/reports 15 | test/tmp 16 | test/version_tmp 17 | tmp 18 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | 3 | # Specify your gem's dependencies in ruby_powerpoint.gemspec 4 | gemspec 5 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2013 TODO: Write your name 2 | 3 | MIT License 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 
15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 20 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 22 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | -------------------------------------------------------------------------------- /README.rdoc: -------------------------------------------------------------------------------- 1 | {Gem Version}[https://badge.fury.io/rb/ruby_powerpoint] 2 | {Downloads}[https://ruby-gem-downloads-badge.herokuapp.com/ruby_powerpoint?type=total&total_label=downloads] 3 | 4 | = RubyPowerpoint -- Parser for Powerpoint (pptx) files. 5 | 6 | ruby_powerpoint is a Ruby gem that can extract title, content and images from Powerpoint (pptx) slides. 7 | 8 | 9 | == Installation 10 | 11 | RubyPowerpoint can be used from the command line or as part of a Ruby web framework. To install the gem using the terminal, run the following command: 12 | 13 | gem install ruby_powerpoint 14 | 15 | To use it in Rails, add this line to your Gemfile: 16 | 17 | gem "ruby_powerpoint" 18 | 19 | 20 | == Basic Usage 21 | RubyPowerpoint can parse a PowerPoint file (pptx) by extracting text and images from each slide: 22 | 23 | require 'ruby_powerpoint' 24 | 25 | deck = RubyPowerpoint::Presentation.new "spec/fixtures/sample.pptx" 26 | 27 | deck.slides.each do |slide| 28 | slide.content # => ["Presentation Notes...", "12345"] 29 | slide.title # => "Presentation Header" 30 | slide.images # => ["\xE3=\xA8h\x8E\x17\...."] Byte Stream 31 | # Saving the image byte stream to a file (binary mode, no trailing newline): 32 | File.open('temp.jpg', 'wb'){|f| f.write slide.images[0].read} 33 | end 34 | 35 | 36 | == Contributing 37 | 38 | Contributions are welcome. 
You can fork the repository, add your code changes to the forked branch, ensure all existing unit tests pass, create new unit tests that cover your changes and finally create a pull request. 39 | 40 | After forking and then cloning the repository locally, install Bundler and then use it 41 | to install the development gem dependencies: 42 | 43 | gem install bundler 44 | bundle install 45 | 46 | Once this is complete, you should be able to run the test suite: 47 | 48 | rake 49 | 50 | 51 | == Bug Reporting 52 | 53 | Please use the {Issues}[https://github.com/pythonicrubyist/ruby_powerpoint/issues] page to report bugs or suggest enhancements. 54 | 55 | 56 | == License 57 | 58 | RubyPowerpoint has been published under the {MIT License}[https://github.com/pythonicrubyist/ruby_powerpoint/blob/master/LICENSE.txt] 59 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require "bundler/gem_tasks" 2 | require 'rspec/core/rake_task' 3 | 4 | RSpec::Core::RakeTask.new('spec') 5 | 6 | # If you want to make this the default task 7 | task :default => :spec -------------------------------------------------------------------------------- /lib/ruby_powerpoint.rb: -------------------------------------------------------------------------------- 1 | require "ruby_powerpoint/version" 2 | require 'ruby_powerpoint/presentation' 3 | require 'ruby_powerpoint/slide' 4 | require 'ruby_powerpoint/paragraph' 5 | 6 | module RubyPowerpoint 7 | # Your code goes here... 
# --------------------------------------------------------------------------------
# /lib/ruby_powerpoint.rb (continued from the preceding line of this dump):
#   8 | end    <- closes the `module RubyPowerpoint` opened just above
# --------------------------------------------------------------------------------
# /lib/ruby_powerpoint/paragraph.rb:
# --------------------------------------------------------------------------------
module RubyPowerpoint
  # One text paragraph of a slide. Slide#paragraphs builds one instance per
  # `a:p` element found in the slide XML.
  class Paragraph
    # slide         - the RubyPowerpoint::Slide that owns this paragraph; its
    #                 #presentation is remembered as well.
    # paragraph_xml - the XML node of the paragraph (must respond to #xpath).
    def initialize(slide, paragraph_xml)
      @slide = slide
      @presentation = slide.presentation
      @paragraph_xml = paragraph_xml
    end

    # Returns an Array with the text of every `a:t` node inside the paragraph.
    def content
      content_element(@paragraph_xml)
    end

    private

    # The leading "." keeps the XPath relative to this paragraph's node rather
    # than searching the whole document.
    def content_element(xml)
      xml.xpath('.//a:t').map(&:text)
    end
  end
end

# --------------------------------------------------------------------------------
# /lib/ruby_powerpoint/presentation.rb:
# --------------------------------------------------------------------------------
require 'zip/filesystem'
require 'nokogiri'

module RubyPowerpoint
  # A .pptx file opened as a zip archive. Call #close when done with it.
  class Presentation
    # Archive entries that hold slide XML, e.g. "ppt/slides/slide3.xml".
    # Requiring the ".xml" suffix guarantees Slide#slide_num can always
    # extract a number from every entry that is turned into a Slide.
    SLIDE_ENTRY = %r{ppt/slides/slide[0-9]*\.xml\z}.freeze

    # The underlying Zip::File.
    attr_reader :files

    # path - path of the presentation; its extension must be ".pptx"
    #        (compared case-insensitively).
    #
    # Raises RuntimeError ('Not a valid file format.') for any other
    # extension. The check runs before the file is opened.
    def initialize(path)
      raise 'Not a valid file format.' unless File.extname(path).downcase == '.pptx'

      @files = Zip::File.open(path)
    end

    # Returns an Array of RubyPowerpoint::Slide ordered by slide number.
    # Every call builds (and therefore re-parses) fresh Slide objects.
    def slides
      @files
        .select { |entry| entry.name =~ SLIDE_ENTRY }
        .map { |entry| RubyPowerpoint::Slide.new(self, entry.name) }
        .sort_by(&:slide_num)
    end

    # Closes the underlying zip archive.
    def close
      @files.close
    end
  end
end

# --------------------------------------------------------------------------------
# /lib/ruby_powerpoint/slide.rb:
# --------------------------------------------------------------------------------
require 'zip/filesystem'
require 'nokogiri'

module RubyPowerpoint
  # A single slide of a Presentation, together with its optional notes part
  # and its optional relationship (.rels) part.
  class Slide
    attr_reader :presentation,
                :slide_number,
                :slide_file_name

    # presentation   - the owning RubyPowerpoint::Presentation.
    # slide_xml_path - archive path of the slide, e.g. "ppt/slides/slide1.xml".
    #
    # The slide, its notes and its relationships are parsed eagerly.
    def initialize(presentation, slide_xml_path)
      @presentation = presentation
      @slide_xml_path = slide_xml_path
      @slide_number = extract_slide_number_from_path(slide_xml_path)
      # NOTE(review): this assumes the notes part carries the same number as
      # the slide; nothing in this file verifies that - confirm against the
      # slide's relationships if notes ever appear on the wrong slide.
      @slide_notes_xml_path = "ppt/notesSlides/notesSlide#{@slide_number}.xml"
      @slide_file_name = extract_slide_file_name_from_path(slide_xml_path)

      parse_slide
      parse_slide_notes
      parse_relation
    end

    # Parses the slide XML into @slide_xml.
    def parse_slide
      @slide_xml = parse_entry(@slide_xml_path)
    end

    # Parses the notes XML into @slide_notes_xml when the archive contains a
    # notes part for this slide; otherwise @slide_notes_xml stays unset.
    # An explicit existence check replaces the former blanket `rescue nil`,
    # so genuine read/parse errors are no longer silently discarded.
    def parse_slide_notes
      @slide_notes_xml = parse_entry(@slide_notes_xml_path) if entry_exist?(@slide_notes_xml_path)
    end

    # Parses the slide's relationship file into @relation_xml when present.
    def parse_relation
      @relation_xml_path = "ppt/slides/_rels/#{@slide_file_name}.rels"
      @relation_xml = parse_entry(@relation_xml_path) if entry_exist?(@relation_xml_path)
    end

    # Returns an Array with the text of every `a:t` node of the slide.
    def content
      content_elements(@slide_xml)
    end

    # Returns an Array with the text of every `a:t` node of the slide notes,
    # or an empty Array when the slide has no notes part.
    def notes_content
      return [] unless @slide_notes_xml

      content_elements(@slide_notes_xml)
    end

    # Returns the text of the title shape(s) joined with a single space, or
    # nil when the slide has no title shape.
    def title
      titles = title_elements(@slide_xml)
      titles.map(&:text).join(' ') unless titles.empty?
    end

    # Returns an Array of readable streams, one per image relationship of the
    # slide (callers use #read to get the bytes). Returns an empty Array when
    # the slide has no relationship file.
    def images
      return [] unless @relation_xml

      image_elements(@relation_xml).map do |node|
        # Targets are written relative to ppt/slides ("../media/x.png"); only
        # the leading parent-directory segment is rewritten, so a ".." that
        # happens to occur inside a file name is left alone.
        @presentation.files.file.open(node['Target'].sub(/\A\.\./, 'ppt'))
      end
    end

    # Returns the slide number taken from the archive path as an Integer
    # (0 when the path carries no number).
    def slide_num
      @slide_xml_path[/slide([0-9]*)\.xml$/, 1].to_i
    end

    # Returns an Array of RubyPowerpoint::Paragraph, one per `a:p` element.
    def paragraphs
      paragraph_element(@slide_xml)
    end

    private

    # True when the archive contains an entry at `path`.
    def entry_exist?(path)
      @presentation.files.file.exist?(path)
    end

    # Parses the archive entry at `path` into a Nokogiri document. The block
    # form of #open is used so the entry stream is closed once parsing is done.
    def parse_entry(path)
      @presentation.files.file.open(path) { |io| Nokogiri::XML::Document.parse(io) }
    end

    def extract_slide_number_from_path(path)
      path.gsub('ppt/slides/slide', '').gsub('.xml', '').to_i
    end

    def extract_slide_file_name_from_path(path)
      path.gsub('ppt/slides/', '')
    end

    # Shapes (`p:sp`) whose placeholder type marks them as a title.
    def title_elements(xml)
      shape_elements(xml).select { |shape| element_is_title(shape) }
    end

    def content_elements(xml)
      xml.xpath('//a:t').map(&:text)
    end

    def image_elements(xml)
      xml.css('Relationship').select { |node| element_is_image(node) }
    end

    def shape_elements(xml)
      xml.xpath('//p:sp')
    end

    def paragraph_element(xml)
      xml.xpath('//a:p').map { |node| RubyPowerpoint::Paragraph.new(self, node) }
    end

    # A shape is a title when one of its placeholders has type "title" or
    # "ctrTitle".
    def element_is_title(shape)
      shape.xpath('.//p:nvSpPr/p:nvPr/p:ph').any? do |prop|
        %w[title ctrTitle].include?(prop['type'])
      end
    end

    # A relationship is an image when its Type attribute mentions "image";
    # a relationship without a Type attribute is simply not an image.
    def element_is_image(node)
      node['Type'].to_s.include?('image')
    end
  end
end

# --------------------------------------------------------------------------------
# /lib/ruby_powerpoint/version.rb:
# --------------------------------------------------------------------------------
module RubyPowerpoint
  VERSION = "1.4.4"
end

# --------------------------------------------------------------------------------
# /ruby_powerpoint.gemspec:
# --------------------------------------------------------------------------------
# coding: utf-8
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'ruby_powerpoint/version'

Gem::Specification.new do |spec|
  spec.name          = "ruby_powerpoint"
  spec.version       = RubyPowerpoint::VERSION
  spec.authors       = ["pythonicrubyist"]
  spec.email         = ["pythonicrubyist@gmail.com"]
  spec.description   = %q{A Ruby gem that can extract text and images from PowerPoint (pptx) files.}
  spec.summary       = %q{ruby_powerpoint is a Ruby gem that can extract title, content and images from Powerpoint (pptx) slides.}
  spec.homepage      = "https://github.com/pythonicrubyist/ruby_powerpoint"
  spec.license       = "MIT"

  spec.files         = `git ls-files`.split($/)
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.required_ruby_version = '>= 1.9.2'

  spec.add_development_dependency "bundler", "~> 1.3"
  spec.add_development_dependency "rake"
  spec.add_development_dependency 'rspec', '~> 3'

  spec.add_dependency 'nokogiri', '~> 1.6'
  spec.add_dependency 'rubyzip', '~> 1.0'
end

# --------------------------------------------------------------------------------
# /spec/fixtures/invalid.xls:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/pythonicrubyist/ruby_powerpoint/152d6c13a6ce3d86f25cd63972ef3c6bf5c20f95/spec/fixtures/invalid.xls
# --------------------------------------------------------------------------------
# /spec/fixtures/rime.pptx:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/pythonicrubyist/ruby_powerpoint/152d6c13a6ce3d86f25cd63972ef3c6bf5c20f95/spec/fixtures/rime.pptx
# --------------------------------------------------------------------------------
# /spec/fixtures/sample.pptx:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/pythonicrubyist/ruby_powerpoint/152d6c13a6ce3d86f25cd63972ef3c6bf5c20f95/spec/fixtures/sample.pptx
# --------------------------------------------------------------------------------
# /spec/test_spec.rb:
# --------------------------------------------------------------------------------
require 'ruby_powerpoint'

# The extension check happens before the file is opened, so the constructor
# must reject the .xls fixture with the documented message.
describe 'RubyPowerpoint trying to parsing an invalid file.' do
  it 'not open an XLS file successfully.' do
    expect { RubyPowerpoint::Presentation.new 'spec/fixtures/invalid.xls' }
      .to raise_error(RuntimeError, 'Not a valid file format.')
  end
end

describe 'RubyPowerpoint parsing a sample PPTX file' do
  before(:all) do
    @deck = RubyPowerpoint::Presentation.new 'spec/fixtures/sample.pptx'
  end

  after(:all) do
    @deck.close
  end

  it 'parse a PPTX file successfully.' do
    # Presentation#slides re-parses the archive on every call, so fetch once.
    slides = @deck.slides
    first_slide = slides.first
    last_slide = slides.last

    expect(@deck).to_not be_nil
    expect(slides).to_not eql []

    expect(first_slide.content).to eql ["Some test ", "Powerpoint"]
    # The image bytes are checked in memory instead of being written to
    # temp_*.jpg files that were left behind in the working directory.
    first_image = first_slide.images.first
    expect(first_image).to_not be_nil # "ppt/media/image1.jpeg"
    expect(first_image.read).to_not be_empty

    expect(last_slide.title).to eql "Some title here"
    expect(last_slide.content).to eql ["Some title here", "Some txt here", "Some ", "more text here."]
    expect(last_slide.images.first.read).to_not be_empty
  end

  it "it parses Slide Notes of a PPTX slides" do
    notes_content = @deck.slides[0].notes_content
    expect(notes_content).to eql ["Testing", " Multiline Notes.", "To be extracted here.", "Multiline notes extracted.", "1"]
  end
end

describe 'open rime.pptx file' do
  before(:all) do
    @deck = RubyPowerpoint::Presentation.new 'spec/fixtures/rime.pptx'
  end

  after(:all) do
    @deck.close
  end

  it 'opened rime.pptx successfully' do
    expect(@deck).to_not be_nil
    expect(@deck.slides).to_not eql []
  end

  it 'should have the right number of slides' do
    expect(@deck.slides.length).to eql 12
  end

  it 'the old content method should work the same way' do
    expect(@deck.slides[0].content).to eql ["The Rime of the Ancient Mariner", "(text of 1834)", "http://rpo.library.utoronto.ca/poems/rime-ancient-mariner-text-1834"]
  end

  context 'the titles should be right' do
    it 'should be able to get a main slide (usually centered)' do
      expect(@deck.slides[0].title).to eql "The Rime of the Ancient Mariner"
    end

    it 'should be able to get regular slide titles' do
      slides = @deck.slides
      expect(slides[1].title).to eql "Argument"
      expect(slides[2].title).to eql "PART I"
      expect(slides[3].title).to eql "PART II"
      expect(slides[4].title).to eql "Part III"
      expect(slides[8].title).to eql "There's more"
    end

    it 'should return nil if the slide has no title' do
      slides = @deck.slides
      expect(slides[5].title).to be_nil
      expect(slides[6].title).to be_nil
    end

    it 'should only get one title even if there are two things that visually look like titles' do
      expect(@deck.slides[7].title).to eql "What if we have two"
    end

    context 'when slide contains paragraph' do
      before(:all) do
        @slide = @deck.slides[1]
      end

      it 'should return the list of paragraphs' do
        expect(@slide.paragraphs.count).to eql 2
      end

      it 'should return the content of the paragraph' do
        expect(@slide.paragraphs[0].content).to eq ['Argument']
      end
    end
  end
end
# --------------------------------------------------------------------------------