├── .gitignore
├── Gemfile
├── LICENSE.txt
├── README.rdoc
├── Rakefile
├── lib
├── ruby_powerpoint.rb
└── ruby_powerpoint
│ ├── paragraph.rb
│ ├── presentation.rb
│ ├── slide.rb
│ └── version.rb
├── ruby_powerpoint.gemspec
└── spec
├── fixtures
├── invalid.xls
├── rime.pptx
└── sample.pptx
└── test_spec.rb
/.gitignore:
--------------------------------------------------------------------------------
1 | *.gem
2 | *.rbc
3 | .bundle
4 | .config
5 | .yardoc
6 | Gemfile.lock
7 | InstalledFiles
8 | _yardoc
9 | coverage
10 | doc/
11 | lib/bundler/man
12 | pkg
13 | rdoc
14 | spec/reports
15 | test/tmp
16 | test/version_tmp
17 | tmp
18 |
--------------------------------------------------------------------------------
/Gemfile:
--------------------------------------------------------------------------------
1 | source 'https://rubygems.org'
2 |
3 | # Dependencies are declared in ruby_powerpoint.gemspec; `gemspec` below pulls them in from there.
4 | gemspec
5 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | Copyright (c) 2013 TODO: Write your name
2 |
3 | MIT License
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining
6 | a copy of this software and associated documentation files (the
7 | "Software"), to deal in the Software without restriction, including
8 | without limitation the rights to use, copy, modify, merge, publish,
9 | distribute, sublicense, and/or sell copies of the Software, and to
10 | permit persons to whom the Software is furnished to do so, subject to
11 | the following conditions:
12 |
13 | The above copyright notice and this permission notice shall be
14 | included in all copies or substantial portions of the Software.
15 |
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23 |
--------------------------------------------------------------------------------
/README.rdoc:
--------------------------------------------------------------------------------
1 | {Gem Version}[https://badge.fury.io/rb/ruby_powerpoint]
2 | {Downloads}[https://ruby-gem-downloads-badge.herokuapp.com/ruby_powerpoint?type=total&total_label=downloads]
3 |
4 | = RubyPowerpoint -- Parser for Powerpoint (pptx) files.
5 |
6 | ruby_powerpoint is a Ruby gem that can extract title, content and images from Powerpoint (pptx) slides.
7 |
8 |
9 | == Installation
10 |
11 | RubyPowerpoint can be used from the command line or as part of a Ruby web framework. To install the gem from a terminal, run the following command:
12 |
13 | gem install ruby_powerpoint
14 |
15 | To use it in Rails, add this line to your Gemfile:
16 |
17 | gem "ruby_powerpoint"
18 |
19 |
20 | == Basic Usage
21 | RubyPowerpoint can parse a PowerPoint file (pptx) by extracting text and images from each slide:
22 |
23 | require 'ruby_powerpoint'
24 |
25 | deck = RubyPowerpoint::Presentation.new "spec/fixtures/sample.pptx"
26 |
27 | deck.slides.each do |slide|
28 | slide.content # => ["Presentation Notes...", "12345"]
29 | slide.title # => "Presentation Header"
30 | slide.images # => ["\xE3=\xA8h\x8E\x17\...."] Byte Stream
31 | # Saving the image byte stream to a file (binary mode, so the bytes are written unchanged):
32 | File.open('temp.jpg', 'wb'){|f| f.write slide.images[0].read}
33 | end
34 |
35 |
36 | == Contributing
37 |
38 | Contributions are welcome. Fork the repository, make your changes on a branch of your fork, add unit tests that cover them, make sure all existing unit tests still pass, and then open a pull request.
39 |
40 | After forking and then cloning the repository locally, install Bundler and then use it
41 | to install the development gem dependencies:
42 |
43 | gem install bundler
44 | bundle install
45 |
46 | Once this is complete, you should be able to run the test suite:
47 |
48 | rake
49 |
50 |
51 | == Bug Reporting
52 |
53 | Please use the {Issues}[https://github.com/pythonicrubyist/ruby_powerpoint/issues] page to report bugs or suggest new enhancements.
54 |
55 |
56 | == License
57 |
58 | RubyPowerpoint has been published under the {MIT License}[https://github.com/pythonicrubyist/ruby_powerpoint/blob/master/LICENSE.txt].
59 |
--------------------------------------------------------------------------------
/Rakefile:
--------------------------------------------------------------------------------
1 | require "bundler/gem_tasks"
2 | require 'rspec/core/rake_task'
3 |
4 | RSpec::Core::RakeTask.new('spec') # defines the `spec` rake task
5 |
6 | # Make `spec` the default task, so that a bare `rake` runs it.
7 | task :default => :spec
--------------------------------------------------------------------------------
/lib/ruby_powerpoint.rb:
--------------------------------------------------------------------------------
1 | require "ruby_powerpoint/version"
2 | require 'ruby_powerpoint/presentation'
3 | require 'ruby_powerpoint/slide'
4 | require 'ruby_powerpoint/paragraph'
5 |
6 | module RubyPowerpoint
7 | # Root namespace of the gem; VERSION, Presentation, Slide and Paragraph are defined by the files required above.
8 | end
9 |
--------------------------------------------------------------------------------
/lib/ruby_powerpoint/paragraph.rb:
--------------------------------------------------------------------------------
module RubyPowerpoint
  # A single paragraph of a slide, wrapping the XML node it was built from.
  class Paragraph
    # slide         - the owning slide; must respond to #presentation.
    # paragraph_xml - the paragraph's XML node; must respond to #xpath.
    def initialize(slide, paragraph_xml)
      @slide = slide
      @presentation = slide.presentation
      @paragraph_xml = paragraph_xml
    end

    # Returns an Array with the text of every <a:t> node found below this
    # paragraph, in document order. Empty when the paragraph has no such node.
    def content
      content_element(@paragraph_xml)
    end

    private

    # Collects the text of all <a:t> descendants of +xml+.
    def content_element(xml)
      xml.xpath('.//a:t').map(&:text)
    end
  end
end
20 |
--------------------------------------------------------------------------------
/lib/ruby_powerpoint/presentation.rb:
--------------------------------------------------------------------------------
1 | require 'zip/filesystem'
2 | require 'nokogiri'
3 |
module RubyPowerpoint
  # An opened PowerPoint (.pptx) archive.
  #
  # The archive stays open until #close is called, because slides read their
  # XML and images from it on demand.
  class Presentation
    # Archive entries that hold a slide's XML, e.g. "ppt/slides/slide12.xml".
    # Anchored on both ends so that relationship files and any other entry that
    # merely starts with "ppt/slides/slide" are not mistaken for slides.
    SLIDE_ENTRY_PATTERN = %r{\Appt/slides/slide\d+\.xml\z}

    # The underlying zip archive (the object returned by Zip::File.open).
    attr_reader :files

    # path - path of the file to open; its extension must be ".pptx"
    #        (checked case-insensitively).
    #
    # Raises RuntimeError with 'Not a valid file format.' for any other
    # extension. The extension is checked before the file is touched.
    def initialize(path)
      raise 'Not a valid file format.' unless File.extname(path).downcase == '.pptx'

      @files = Zip::File.open(path)
    end

    # Returns the presentation's slides as an Array of Slide objects ordered by
    # slide number. The slides are rebuilt (and re-parsed) on every call.
    def slides
      slide_paths = []
      @files.each do |entry|
        slide_paths << entry.name if entry.name =~ SLIDE_ENTRY_PATTERN
      end

      slide_paths
        .map { |slide_path| RubyPowerpoint::Slide.new(self, slide_path) }
        .sort_by(&:slide_num)
    end

    # Closes the underlying archive.
    def close
      @files.close
    end
  end
end
30 |
--------------------------------------------------------------------------------
/lib/ruby_powerpoint/slide.rb:
--------------------------------------------------------------------------------
1 | require 'zip/filesystem'
2 | require 'nokogiri'
3 |
module RubyPowerpoint
  # One slide of a Presentation.
  #
  # A slide is built from the archive path of its XML entry (for example
  # "ppt/slides/slide3.xml"). Construction parses the slide XML and, when the
  # corresponding entries exist in the archive, the slide's notes and its
  # relationship file.
  class Slide
    # Placeholder types that mark a shape as the slide's title.
    TITLE_PLACEHOLDER_TYPES = %w[title ctrTitle].freeze

    attr_reader :presentation,
                :slide_number,
                :slide_file_name

    # presentation   - the owning presentation; its #files archive is read from.
    # slide_xml_path - archive path of the slide XML, e.g. "ppt/slides/slide3.xml".
    def initialize(presentation, slide_xml_path)
      @presentation = presentation
      @slide_xml_path = slide_xml_path
      @slide_number = extract_slide_number_from_path(slide_xml_path)
      # NOTE(review): assumes the notes entry carries the same number as the
      # slide entry; confirm this holds for every deck.
      @slide_notes_xml_path = "ppt/notesSlides/notesSlide#{@slide_number}.xml"
      @slide_file_name = extract_slide_file_name_from_path(slide_xml_path)

      parse_slide
      parse_slide_notes
      parse_relation
    end

    # Parses the slide's XML entry and keeps the document for later queries.
    def parse_slide
      @slide_xml = parse_archive_xml(@slide_xml_path)
    end

    # Parses the slide's notes entry when the archive has one. A slide without
    # notes is expected, so a missing entry is skipped; any other failure
    # propagates instead of being silently swallowed.
    def parse_slide_notes
      return unless archive_file.exist?(@slide_notes_xml_path)

      @slide_notes_xml = parse_archive_xml(@slide_notes_xml_path)
    end

    # Parses the slide's relationship (.rels) entry when the archive has one.
    def parse_relation
      @relation_xml_path = "ppt/slides/_rels/#{@slide_file_name}.rels"
      return unless archive_file.exist?(@relation_xml_path)

      @relation_xml = parse_archive_xml(@relation_xml_path)
    end

    # Returns an Array with the text of every <a:t> node of the slide.
    def content
      content_elements(@slide_xml)
    end

    # Returns an Array with the text of every <a:t> node of the slide's notes,
    # or an empty Array when the slide has no notes.
    def notes_content
      content_elements(@slide_notes_xml)
    end

    # Returns the slide's title shapes joined with a single space, or nil when
    # the slide has no title shape.
    def title
      titles = title_elements(@slide_xml)
      titles.join(" ") unless titles.empty?
    end

    # Returns an Array holding, for each image relationship of the slide, the
    # result of opening the referenced entry in the archive. Empty when the
    # slide has no relationship file or references no image.
    def images
      image_elements(@relation_xml).map do |node|
        archive_file.open(image_archive_path(node['Target']))
      end
    end

    # Returns the slide's number, parsed from the digits in front of ".xml" in
    # the slide's archive path.
    def slide_num
      @slide_xml_path.match(/slide([0-9]*)\.xml$/)[1].to_i
    end

    # Returns an Array with one Paragraph per <a:p> node of the slide.
    def paragraphs
      paragraph_element(@slide_xml)
    end

    private

    # The file-system style accessor of the presentation's archive.
    def archive_file
      @presentation.files.file
    end

    # Opens +path+ inside the archive and parses it as an XML document.
    def parse_archive_xml(path)
      Nokogiri::XML::Document.parse(archive_file.open(path))
    end

    # Turns a relationship target such as "../media/image1.jpeg" into the
    # archive path "ppt/media/image1.jpeg". Only the leading ".." is replaced,
    # so dots inside the file name are left untouched.
    def image_archive_path(target)
      target.sub(/\A\.\./, 'ppt')
    end

    def extract_slide_number_from_path(path)
      path.gsub('ppt/slides/slide', '').gsub('.xml', '').to_i
    end

    def extract_slide_file_name_from_path(path)
      path.gsub('ppt/slides/', '')
    end

    def title_elements(xml)
      shape_elements(xml).select { |shape| element_is_title(shape) }
    end

    # Text of every <a:t> node in +xml+; empty when +xml+ is nil (the optional
    # notes document may be absent).
    def content_elements(xml)
      return [] unless xml

      xml.xpath('//a:t').map(&:text)
    end

    # Relationship nodes in +xml+ that point at an image; empty when +xml+ is
    # nil (a slide without a relationship file).
    def image_elements(xml)
      return [] unless xml

      xml.css('Relationship').select { |node| element_is_image(node) }
    end

    def shape_elements(xml)
      xml.xpath('//p:sp')
    end

    def paragraph_element(xml)
      xml.xpath('//a:p').map { |node| RubyPowerpoint::Paragraph.new(self, node) }
    end

    # True when the shape carries a placeholder of a title type.
    def element_is_title(shape)
      shape.xpath('.//p:nvSpPr/p:nvPr/p:ph').any? do |prop|
        TITLE_PLACEHOLDER_TYPES.include?(prop['type'])
      end
    end

    # True when the relationship's Type mentions "image".
    def element_is_image(node)
      node['Type'].to_s.include?('image')
    end
  end
end
110 |
--------------------------------------------------------------------------------
/lib/ruby_powerpoint/version.rb:
--------------------------------------------------------------------------------
1 | module RubyPowerpoint
2 | VERSION = "1.4.4" # gem version; ruby_powerpoint.gemspec reads it for spec.version
3 | end
4 |
--------------------------------------------------------------------------------
/ruby_powerpoint.gemspec:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | lib = File.expand_path('../lib', __FILE__)
3 | $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4 | require 'ruby_powerpoint/version'
5 |
6 | Gem::Specification.new do |spec|
7 | spec.name = "ruby_powerpoint"
8 | spec.version = RubyPowerpoint::VERSION # from ruby_powerpoint/version, required above
9 | spec.authors = ["pythonicrubyist"]
10 | spec.email = ["pythonicrubyist@gmail.com"]
11 | spec.description = %q{A Ruby gem that can extract text and images from PowerPoint (pptx) files.}
12 | spec.summary = %q{ruby_powerpoint is a Ruby gem that can extract title, content and images from Powerpoint (pptx) slides.}
13 | spec.homepage = "https://github.com/pythonicrubyist/ruby_powerpoint"
14 | spec.license = "MIT"
15 |
16 | spec.files = `git ls-files`.split($/) # every git-tracked file; $/ is Ruby's input record separator
17 | spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) } # base names of the files under bin/
18 | spec.test_files = spec.files.grep(%r{^(test|spec|features)/}) # files under test/, spec/ or features/
19 | spec.require_paths = ["lib"]
20 |
21 | spec.required_ruby_version = '>= 1.9.2'
22 |
23 | spec.add_development_dependency "bundler", "~> 1.3" # NOTE(review): "~> 1.3" excludes Bundler 2.x; confirm this is intended
24 | spec.add_development_dependency "rake"
25 | spec.add_development_dependency 'rspec', '~> 3'
26 |
27 | spec.add_dependency 'nokogiri', '~> 1.6' # runtime: parses the slide XML
28 | spec.add_dependency 'rubyzip', '~> 1.0' # runtime: opens the .pptx archive (Zip::File)
29 | end
30 |
--------------------------------------------------------------------------------
/spec/fixtures/invalid.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pythonicrubyist/ruby_powerpoint/152d6c13a6ce3d86f25cd63972ef3c6bf5c20f95/spec/fixtures/invalid.xls
--------------------------------------------------------------------------------
/spec/fixtures/rime.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pythonicrubyist/ruby_powerpoint/152d6c13a6ce3d86f25cd63972ef3c6bf5c20f95/spec/fixtures/rime.pptx
--------------------------------------------------------------------------------
/spec/fixtures/sample.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pythonicrubyist/ruby_powerpoint/152d6c13a6ce3d86f25cd63972ef3c6bf5c20f95/spec/fixtures/sample.pptx
--------------------------------------------------------------------------------
/spec/test_spec.rb:
--------------------------------------------------------------------------------
1 | require 'ruby_powerpoint'
2 |
# A file whose extension is not .pptx must be rejected. The fixture lives in
# spec/fixtures (the directory is "spec", not "specs").
describe 'RubyPowerpoint trying to parsing an invalid file.' do
  it 'not open an XLS file successfully.' do
    expect { RubyPowerpoint::Presentation.new 'spec/fixtures/invalid.xls' }.to raise_error 'Not a valid file format.'
  end
end
8 |
# Parses the sample.pptx fixture and checks the text, title, images and notes
# extracted from its first and last slides.
describe 'RubyPowerpoint parsing a sample PPTX file' do
  before(:all) do
    @deck = RubyPowerpoint::Presentation.new 'spec/fixtures/sample.pptx'
  end

  after(:all) do
    @deck.close
  end

  it 'parse a PPTX file successfully.' do
    expect(@deck).to_not be_nil
    expect(@deck.slides).to_not eql []
    expect(@deck.slides.first.content).to eql ["Some test ", "Powerpoint"]
    image_byte_stream_1 = @deck.slides.first.images.first.read
    # Image data is binary: write it in binary mode and with `write`, because
    # `puts` would append a newline to the file.
    File.open('temp_1.jpg', 'wb') { |f| f.write image_byte_stream_1 }

    expect(@deck.slides.first.images.first).to_not eql nil #"ppt/media/image1.jpeg"
    expect(@deck.slides.last.title).to eql "Some title here"
    expect(@deck.slides.last.content).to eql ["Some title here", "Some txt here", "Some ", "more text here."]
    image_byte_stream_2 = @deck.slides.last.images.first.read
    File.open('temp_2.jpg', 'wb') { |f| f.write image_byte_stream_2 }
  end

  it "it parses Slide Notes of a PPTX slides" do
    notes_content = @deck.slides[0].notes_content
    expect(notes_content).to eql ["Testing", " Multiline Notes.", "To be extracted here.", "Multiline notes extracted.", "1"]
  end
end
39 |
40 | describe 'open rime.pptx file' do # checks slide count, content, titles and paragraphs of the rime.pptx fixture
41 | before(:all) do
42 | @deck = RubyPowerpoint::Presentation.new 'spec/fixtures/rime.pptx' # opened once and shared by every example below
43 | end
44 |
45 | after(:all) do
46 | @deck.close # closes the presentation opened in before(:all)
47 | end
48 |
49 | it 'opened rime.pptx successfully' do
50 | expect(@deck).to_not be_nil
51 | expect(@deck.slides).to_not eql []
52 | end
53 |
54 | it 'should have the right number of slides' do
55 | expect(@deck.slides.length).to eql 12
56 | end
57 |
58 | it 'the old content method should work the same way' do
59 | expect(@deck.slides[0].content).to eql ["The Rime of the Ancient Mariner", "(text of 1834)", "http://rpo.library.utoronto.ca/poems/rime-ancient-mariner-text-1834"]
60 | end
61 |
62 | context 'the titles should be right' do
63 | it 'should be able to get a main slide (usually centered)' do
64 | expect(@deck.slides[0].title).to eql "The Rime of the Ancient Mariner"
65 | end
66 | it 'should be able to get regular slide titles' do
67 | expect(@deck.slides[1].title).to eql "Argument"
68 | expect(@deck.slides[2].title).to eql "PART I"
69 | expect(@deck.slides[3].title).to eql "PART II"
70 | expect(@deck.slides[4].title).to eql "Part III"
71 | expect(@deck.slides[8].title).to eql "There's more"
72 | end
73 | it 'should return nil if the slide has no title' do
74 | expect(@deck.slides[5].title).to be_nil # Slide#title only joins when at least one title shape was found
75 | expect(@deck.slides[6].title).to be_nil
76 | end
77 |
78 | it 'should only get one title even if there are two things that visually look like titles' do
79 | expect(@deck.slides[7].title).to eql "What if we have two"
80 | end
81 |
82 | context 'when slide contains paragraph' do
83 | before(:all) do
84 | @slide = @deck.slides[1] # the slide titled "Argument" (see the title expectations above)
85 | end
86 |
87 | it 'should return the list of paragraphs' do
88 | expect(@slide.paragraphs.count).to eql 2
89 | end
90 |
91 | it 'should return the content of the paragraph' do
92 | expect(@slide.paragraphs[0].content).to eq ['Argument']
93 | end
94 | end
95 | end
96 | end
97 |
--------------------------------------------------------------------------------