├── .gitignore
├── Gemfile
├── Gemfile.lock
├── LICENSE.txt
├── README.md
├── Rakefile
├── fluent-plugin-google-cloud-storage.gemspec
├── lib
└── fluent
│ └── plugin
│ └── out_google_cloud_storage.rb
└── test
├── helper.rb
└── plugin
└── test_out_google_cloud_storage.rb
/.gitignore:
--------------------------------------------------------------------------------
1 | *.gem
2 | *.rbc
3 | /.config
4 | /coverage/
5 | /InstalledFiles
6 | /pkg/
7 | /spec/reports/
8 | /test/tmp/
9 | /test/version_tmp/
10 | /tmp/
11 |
12 | ## Specific to RubyMotion:
13 | .dat*
14 | .repl_history
15 | build/
16 |
17 | ## Documentation cache and generated files:
18 | /.yardoc/
19 | /_yardoc/
20 | /doc/
21 | /rdoc/
22 |
23 | ## Environment normalisation:
24 | /.bundle/
25 | /lib/bundler/man/
26 |
27 | # for a library or gem, you might want to ignore these files since the code is
28 | # intended to run in multiple environments; otherwise, check them in:
29 | Gemfile.lock
30 | .ruby-version
31 | .ruby-gemset
32 |
33 | # unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
34 | .rvmrc
35 |
--------------------------------------------------------------------------------
/Gemfile:
--------------------------------------------------------------------------------
1 | source 'https://rubygems.org'
2 |
3 | # Specify your gem's dependencies in fluent-plugin-google-cloud-storage.gemspec
4 | gemspec
5 |
--------------------------------------------------------------------------------
/Gemfile.lock:
--------------------------------------------------------------------------------
1 | PATH
2 | remote: .
3 | specs:
4 | fluent-plugin-google-cloud-storage (0.3.3)
5 | fluent-mixin-config-placeholders (>= 0.3.0)
6 | fluent-mixin-plaintextformatter (>= 0.2.1)
7 | fluentd (>= 0.10.53)
8 | google-api-client (~> 0.7)
9 |
10 | GEM
11 | remote: https://rubygems.org/
12 | specs:
13 | addressable (2.3.6)
14 | autoparse (0.3.3)
15 | addressable (>= 2.3.1)
16 | extlib (>= 0.9.15)
17 | multi_json (>= 1.0.0)
18 | cool.io (1.2.4)
19 | extlib (0.9.16)
20 | faraday (0.9.0)
21 | multipart-post (>= 1.2, < 3)
22 | fluent-mixin-config-placeholders (0.3.0)
23 | fluentd
24 | uuidtools (>= 2.1.5)
25 | fluent-mixin-plaintextformatter (0.2.6)
26 | fluentd
27 | ltsv
28 | fluentd (0.10.53)
29 | cool.io (>= 1.1.1, < 2.0.0, != 1.2.0)
30 | http_parser.rb (>= 0.5.1, < 0.7.0)
31 | json (>= 1.4.3)
32 | msgpack (>= 0.4.4, < 0.6.0, != 0.5.3, != 0.5.2, != 0.5.1, != 0.5.0)
33 | sigdump (~> 0.2.2)
34 | yajl-ruby (~> 1.0)
35 | google-api-client (0.7.1)
36 | addressable (>= 2.3.2)
37 | autoparse (>= 0.3.3)
38 | extlib (>= 0.9.15)
39 | faraday (>= 0.9.0)
40 | jwt (>= 0.1.5)
41 | launchy (>= 2.1.1)
42 | multi_json (>= 1.0.0)
43 | retriable (>= 1.4)
44 | signet (>= 0.5.0)
45 | uuidtools (>= 2.1.0)
46 | http_parser.rb (0.6.0)
47 | json (1.8.1)
48 | jwt (1.0.0)
49 | launchy (2.4.2)
50 | addressable (~> 2.3)
51 | ltsv (0.1.0)
52 | msgpack (0.5.9)
53 | multi_json (1.10.1)
54 | multipart-post (2.0.0)
55 | rake (10.3.2)
56 | retriable (1.4.1)
57 | sigdump (0.2.2)
58 | signet (0.5.1)
59 | addressable (>= 2.2.3)
60 | faraday (>= 0.9.0.rc5)
61 | jwt (>= 0.1.5)
62 | multi_json (>= 1.0.0)
63 | uuidtools (2.1.5)
64 | yajl-ruby (1.2.1)
65 |
66 | PLATFORMS
67 | ruby
68 |
69 | DEPENDENCIES
70 | fluent-plugin-google-cloud-storage!
71 | rake
72 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | Copyright (c) 2012- TAGOMORI Satoshi
2 |
3 | Licensed under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License.
5 | You may obtain a copy of the License at
6 |
7 | http://www.apache.org/licenses/LICENSE-2.0
8 |
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # fluent-plugin-google-cloud-storage
2 |
3 | [![Gem Version](https://badge.fury.io/rb/fluent-plugin-google-cloud-storage.svg)](https://badge.fury.io/rb/fluent-plugin-google-cloud-storage)
4 |
5 | [Fluentd](http://fluentd.org/) output plugin to write data into a [Google Cloud
6 | Storage](https://cloud.google.com/storage/) bucket.
7 |
8 | GoogleCloudStorageOutput slices data by time (in the specified unit) and stores the
9 | data as plain-text files. You can choose to:
10 |
11 | * format whole data as serialized JSON, single attribute or separated multi attributes
12 | * or LTSV, labeled-TSV (see http://ltsv.org/ )
13 | * include time as line header, or not
14 | * include tag as line header, or not
15 | * change field separator (default: TAB)
16 | * add new line as termination, or not
17 |
18 | You can specify the output file path as 'path path/to/dir/access.%Y%m%d.${chunk_id}.log' and get objects such as 'path/to/dir/access.20120316.<chunk_id>.log' in your GCS bucket, where <chunk_id> is the hex id of the flushed chunk.
19 |
20 | ## Configuration
21 |
22 | ### Examples
23 |
24 | #### Complete Example
25 |
26 | # tail
27 | <source>
28 | type tail
29 | format none
30 | path /tmp/test.log
31 | pos_file /var/log/td-agent/test.pos
32 | tag tail.test
33 | </source>
34 |
35 | # post to GCS
36 | <match tail.test>
37 | type google_cloud_storage
38 | service_email xxx.xxx.com
39 | service_pkcs12_path /etc/td-agent/My_First_Project-xxx.p12
40 | project_id handy-compass-xxx
41 | bucket_id test_bucket
42 | path tail.test/%Y/%m/%d/%H/${hostname}/${chunk_id}.log.gz
43 | output_include_time false
44 | output_include_tag false
45 | buffer_path /var/log/td-agent/buffer/tail.test
46 | # flush_interval 600s
47 | buffer_chunk_limit 128m
48 | time_slice_wait 300s
49 | compress gzip
50 | </match>
51 |
52 | #### More Examples
53 |
54 | To store data as `time,tag,json` (the same as 'type file') in GCS:
55 |
56 |
57 | type google_cloud_storage
58 | service_email SERVICE_ACCOUNT_EMAIL
59 | service_pkcs12_path /path/to/key.p12
60 | project_id name-of-project
61 | bucket_id name-of-bucket
62 | path path/to/access.%Y%m%d_%H.${chunk_id}.log
63 |
64 |
65 | To specify the pkcs12 file's password, use `service_pkcs12_password`:
66 |
67 |
68 | type google_cloud_storage
69 | service_email SERVICE_ACCOUNT_EMAIL
70 | service_pkcs12_path /path/to/key.p12
71 | service_pkcs12_password SECRET_PASSWORD
72 | project_id name-of-project
73 | bucket_id name-of-bucket
74 | path path/to/access.%Y%m%d_%H.${chunk_id}.log
75 |
76 |
77 | If you want JSON object only (without time or tag or both on header of lines), specify it by `output_include_time` or `output_include_tag` (default true):
78 |
79 |
80 | type google_cloud_storage
81 | service_email SERVICE_ACCOUNT_EMAIL
82 | service_pkcs12_path /path/to/key.p12
83 | project_id name-of-project
84 | bucket_id name-of-bucket
85 | path path/to/access.%Y%m%d_%H.${chunk_id}.log
86 | output_include_time false
87 | output_include_tag false
88 |
89 |
90 | To store data as LTSV without time and tag in GCS:
91 |
92 |
93 | type google_cloud_storage
94 | # ...
95 | output_data_type ltsv
96 |
97 |
98 | Store data as TSV (TAB separated values) of specified keys, without time, with tag (removed prefix 'access'):
99 |
100 |
101 | type google_cloud_storage
102 | # ...
103 |
104 | field_separator TAB # or 'SPACE', 'COMMA' or 'SOH'(Start Of Heading: \001)
105 | output_include_time false
106 | output_include_tag true
107 | remove_prefix access
108 |
109 | output_data_type attr:path,status,referer,agent,bytes
110 |
111 |
112 | If a message doesn't have a specified attribute, fluent-plugin-google-cloud-storage outputs 'NULL' in place of the value.
113 |
114 | To store data compressed (gzip only now):
115 |
116 |
117 | type google_cloud_storage
118 | # ...
119 |
120 | compress gzip
121 |
122 |
123 | ### Major Caveat
124 |
125 | As GCS does not support appending to files, if you have multiple fluentd nodes,
126 | you most likely want each to log to separate files. You can use '${hostname}' or
127 | '${uuid:random}' placeholders in configuration for this purpose.
128 |
129 | Note the `${chunk_id}` placeholder in the following paths. The plugin requires the presence
130 | of the placeholder to guarantee that each flush will not overwrite an existing
131 | file.
132 |
133 | For hostname:
134 |
135 |
136 | type google_cloud_storage
137 | # ...
138 | path log/access/%Y%m%d/${hostname}.${chunk_id}.log
139 |
140 |
141 | Or with random filename (to avoid duplicated file name only):
142 |
143 |
144 | type google_cloud_storage
145 | # ...
146 | path log/access/%Y%m%d/${uuid:random}.${chunk_id}.log
147 |
148 |
149 | With the configurations above, you can treat all of the files under
150 | 'log/access/20120820/*' as the access logs for that time slice.
151 |
152 | ## TODO
153 |
154 | * docs?
155 | * patches welcome!
156 |
157 | ## Copyright
158 |
159 | * Copyright (c) 2014- Hsiu-Fan Wang (hfwang@porkbuns.net)
160 | * License
161 | * Apache License, Version 2.0
162 |
--------------------------------------------------------------------------------
/Rakefile:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env rake
require "bundler/gem_tasks"
require 'rake/testtask'

# `rake test` runs every test/**/test_*.rb file with lib/ and test/ on the
# load path.
Rake::TestTask.new(:test) do |t|
  t.libs.push('lib', 'test')
  t.pattern = 'test/**/test_*.rb'
  t.verbose = true
end

# A bare `rake` runs the test suite.
task default: :test
12 |
--------------------------------------------------------------------------------
/fluent-plugin-google-cloud-storage.gemspec:
--------------------------------------------------------------------------------
1 | # -*- encoding: utf-8 -*-
2 |
# Gem specification for fluent-plugin-google-cloud-storage.
#
# The packaged file list is taken from git, so the gem must be built from a
# git checkout.
Gem::Specification.new do |gem|
  gem.name          = "fluent-plugin-google-cloud-storage"
  gem.version       = "1.0.0"
  gem.authors       = ["Hsiu-Fan Wang"]
  gem.email         = ["hfwang@porkbuns.net"]
  gem.summary       = %q{Fluentd plugin to write data to Google Cloud Storage}
  gem.description   = %q{Google Cloud Storage fluentd output}
  gem.homepage      = "https://github.com/hfwang/fluent-plugin-google-cloud-storage"
  # SPDX identifier for the Apache License, Version 2.0 (see LICENSE.txt).
  gem.license       = "Apache-2.0"

  # NUL-separated listing so file names containing whitespace stay intact.
  gem.files         = `git ls-files -z`.split("\x0")
  gem.executables   = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  gem.add_development_dependency "rake"
  gem.add_runtime_dependency "fluentd", '>= 0.10.53'
  gem.add_runtime_dependency "fluent-mixin-plaintextformatter", '>= 0.2.1'
  gem.add_runtime_dependency "fluent-mixin-config-placeholders", ">= 0.3.0"
  gem.add_runtime_dependency "google-api-client", '0.8.6'
  gem.add_runtime_dependency "mime-types", '>= 3.0'
end
25 |
--------------------------------------------------------------------------------
/lib/fluent/plugin/out_google_cloud_storage.rb:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | require 'fluent/mixin/config_placeholders'
4 | require 'fluent/mixin/plaintextformatter'
5 | require 'fluent/log'
6 |
# Fluentd time-sliced output plugin that uploads every flushed buffer chunk as
# a single object in a Google Cloud Storage (GCS) bucket.
#
# The object name is built from the +path+ parameter: its strftime directives
# are expanded with the chunk's time slice and the +${chunk_id}+ placeholder is
# replaced with the chunk's unique id in hex. The payload is gzip-compressed
# first when +compress+ is set.
class Fluent::GoogleCloudStorageOutput < Fluent::TimeSlicedOutput
  Fluent::Plugin.register_output('google_cloud_storage', self)

  # Version sent to Google as part of the client identification.
  PLUGIN_VERSION = '1.0.0'

  # Accepted values of the +compress+ parameter (both mean gzip).
  GZIP_ALGORITHMS = ["gz", "gzip"].freeze

  # Content type used when none can be derived from the object's extension.
  DEFAULT_CONTENT_TYPE = 'application/octet-stream'

  config_set_default :buffer_type, 'file'
  config_set_default :time_slice_format, '%Y%m%d'

  # Not read anywhere in this class; kept so that configurations which set it
  # still load.
  config_param :ignore_start_check_error, :bool, :default => false

  include Fluent::Mixin::ConfigPlaceholders

  config_param :service_email, :string
  config_param :service_pkcs12_path, :string
  config_param :service_pkcs12_password, :string, :default => "notasecret"
  config_param :project_id, :string
  config_param :bucket_id, :string
  config_param :path, :string

  config_param :compress, :default => nil do |val|
    unless GZIP_ALGORITHMS.include?(val)
      # Fully qualified: a bare ConfigError does not resolve inside a
      # compact-style `class Fluent::...` body.
      raise Fluent::ConfigError, "Unsupported compression algorithm '#{val}'"
    end
    val
  end

  # Not read anywhere in this class; kept so that configurations which set it
  # still load. (The name refers to HDFS namenode failover, which has no
  # counterpart here.)
  config_param :failures_before_use_standby, :integer, :default => 11

  include Fluent::Mixin::PlainTextFormatter

  config_param :default_tag, :string, :default => 'tag_missing'

  CHUNK_ID_PLACE_HOLDER = '${chunk_id}'

  # Loads the libraries used by the upload path when the plugin is
  # instantiated rather than when this file is required.
  def initialize
    super
    require 'zlib'
    require 'stringio'
    require 'net/http'
    require 'time'
    require 'google/api_client'
    require 'signet/oauth_2/client'
    require 'mime-types'
  end

  # Define `log` method for v0.10.42 or earlier
  unless method_defined?(:log)
    define_method("log") { $log }
  end

  # Executes one Google API request, fetching a new access token first if the
  # current one has expired.
  #
  # @param params [Hash] arguments passed through to Google::APIClient#execute
  # @return the value returned by Google::APIClient#execute
  def call_google_api(params)
    authorization = @google_api_client.authorization
    authorization.fetch_access_token! if authorization.expired?
    @google_api_client.execute(params)
  end

  # Validates the configuration and prepares the authenticated API client.
  #
  # When +path+ contains %H, %M or %S, +time_slice_format+ is overridden so
  # that chunks are sliced at least as finely as the path changes.
  #
  # @param conf the plugin's configuration section
  # @raise [Fluent::ConfigError] if +path+ lacks the ${chunk_id} placeholder
  #   or authentication with Google fails
  def configure(conf)
    if (configured_path = conf['path'])
      if configured_path.include?('%S')
        conf['time_slice_format'] = '%Y%m%d%H%M%S'
      elsif configured_path.include?('%M')
        conf['time_slice_format'] = '%Y%m%d%H%M'
      elsif configured_path.include?('%H')
        conf['time_slice_format'] = '%Y%m%d%H'
      end
    end

    super

    # Checked before authenticating so a bad path is reported without a
    # network round trip.
    unless @path.include?(CHUNK_ID_PLACE_HOLDER)
      raise Fluent::ConfigError, "path must contain ${chunk_id}, which is the placeholder for chunk_id, so that each flushed chunk is written to its own object."
    end

    @client = prepare_client
  end

  # Builds the Google API client, authenticates it as the configured service
  # account and looks up the Cloud Storage v1 API.
  #
  # @return [Google::APIClient] the authenticated client
  # @raise [Fluent::ConfigError] if Google rejects the credentials
  def prepare_client
    @google_api_client = Google::APIClient.new(
      :application_name => "fluent-plugin-google-cloud-storage",
      :user_agent => "fluent-plugin-google-cloud-storage/#{PLUGIN_VERSION} (gzip)",
      :application_version => PLUGIN_VERSION)
    begin
      key = Google::APIClient::KeyUtils.load_from_pkcs12(
        @service_pkcs12_path, @service_pkcs12_password)
      @google_api_client.authorization = Signet::OAuth2::Client.new(
        token_credential_uri: "https://accounts.google.com/o/oauth2/token",
        audience: "https://accounts.google.com/o/oauth2/token",
        issuer: @service_email,
        scope: "https://www.googleapis.com/auth/devstorage.read_write",
        signing_key: key)
      @google_api_client.authorization.fetch_access_token!
    rescue Signet::AuthorizationError => e
      raise Fluent::ConfigError, "Error occurred authenticating with Google: #{e.message}"
    end
    @storage_api = @google_api_client.discovered_api("storage", "v1")
    @google_api_client
  end

  def start
    super
  end

  def shutdown
    super
  end

  # Expands the strftime directives of +path+ for one time slice.
  #
  # @param chunk_key [String] time slice key, formatted per +time_slice_format+
  # @return [String] the object path; ${chunk_id} is still unexpanded
  def path_format(chunk_key)
    path = Time.strptime(chunk_key, @time_slice_format).strftime(@path)
    log.debug "GCS Path: #{path}"
    path
  end

  # @param unique_id [String] a chunk's binary unique id
  # @return [String] the id as lowercase hex, two digits per byte
  def chunk_unique_id_to_str(unique_id)
    unique_id.unpack('H*').first
  end

  # Uploads +data+ to the configured bucket as the object named +path+,
  # gzip-compressing it first when +compress+ is set.
  #
  # @param path [String] object name within the bucket
  # @param data [String] payload to store
  # @return the API result of the insert request
  # @raise [RuntimeError] if GCS answers with an error status, so that the
  #   chunk is not treated as written
  def send_data(path, data)
    # type_for returns an empty list for unknown extensions.
    mimetype = MIME::Types.type_for(path).first
    content_type = mimetype ? mimetype.content_type : DEFAULT_CONTENT_TYPE

    io = nil
    if GZIP_ALGORITHMS.include?(@compress)
      io = StringIO.new("")
      writer = Zlib::GzipWriter.new(io)
      writer.write(data)
      writer.finish # flushes the gzip trailer but leaves io open for reading
      io.rewind
    else
      io = StringIO.new(data)
    end

    media = Google::APIClient::UploadIO.new(io, content_type, File.basename(path))

    result = call_google_api(api_method: @storage_api.objects.insert,
                             parameters: {
                               uploadType: "multipart",
                               project: @project_id,
                               bucket: @bucket_id,
                               name: path
                             },
                             body_object: { contentType: media.content_type },
                             media: media)

    # execute returns failed responses instead of raising them, so a failure
    # has to be surfaced explicitly.
    if result.error?
      raise "Failed to upload #{path} to GCS bucket #{@bucket_id}: #{result.error_message}"
    end

    result
  end

  # Uploads one buffer chunk.
  #
  # @param chunk the buffer chunk to flush; its key, unique_id and read are used
  # @return [String] the object path the chunk was written to
  def write(chunk)
    chunk_id = chunk_unique_id_to_str(chunk.unique_id)
    gcs_path = path_format(chunk.key).gsub(CHUNK_ID_PLACE_HOLDER, chunk_id)

    send_data(gcs_path, chunk.read)

    gcs_path
  end
end
160 |
--------------------------------------------------------------------------------
/test/helper.rb:
--------------------------------------------------------------------------------
require 'rubygems'
require 'bundler'

# Activate the default and development gem groups; if Bundler cannot, print
# its message plus a hint and exit with Bundler's status code.
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => error
  $stderr.puts error.message, "Run `bundle install` to install missing gems"
  exit error.status_code
end
require 'test/unit'

# Put lib/ and then test/ at the front of the load path.
test_dir = File.dirname(__FILE__)
$LOAD_PATH.unshift(File.join(test_dir, '..', 'lib'))
$LOAD_PATH.unshift(test_dir)
require 'fluent/test'

# Unless VERBOSE is set, replace the global logger with an object that
# accepts and ignores every call.
unless ENV.key?('VERBOSE')
  silent_logger = Object.new
  def silent_logger.method_missing(method, *args)
    # pass
  end
  $log = silent_logger
end

require 'fluent/plugin/out_google_cloud_storage'

class Test::Unit::TestCase
end
29 |
--------------------------------------------------------------------------------
/test/plugin/test_out_google_cloud_storage.rb:
--------------------------------------------------------------------------------
1 | require 'helper'
2 |
# Unit tests for Fluent::GoogleCloudStorageOutput.
#
# The class name is unchanged from the previous version of this file, which
# exercised Fluent::WebHDFSOutput and WebHDFS-only options that this
# repository does not define.
class WebHDFSOutputTest < Test::Unit::TestCase
  # Smallest configuration the plugin accepts. A memory buffer is used so
  # that no buffer_path is needed.
  CONFIG = %[
    service_email test@example.com
    service_pkcs12_path /path/to/key.p12
    project_id test-project
    bucket_id test-bucket
    path log/access.%Y%m%d.${chunk_id}.log
    buffer_type memory
  ]

  # Builds a configured test driver around a subclass of the plugin whose
  # prepare_client is a no-op, so configure never contacts Google.
  def create_driver(conf=CONFIG, tag='test')
    Fluent::Test::OutputTestDriver.new(Fluent::GoogleCloudStorageOutput, tag) do
      def prepare_client
        nil
      end
    end.configure(conf)
  end

  def test_configure
    d = create_driver
    assert_equal 'test@example.com', d.instance.service_email
    assert_equal '/path/to/key.p12', d.instance.service_pkcs12_path
    assert_equal 'notasecret', d.instance.service_pkcs12_password
    assert_equal 'test-project', d.instance.project_id
    assert_equal 'test-bucket', d.instance.bucket_id
    assert_equal 'log/access.%Y%m%d.${chunk_id}.log', d.instance.path
    assert_equal '%Y%m%d', d.instance.time_slice_format
    assert_nil d.instance.compress
    assert_equal false, d.instance.ignore_start_check_error

    assert_equal true, d.instance.output_include_time
    assert_equal true, d.instance.output_include_tag
    assert_equal 'json', d.instance.output_data_type
    assert_nil d.instance.remove_prefix
    assert_equal 'TAB', d.instance.field_separator
    assert_equal true, d.instance.add_newline
    assert_equal 'tag_missing', d.instance.default_tag

    d = create_driver %[
      service_email test@example.com
      service_pkcs12_path /path/to/key.p12
      service_pkcs12_password secret
      project_id test-project
      bucket_id test-bucket
      path log/access.%Y%m%d.%H%M.${chunk_id}.log.gz
      buffer_type memory
      compress gzip
    ]
    assert_equal 'secret', d.instance.service_pkcs12_password
    assert_equal 'log/access.%Y%m%d.%H%M.${chunk_id}.log.gz', d.instance.path
    assert_equal '%Y%m%d%H%M', d.instance.time_slice_format
    assert_equal 'gzip', d.instance.compress
  end

  def test_configure_placeholders
    d = create_driver %[
      hostname testing.node.local
      service_email test@example.com
      service_pkcs12_path /path/to/key.p12
      project_id test-project
      bucket_id test-bucket
      path log/${hostname}/file.%Y%m%d%H.${chunk_id}.log
      buffer_type memory
    ]
    assert_equal 'log/testing.node.local/file.%Y%m%d%H.${chunk_id}.log', d.instance.path
    assert_equal '%Y%m%d%H', d.instance.time_slice_format
  end

  def test_path_format
    d = create_driver
    assert_equal 'log/access.20120718.${chunk_id}.log', d.instance.path_format('20120718')

    d = create_driver %[
      service_email test@example.com
      service_pkcs12_path /path/to/key.p12
      project_id test-project
      bucket_id test-bucket
      path log/access.%Y%m%d.%H%M.${chunk_id}.log
      buffer_type memory
    ]
    assert_equal '%Y%m%d%H%M', d.instance.time_slice_format
    assert_equal 'log/access.20120718.1503.${chunk_id}.log', d.instance.path_format('201207181503')
  end

  # configure must reject a path without the ${chunk_id} placeholder.
  def test_path_requires_chunk_id
    assert_raise Fluent::ConfigError do
      create_driver %[
        service_email test@example.com
        service_pkcs12_path /path/to/key.p12
        project_id test-project
        bucket_id test-bucket
        path log/access.%Y%m%d.%H%M.log
        buffer_type memory
      ]
    end
  end

  def test_chunk_unique_id_to_str
    d = create_driver
    assert_equal '000aff10', d.instance.chunk_unique_id_to_str([0x00, 0x0a, 0xff, 0x10].pack('C*'))
  end
end
77 |
--------------------------------------------------------------------------------