├── .gitignore ├── Gemfile ├── Gemfile.lock ├── LICENSE.txt ├── README.md ├── Rakefile ├── fluent-plugin-google-cloud-storage.gemspec ├── lib └── fluent │ └── plugin │ └── out_google_cloud_storage.rb └── test ├── helper.rb └── plugin └── test_out_google_cloud_storage.rb /.gitignore: -------------------------------------------------------------------------------- 1 | *.gem 2 | *.rbc 3 | /.config 4 | /coverage/ 5 | /InstalledFiles 6 | /pkg/ 7 | /spec/reports/ 8 | /test/tmp/ 9 | /test/version_tmp/ 10 | /tmp/ 11 | 12 | ## Specific to RubyMotion: 13 | .dat* 14 | .repl_history 15 | build/ 16 | 17 | ## Documentation cache and generated files: 18 | /.yardoc/ 19 | /_yardoc/ 20 | /doc/ 21 | /rdoc/ 22 | 23 | ## Environment normalisation: 24 | /.bundle/ 25 | /lib/bundler/man/ 26 | 27 | # for a library or gem, you might want to ignore these files since the code is 28 | # intended to run in multiple environments; otherwise, check them in: 29 | Gemfile.lock 30 | .ruby-version 31 | .ruby-gemset 32 | 33 | # unless supporting rvm < 1.11.0 or doing something fancy, ignore this: 34 | .rvmrc 35 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | 3 | # Specify your gem's dependencies in fluent-plugin-google-cloud-storage.gemspec 4 | gemspec 5 | -------------------------------------------------------------------------------- /Gemfile.lock: -------------------------------------------------------------------------------- 1 | PATH 2 | remote: .
3 | specs: 4 | fluent-plugin-google-cloud-storage (0.3.3) 5 | fluent-mixin-config-placeholders (>= 0.3.0) 6 | fluent-mixin-plaintextformatter (>= 0.2.1) 7 | fluentd (>= 0.10.53) 8 | google-api-client (~> 0.7) 9 | 10 | GEM 11 | remote: https://rubygems.org/ 12 | specs: 13 | addressable (2.3.6) 14 | autoparse (0.3.3) 15 | addressable (>= 2.3.1) 16 | extlib (>= 0.9.15) 17 | multi_json (>= 1.0.0) 18 | cool.io (1.2.4) 19 | extlib (0.9.16) 20 | faraday (0.9.0) 21 | multipart-post (>= 1.2, < 3) 22 | fluent-mixin-config-placeholders (0.3.0) 23 | fluentd 24 | uuidtools (>= 2.1.5) 25 | fluent-mixin-plaintextformatter (0.2.6) 26 | fluentd 27 | ltsv 28 | fluentd (0.10.53) 29 | cool.io (>= 1.1.1, < 2.0.0, != 1.2.0) 30 | http_parser.rb (>= 0.5.1, < 0.7.0) 31 | json (>= 1.4.3) 32 | msgpack (>= 0.4.4, < 0.6.0, != 0.5.3, != 0.5.2, != 0.5.1, != 0.5.0) 33 | sigdump (~> 0.2.2) 34 | yajl-ruby (~> 1.0) 35 | google-api-client (0.7.1) 36 | addressable (>= 2.3.2) 37 | autoparse (>= 0.3.3) 38 | extlib (>= 0.9.15) 39 | faraday (>= 0.9.0) 40 | jwt (>= 0.1.5) 41 | launchy (>= 2.1.1) 42 | multi_json (>= 1.0.0) 43 | retriable (>= 1.4) 44 | signet (>= 0.5.0) 45 | uuidtools (>= 2.1.0) 46 | http_parser.rb (0.6.0) 47 | json (1.8.1) 48 | jwt (1.0.0) 49 | launchy (2.4.2) 50 | addressable (~> 2.3) 51 | ltsv (0.1.0) 52 | msgpack (0.5.9) 53 | multi_json (1.10.1) 54 | multipart-post (2.0.0) 55 | rake (10.3.2) 56 | retriable (1.4.1) 57 | sigdump (0.2.2) 58 | signet (0.5.1) 59 | addressable (>= 2.2.3) 60 | faraday (>= 0.9.0.rc5) 61 | jwt (>= 0.1.5) 62 | multi_json (>= 1.0.0) 63 | uuidtools (2.1.5) 64 | yajl-ruby (1.2.1) 65 | 66 | PLATFORMS 67 | ruby 68 | 69 | DEPENDENCIES 70 | fluent-plugin-google-cloud-storage! 
71 | rake 72 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2012- TAGOMORI Satoshi 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # fluent-plugin-google-cloud-storage 2 | 3 | [![Gem Version](https://badge.fury.io/rb/fluent-plugin-google-cloud-storage.svg)](https://badge.fury.io/rb/fluent-plugin-google-cloud-storage) 4 | 5 | [Fluentd](http://fluentd.org/) output plugin to write data into a [Google Cloud 6 | Storage](https://cloud.google.com/storage/) bucket. 7 | 8 | GoogleCloudStorageOutput slices data by time (in the configured unit) and stores 9 | each slice as a plain-text file. You can choose to: 10 | 11 | * format whole data as serialized JSON, single attribute or separated multi attributes 12 | * or LTSV, labeled-TSV (see http://ltsv.org/ ) 13 | * include time as line header, or not 14 | * include tag as line header, or not 15 | * change field separator (default: TAB) 16 | * add new line as termination, or not 17 | 18 | You can specify the output file path as 'path path/to/dir/access.%Y%m%d.${chunk_id}.log' (the `${chunk_id}` placeholder is required), and then get objects such as 'path/to/dir/access.20120316.CHUNK_ID.log' in your GCS bucket, where CHUNK_ID is the hex id of the flushed buffer chunk.
19 | 20 | ## Configuration 21 | 22 | ### Examples 23 | 24 | #### Complete Example 25 | 26 | # tail 27 | 28 | type tail 29 | format none 30 | path /tmp/test.log 31 | pos_file /var/log/td-agent/test.pos 32 | tag tail.test 33 | 34 | 35 | # post to GCS 36 | 37 | type google_cloud_storage 38 | service_email xxx.xxx.com 39 | service_pkcs12_path /etc/td-agent/My_First_Project-xxx.p12 40 | project_id handy-compass-xxx 41 | bucket_id test_bucket 42 | path tail.test/%Y/%m/%d/%H/${hostname}/${chunk_id}.log.gz 43 | output_include_time false 44 | output_include_tag false 45 | buffer_path /var/log/td-agent/buffer/tail.test 46 | # flush_interval 600s 47 | buffer_chunk_limit 128m 48 | time_slice_wait 300s 49 | compress gzip 50 | 51 | 52 | #### More Examples 53 | 54 | To store data by `time,tag,json` (the same format as 'type file') with GCS: 55 | 56 | 57 | type google_cloud_storage 58 | service_email SERVICE_ACCOUNT_EMAIL 59 | service_pkcs12_path /path/to/key.p12 60 | project_id name-of-project 61 | bucket_id name-of-bucket 62 | path path/to/access.%Y%m%d_%H.${chunk_id}.log 63 | 64 | 65 | To specify the pkcs12 file's password, use `service_pkcs12_password`: 66 | 67 | 68 | type google_cloud_storage 69 | service_email SERVICE_ACCOUNT_EMAIL 70 | service_pkcs12_path /path/to/key.p12 71 | service_pkcs12_password SECRET_PASSWORD 72 | project_id name-of-project 73 | bucket_id name-of-bucket 74 | path path/to/access.%Y%m%d_%H.${chunk_id}.log 75 | 76 | 77 | If you want JSON object only (without time or tag or both on header of lines), specify it by `output_include_time` or `output_include_tag` (default true): 78 | 79 | 80 | type google_cloud_storage 81 | service_email SERVICE_ACCOUNT_EMAIL 82 | service_pkcs12_path /path/to/key.p12 83 | project_id name-of-project 84 | bucket_id name-of-bucket 85 | path path/to/access.%Y%m%d_%H.${chunk_id}.log 86 | output_include_time false 87 | output_include_tag false 88 | 89 | 90 | To store data as LTSV without time and tag in GCS: 91 | 92 | 93 | type
google_cloud_storage 94 | # ... 95 | output_data_type ltsv 96 | 97 | 98 | Store data as TSV (TAB separated values) of specified keys, without time, with tag (removed prefix 'access'): 99 | 100 | 101 | type google_cloud_storage 102 | # ... 103 | 104 | field_separator TAB # or 'SPACE', 'COMMA' or 'SOH'(Start Of Heading: \001) 105 | output_include_time false 106 | output_include_tag true 107 | remove_prefix access 108 | 109 | output_data_type attr:path,status,referer,agent,bytes 110 | 111 | 112 | If a message doesn't have a specified attribute, fluent-plugin-google-cloud-storage outputs 'NULL' in place of its value. 113 | 114 | To store data compressed (gzip only now): 115 | 116 | 117 | type google_cloud_storage 118 | # ... 119 | 120 | compress gzip 121 | 122 | 123 | ### Major Caveat 124 | 125 | As GCS does not support appending to files, if you have multiple fluentd nodes, 126 | you most likely want each to log to separate files. You can use '${hostname}' or 127 | '${uuid:random}' placeholders in configuration for this purpose. 128 | 129 | Note the `${chunk_id}` placeholder in the following paths. The plugin requires the presence 130 | of the placeholder to guarantee that each flush will not overwrite an existing 131 | file. 132 | 133 | For hostname: 134 | 135 | 136 | type google_cloud_storage 137 | # ... 138 | path log/access/%Y%m%d/${hostname}.${chunk_id}.log 139 | 140 | 141 | Or with random filename (to avoid duplicated file name only): 142 | 143 | 144 | type google_cloud_storage 145 | # ... 146 | path log/access/%Y%m%d/${uuid:random}.${chunk_id}.log 147 | 148 | 149 | With the configurations above, you can handle all of the files under 150 | '/log/access/20120820/*' as specified timeslice access logs. 151 | 152 | ## TODO 153 | 154 | * docs? 155 | * patches welcome!
156 | 157 | ## Copyright 158 | 159 | * Copyright (c) 2014- Hsiu-Fan Wang (hfwang@porkbuns.net) 160 | * License 161 | * Apache License, Version 2.0 162 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env rake 2 | require "bundler/gem_tasks" 3 | 4 | require 'rake/testtask' 5 | Rake::TestTask.new(:test) do |test| 6 | test.libs << 'lib' << 'test' 7 | test.pattern = 'test/**/test_*.rb' 8 | test.verbose = true 9 | end 10 | 11 | task :default => :test 12 | -------------------------------------------------------------------------------- /fluent-plugin-google-cloud-storage.gemspec: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | 3 | Gem::Specification.new do |gem| 4 | gem.name = "fluent-plugin-google-cloud-storage" 5 | gem.version = "1.0.0" 6 | gem.authors = ["Hsiu-Fan Wang"] 7 | gem.email = ["hfwang@porkbuns.net"] 8 | gem.summary = %q{Fluentd plugin to write data to Google Cloud Storage} 9 | gem.description = %q{Google Cloud Storage fluentd output} 10 | gem.homepage = "https://github.com/hfwang/fluent-plugin-google-cloud-storage" 11 | gem.license = "APLv2" 12 | 13 | gem.files = `git ls-files`.split($\) 14 | gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) } 15 | gem.test_files = gem.files.grep(%r{^(test|spec|features)/}) 16 | gem.require_paths = ["lib"] 17 | 18 | gem.add_development_dependency "rake" 19 | gem.add_runtime_dependency "fluentd", '>= 0.10.53' 20 | gem.add_runtime_dependency "fluent-mixin-plaintextformatter", '>= 0.2.1' 21 | gem.add_runtime_dependency "fluent-mixin-config-placeholders", ">= 0.3.0" 22 | gem.add_runtime_dependency "google-api-client", '0.8.6' 23 | gem.add_runtime_dependency "mime-types", '>= 3.0' 24 | end 25 | -------------------------------------------------------------------------------- 
/lib/fluent/plugin/out_google_cloud_storage.rb: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | require 'fluent/mixin/config_placeholders' 4 | require 'fluent/mixin/plaintextformatter' 5 | require 'fluent/log' 6 | 7 | class Fluent::GoogleCloudStorageOutput < Fluent::TimeSlicedOutput 8 | Fluent::Plugin.register_output('google_cloud_storage', self) 9 | 10 | config_set_default :buffer_type, 'file' 11 | config_set_default :time_slice_format, '%Y%m%d' 12 | 13 | config_param :ignore_start_check_error, :bool, :default => false 14 | 15 | include Fluent::Mixin::ConfigPlaceholders 16 | 17 | config_param :service_email, :string 18 | config_param :service_pkcs12_path, :string 19 | config_param :service_pkcs12_password, :string, :default => "notasecret" 20 | config_param :project_id, :string 21 | config_param :bucket_id, :string 22 | config_param :path, :string 23 | 24 | config_param :compress, :default => nil do |val| 25 | unless ["gz", "gzip"].include?(val) 26 | raise ConfigError, "Unsupported compression algorithm '#{val}'" 27 | end 28 | val 29 | end 30 | 31 | # how many times of write failure before switch to standby namenode 32 | # by default it's 11 times that costs 1023 seconds inside fluentd, 33 | # which is considered enough to exclude the scenes that caused by temporary network fail or single datanode fail 34 | config_param :failures_before_use_standby, :integer, :default => 11 35 | 36 | include Fluent::Mixin::PlainTextFormatter 37 | 38 | config_param :default_tag, :string, :default => 'tag_missing' 39 | 40 | CHUNK_ID_PLACE_HOLDER = '${chunk_id}' 41 | 42 | def initialize 43 | super 44 | require 'zlib' 45 | require 'net/http' 46 | require 'time' 47 | require 'google/api_client' 48 | require 'signet/oauth_2/client' 49 | require 'mime-types' 50 | end 51 | 52 | # Define `log` method for v0.10.42 or earlier 53 | unless method_defined?(:log) 54 | define_method("log") { $log } 55 | end 56 | 57 | def 
call_google_api(params) 58 | # refresh_auth 59 | if @google_api_client.authorization.expired? 60 | @google_api_client.authorization.fetch_access_token! 61 | end 62 | return @google_api_client.execute(params) 63 | end 64 | 65 | def configure(conf) 66 | if conf['path'] 67 | if conf['path'].index('%S') 68 | conf['time_slice_format'] = '%Y%m%d%H%M%S' 69 | elsif conf['path'].index('%M') 70 | conf['time_slice_format'] = '%Y%m%d%H%M' 71 | elsif conf['path'].index('%H') 72 | conf['time_slice_format'] = '%Y%m%d%H' 73 | end 74 | end 75 | 76 | super 77 | 78 | @client = prepare_client() 79 | 80 | if @path.index(CHUNK_ID_PLACE_HOLDER).nil? 81 | raise Fluent::ConfigError, "path must contain ${chunk_id}, which is the placeholder for chunk_id, when append is set to false." 82 | end 83 | end 84 | 85 | def prepare_client 86 | @google_api_client = Google::APIClient.new( 87 | :application_name => "fluent-plugin-google-cloud-storage", 88 | :user_agent => "fluent-plugin-google-cloud-storage/1.0.0 (gzip)", 89 | :application_version => "0.3.1") 90 | begin 91 | key = Google::APIClient::KeyUtils.load_from_pkcs12( 92 | @service_pkcs12_path, @service_pkcs12_password) 93 | @google_api_client.authorization = Signet::OAuth2::Client.new( 94 | token_credential_uri: "https://accounts.google.com/o/oauth2/token", 95 | audience: "https://accounts.google.com/o/oauth2/token", 96 | issuer: @service_email, 97 | scope: "https://www.googleapis.com/auth/devstorage.read_write", 98 | signing_key: key) 99 | @google_api_client.authorization.fetch_access_token! 
100 | rescue Signet::AuthorizationError 101 | raise Fluent::ConfigError, "Error occurred authenticating with Google" 102 | end 103 | @storage_api = @google_api_client.discovered_api("storage", "v1") 104 | return @google_api_client 105 | end 106 | 107 | def start 108 | super 109 | end 110 | 111 | def shutdown 112 | super 113 | end 114 | 115 | def path_format(chunk_key) 116 | path = Time.strptime(chunk_key, @time_slice_format).strftime(@path) 117 | log.debug "GCS Path: #{path}" 118 | path 119 | end 120 | 121 | def chunk_unique_id_to_str(unique_id) 122 | unique_id.unpack('C*').map{|x| x.to_s(16).rjust(2,'0')}.join('') 123 | end 124 | 125 | def send_data(path, data) 126 | mimetype = MIME::Types.type_for(path).first 127 | 128 | io = nil 129 | if ["gz", "gzip"].include?(@compress) 130 | io = StringIO.new("") 131 | writer = Zlib::GzipWriter.new(io) 132 | writer.write(data) 133 | writer.finish 134 | io.rewind 135 | else 136 | io = StringIO.new(data) 137 | end 138 | 139 | media = Google::APIClient::UploadIO.new(io, mimetype.content_type, File.basename(path)) 140 | 141 | call_google_api(api_method: @storage_api.objects.insert, 142 | parameters: { 143 | uploadType: "multipart", 144 | project: @project_id, 145 | bucket: @bucket_id, 146 | name: path 147 | }, 148 | body_object: { contentType: media.content_type }, 149 | media: media) 150 | end 151 | 152 | def write(chunk) 153 | hdfs_path = path_format(chunk.key).gsub(CHUNK_ID_PLACE_HOLDER, chunk_unique_id_to_str(chunk.unique_id)) 154 | 155 | send_data(hdfs_path, chunk.read) 156 | 157 | hdfs_path 158 | end 159 | end 160 | -------------------------------------------------------------------------------- /test/helper.rb: -------------------------------------------------------------------------------- 1 | require 'rubygems' 2 | require 'bundler' 3 | begin 4 | Bundler.setup(:default, :development) 5 | rescue Bundler::BundlerError => e 6 | $stderr.puts e.message 7 | $stderr.puts "Run `bundle install` to install missing gems" 8 | exit 
e.status_code 9 | end 10 | require 'test/unit' 11 | 12 | $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib')) 13 | $LOAD_PATH.unshift(File.dirname(__FILE__)) 14 | require 'fluent/test' 15 | unless ENV.has_key?('VERBOSE') 16 | nulllogger = Object.new 17 | nulllogger.instance_eval {|obj| 18 | def method_missing(method, *args) 19 | # pass 20 | end 21 | } 22 | $log = nulllogger 23 | end 24 | 25 | require 'fluent/plugin/out_google_cloud_storage' 26 | 27 | class Test::Unit::TestCase 28 | end 29 | -------------------------------------------------------------------------------- /test/plugin/test_out_google_cloud_storage.rb: -------------------------------------------------------------------------------- 1 | require 'helper' 2 | 3 | class WebHDFSOutputTest < Test::Unit::TestCase 4 | CONFIG = %[ 5 | host namenode.local 6 | path /hdfs/path/file.%Y%m%d.log 7 | ] 8 | 9 | def create_driver(conf=CONFIG,tag='test') 10 | Fluent::Test::OutputTestDriver.new(Fluent::WebHDFSOutput, tag).configure(conf) 11 | end 12 | 13 | def test_configure 14 | d = create_driver 15 | assert_equal 'namenode.local', d.instance.instance_eval{ @namenode_host } 16 | assert_equal 50070, d.instance.instance_eval{ @namenode_port } 17 | assert_equal '/hdfs/path/file.%Y%m%d.log', d.instance.path 18 | assert_equal '%Y%m%d', d.instance.time_slice_format 19 | assert_equal false, d.instance.httpfs 20 | assert_nil d.instance.username 21 | assert_equal false, d.instance.ignore_start_check_error 22 | 23 | assert_equal true, d.instance.output_include_time 24 | assert_equal true, d.instance.output_include_tag 25 | assert_equal 'json', d.instance.output_data_type 26 | assert_nil d.instance.remove_prefix 27 | assert_equal 'TAB', d.instance.field_separator 28 | assert_equal true, d.instance.add_newline 29 | assert_equal 'tag_missing', d.instance.default_tag 30 | 31 | d = create_driver %[ 32 | namenode server.local:14000 33 | path /hdfs/path/file.%Y%m%d.%H%M.log 34 | httpfs yes 35 | username hdfs_user 36 | ] 37 
| assert_equal 'server.local', d.instance.instance_eval{ @namenode_host } 38 | assert_equal 14000, d.instance.instance_eval{ @namenode_port } 39 | assert_equal '/hdfs/path/file.%Y%m%d.%H%M.log', d.instance.path 40 | assert_equal '%Y%m%d%H%M', d.instance.time_slice_format 41 | assert_equal true, d.instance.httpfs 42 | assert_equal 'hdfs_user', d.instance.username 43 | end 44 | 45 | def test_configure_placeholders 46 | d = create_driver %[ 47 | hostname testing.node.local 48 | namenode server.local:50070 49 | path /hdfs/${hostname}/file.%Y%m%d%H.log 50 | ] 51 | assert_equal '/hdfs/testing.node.local/file.%Y%m%d%H.log', d.instance.path 52 | end 53 | 54 | def test_path_format 55 | d = create_driver 56 | assert_equal '/hdfs/path/file.%Y%m%d.log', d.instance.path 57 | assert_equal '%Y%m%d', d.instance.time_slice_format 58 | assert_equal '/hdfs/path/file.20120718.log', d.instance.path_format('20120718') 59 | 60 | d = create_driver %[ 61 | namenode server.local:14000 62 | path /hdfs/path/file.%Y%m%d.%H%M.log 63 | ] 64 | assert_equal '/hdfs/path/file.%Y%m%d.%H%M.log', d.instance.path 65 | assert_equal '%Y%m%d%H%M', d.instance.time_slice_format 66 | assert_equal '/hdfs/path/file.20120718.1503.log', d.instance.path_format('201207181503') 67 | 68 | assert_raise Fluent::ConfigError do 69 | d = create_driver %[ 70 | namenode server.local:14000 71 | path /hdfs/path/file.%Y%m%d.%H%M.log 72 | append false 73 | ] 74 | end 75 | end 76 | end 77 | --------------------------------------------------------------------------------