├── .gitignore
├── lib
│   ├── heap-rs3-segment
│   │   ├── version.rb
│   │   ├── logging.rb
│   │   ├── processors
│   │   │   └── segment.rb
│   │   └── loader.rb
│   └── heap-rs3-segment.rb
├── Gemfile
├── heap-rs3-segment.gemspec
├── bin
│   └── heap-rs3-segment
└── README.md

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.env
Gemfile.lock

--------------------------------------------------------------------------------
/lib/heap-rs3-segment/version.rb:
--------------------------------------------------------------------------------
module HeapRS3Segment
  VERSION = '0.1.6'
end

--------------------------------------------------------------------------------
/Gemfile:
--------------------------------------------------------------------------------
source 'https://rubygems.org'
gemspec

# we need latest master unless they release version > 2.2.6
gem 'analytics-ruby', '~> 2.0', require: 'segment/analytics', github: 'segmentio/analytics-ruby'

--------------------------------------------------------------------------------
/lib/heap-rs3-segment.rb:
--------------------------------------------------------------------------------
require 'heap-rs3-segment/logging'
require 'heap-rs3-segment/loader'
require 'heap-rs3-segment/processors/segment'

module HeapRS3Segment
  MANIFEST_REGEXP = /\/sync_\d+\.json$/
  MANIFEST_BUCKET_PREFIX = 'manifests/sync_'

  def self.logger
    HeapRS3Segment::Logging.logger
  end

  def self.logger=(log)
    HeapRS3Segment::Logging.logger = log
  end
end
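
The module-level logger accessors above make it easy to swap or silence the gem's logging before running a sync. A minimal sketch (the log file name is illustrative):

```
require 'heap-rs3-segment'

# route gem logging to a file instead of STDOUT
HeapRS3Segment.logger = Logger.new('heap-rs3.log')

# or silence it entirely - the setter falls back to Logger.new(File::NULL) for nil
HeapRS3Segment.logger = nil
```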
--------------------------------------------------------------------------------
/lib/heap-rs3-segment/logging.rb:
--------------------------------------------------------------------------------
require 'time'
require 'logger'

module HeapRS3Segment
  module Logging
    class Pretty < Logger::Formatter
      def call(severity, time, program_name, message)
        "#{time.utc.iso8601} #{severity} #{message}\n"
      end
    end

    def self.initialize_logger(log_target = STDOUT)
      @logger = Logger.new(log_target)
      @logger.level = Logger::DEBUG # Logger::INFO
      @logger.formatter = Pretty.new
      @logger
    end

    def self.logger
      defined?(@logger) ? @logger : initialize_logger
    end

    def self.logger=(log)
      @logger = log ? log : Logger.new(File::NULL)
    end

    def logger
      self.class.logger
    end
  end
end

--------------------------------------------------------------------------------
/heap-rs3-segment.gemspec:
--------------------------------------------------------------------------------
require File.expand_path('../lib/heap-rs3-segment/version', __FILE__)

Gem::Specification.new do |gem|
  gem.authors       = ['Troex Nevelin (Sergey B)']
  gem.email         = ['troex@upserver24.com']
  gem.summary       = 'HEAP Retrospective S3 to Segment processor'
  gem.description   = 'Reads HEAP Retrospective S3 Syncs from AWS S3 buckets and processes them as Segment events'
  gem.homepage      = 'https://attributionapp.com'
  gem.license       = 'MIT'

  gem.executables   = ['heap-rs3-segment']
  gem.files         = `git ls-files | grep -Ev '^(test|myapp|examples)'`.split("\n")
  gem.test_files    = []
  gem.name          = 'heap-rs3-segment'
  gem.require_paths = ['lib']
  gem.version       = HeapRS3Segment::VERSION
  gem.required_ruby_version = '>= 2.2.2'

  gem.add_dependency 'avro'
  gem.add_dependency 'snappy'
  gem.add_dependency 'aws-sdk-s3', '~> 1'
  gem.add_dependency 'analytics-ruby', '~> 2.0'
  gem.add_dependency 'activesupport'

  gem.add_development_dependency 'pry'
  gem.add_development_dependency 'dotenv'
end

--------------------------------------------------------------------------------
/lib/heap-rs3-segment/processors/segment.rb:
--------------------------------------------------------------------------------
module HeapRS3Segment
  module Processors
    class Segment
      attr_accessor :analytics, :max_queue_size

      def initialize(*args)
        @analytics = ::Segment::Analytics.new(*args)

        # read the client's max queue size so we can flush before it fills up
        @max_queue_size = @analytics.
          instance_variable_get('@client').
          instance_variable_get('@max_queue_size')
      end

      def check_flush_queue!
        if @analytics.queued_messages >= @max_queue_size
          t = Time.now
          HeapRS3Segment.logger.info "Max queue size reached - #{@analytics.queued_messages}, flushing"
          @analytics.flush
          diff = Time.now - t
          rate = (@max_queue_size / diff).to_i
          HeapRS3Segment.logger.info "Flush done in #{diff} seconds (#{rate} req/sec), continue"
        end
      end

      def track(attrs)
        check_flush_queue!
        @analytics.track(attrs)
      end

      def identify(attrs)
        check_flush_queue!
        @analytics.identify(attrs)
      end

      def page(attrs)
        check_flush_queue!
        @analytics.page(attrs)
      end

      def alias(attrs)
        check_flush_queue!
        @analytics.alias(attrs)
      end
    end
  end
end
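
The Loader (defined in loader.rb below) only ever calls #track, #identify, #page and #alias on its processor, so any object responding to those four methods can stand in for the bundled class - the README's Services::Segment::HeapProcessor is one example. A minimal sketch of a hypothetical dry-run processor that just prints payloads:

```
# InspectingProcessor is illustrative only - it is not part of the gem
class InspectingProcessor
  [:track, :identify, :page, :alias].each do |type|
    define_method(type) do |attrs|
      puts "#{type}: #{attrs.inspect}"
    end
  end
end
```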
--------------------------------------------------------------------------------
/bin/heap-rs3-segment:
--------------------------------------------------------------------------------
#!/usr/bin/env ruby

require 'bundler/setup'

ENV['APP_ENV'] ||= 'development'

if ENV['APP_ENV'] == 'development'
  require 'dotenv'
  Dotenv.load
end

if ENV['ROLLBAR_ACCESS_TOKEN']
  require 'rollbar'
  Rollbar.configure do |config|
    config.access_token = ENV['ROLLBAR_ACCESS_TOKEN']
    config.environment = ENV['APP_ENV']
  end
end

project_id, aws_s3_bucket, *_ = ARGV

# ENV fallback
project_id ||= ENV['PROJECT_ID']
aws_s3_bucket ||= ENV['AWS_S3_BUCKET']

puts "Starting... (project_id: #{project_id}, aws_s3_bucket: #{aws_s3_bucket})"

raise ArgumentError, 'project_id and aws_s3_bucket are required' unless project_id && aws_s3_bucket

require 'segment/analytics'
# dirty patch segment library
Segment::Analytics::Defaults::Request::HOST = ENV['TRACKING_ENDPOINT']
Segment::Analytics::Defaults::MessageBatch::MAX_BYTES = 5_120_000 # 5 MB
# Segment::Analytics::Defaults::MessageBatch::MAX_SIZE = 1_000

require 'heap-rs3-segment'

processor = HeapRS3Segment::Processors::Segment.new(
  write_key: project_id,
  on_error: Proc.new { |status, msg| print msg },
  max_queue_size: 10_000,
  batch_size: 500
  # stub: true
)

HeapRS3Segment::Loader.new(
  processor,
  project_id,
  aws_s3_bucket,
  ENV['AWS_ACCESS_KEY_ID'],
  ENV['AWS_SECRET_ACCESS_KEY']
).call

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Heap Retrospective S3 processor

### Sample usage

```
args = {
  project_identifier: 'XXXXXX',
  s3_bucket: 'heap-rs3-atb-NNNN',
  revenue_fallback: 'revenue'
}

processor = Services::Segment::HeapProcessor.new(args[:project_identifier])
require 'heap-rs3-segment'

heap_rs3 = HeapRS3Segment::Loader.new(
  processor,
  args[:project_identifier],
  args[:s3_bucket],
  ENV['S3_DATABASE_EXPORT_ID'],
  ENV['S3_DATABASE_EXPORT_KEY']
)
heap_rs3.revenue_fallback << args[:revenue_fallback]
heap_rs3.skip_tables << 'global_view_any_page' << 'mg_taxonomy_page_viewed'
heap_rs3.process_single_sync = false

# heap_rs3.identify_only_users = true

heap_rs3.call
```
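
If you don't have a custom processor class, the bundled `HeapRS3Segment::Processors::Segment` (the same class `bin/heap-rs3-segment` uses) can be passed to the Loader instead - a minimal sketch, assuming a valid Segment write key:

```
require 'heap-rs3-segment'

processor = HeapRS3Segment::Processors::Segment.new(
  write_key: 'YOUR_SEGMENT_WRITE_KEY', # placeholder
  on_error: Proc.new { |status, msg| print msg }
)
```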

### Specific manifest with skip logic

```
args = {
  project_identifier: 'XXXXXX',
  s3_bucket: 'heap-rs3-atb-NNN',
  revenue_fallback: 'revenue'
}
processor = Services::Segment::HeapProcessor.new(args[:project_identifier])
require 'heap-rs3-segment'

heap_rs3 = HeapRS3Segment::Loader.new(
  processor,
  args[:project_identifier],
  args[:s3_bucket],
  ENV['S3_DATABASE_EXPORT_ID'],
  ENV['S3_DATABASE_EXPORT_KEY']
)
heap_rs3.revenue_fallback << args[:revenue_fallback]

manifest_sync = 'manifests/sync_1008461850.json'
list_opts = { bucket: heap_rs3.aws_s3_bucket, prefix: manifest_sync, delimiter: '/' }
resp = heap_rs3.s3.list_objects_v2(list_opts)
obj = resp.contents.first
heap_rs3.skip_before = Time.utc(2025, 1, 1)
heap_rs3.identify_only_users = true
heap_rs3.skip_types = [:track]
heap_rs3.skip_file = ->(file) do
  if match = file.match(/pageviews\/part-(\d+)/)
    # skip files with part number OUTSIDE of 10000..10999,
    # i.e. only process files that match pageviews/part-10???
    unless (10000..10999).include?(match[1].to_i)
      return true
    end
  end
end
heap_rs3.process_sync(obj)
```

--------------------------------------------------------------------------------
/lib/heap-rs3-segment/loader.rb:
--------------------------------------------------------------------------------
require 'set'
require 'aws-sdk-s3'
require 'avro'
require 'active_support/time'
require 'active_support/core_ext/object/blank'

module HeapRS3Segment
  class Loader
    # HEAP Data Schema
    # https://help.heap.io/hc/en-us/articles/18700033317020-Heap-Connect-Data-Schema

    AWS_S3_DEFAULT_REGION = 'us-east-1'

    attr_accessor :processor, :project_identifier, :aws_s3_bucket, :prompt, :process_single_sync,
                  :identify_only_users, :alias_on_identify, :revenue_mapping, :revenue_fallback, :user_id_prop,
                  :skip_types, :skip_tables, :skip_before, :skip_file, :s3, :alias_cache, :alias_cache_reverse,
                  :session_cache

    def initialize(processor, project_identifier, aws_s3_bucket, aws_access_key_id, aws_secret_access_key, aws_region = nil)
      Time.zone = 'UTC'
      @alias_cache = {}
      @alias_cache_reverse = {}
      @session_cache = {}

      @processor = processor
      @project_identifier = project_identifier

      @s3 = Aws::S3::Client.new(
        access_key_id: aws_access_key_id,
        secret_access_key: aws_secret_access_key,
        region: aws_region || AWS_S3_DEFAULT_REGION
      )
      @aws_s3_bucket = aws_s3_bucket
      @aws_s3_bucket_prefix = MANIFEST_BUCKET_PREFIX

      @prompt = true
      @process_single_sync = true # stops after one sync is processed
      @skip_types = [] # [:page, :track, :identify, :alias]
      @skip_tables = ['_event_metadata']
      @skip_before = nil
      @skip_file = nil # skip file if it matches
      @identify_only_users = false # useful on initial import when we don't need to identify anonymous users
      @alias_on_identify = true # find migrated user in @alias_cache and fire alias events for identified users
      @revenue_mapping = {}
      @revenue_fallback = []
      @user_id_prop = 'identity'
    end

    def call
      scan_manifests.each do |obj|
        already_synced = begin
          @s3.head_object({ bucket: @aws_s3_bucket, key: "imported_#{obj.key}" }) && true
        rescue Aws::S3::Errors::NotFound
          nil
        end

        next if already_synced

        if @prompt
          require 'pry'
          logger.debug 'Ready to process ' + obj.key + ', type "exit!" to interrupt, "already_synced = true" to skip this sync, set @skip_types to skip certain event types and CTRL-D to continue'
          binding.pry

          next if already_synced
        end

        process_sync(obj)
        break if @process_single_sync
      end
    end

    def logger
      HeapRS3Segment.logger
    end

    def scan_manifests
      list_opts = { bucket: @aws_s3_bucket, prefix: @aws_s3_bucket_prefix, delimiter: '/' }
      all_objects = []
      while true
        resp = @s3.list_objects_v2(list_opts)
        all_objects += resp.contents

        break unless resp.next_continuation_token.present?

        list_opts[:continuation_token] = resp.next_continuation_token
      end
      all_objects.select { |obj| obj.key.match(MANIFEST_REGEXP) }.sort_by(&:key)
    end
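
    # Processed manifests are marked by copying them under an "imported_" key
    # (e.g. manifests/sync_123.json -> imported_manifests/sync_123.json);
    # scan_manifests never lists those copies because it only looks under the
    # 'manifests/sync_' prefix, and #call double-checks with head_object above.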
    def mark_manifest_as_synced(obj)
      @s3.copy_object(
        copy_source: "#{@aws_s3_bucket}/#{obj.key}",
        bucket: @aws_s3_bucket,
        key: "imported_#{obj.key}"
      )
    end

    def process_sync(obj)
      # reset caches on every sync
      @alias_cache.clear
      @alias_cache_reverse.clear
      @session_cache.clear

      start_time = Time.now.utc
      manifest = get_manifest(obj)
      process_manifest(manifest)
      mark_manifest_as_synced(obj)

      diff = Time.now.utc - start_time
      logger.info "Done syncing #{obj.key} in #{diff.to_i} seconds"
    end

    def get_manifest(obj)
      logger.info "Reading #{obj.key}"

      manifest = s3_get_file(obj)
      JSON.parse(manifest.body.read)
    end

    def process_manifest(manifest)
      logger.info "Processing manifest(dump_id: #{manifest['dump_id']})"

      # skip tables we don't need, e.g. "sessions"
      tables = manifest['tables'].reject { |table| @skip_tables.include?(table['name']) }

      # custom sorter - aliases first, then any events, then sessions and pageviews, finally identify
      index_type_name = ->(table) {
        idx_type = case table['name']
                   when 'user_migrations' then [1, :alias]
                   when 'sessions' then [3, :session]
                   when 'pageviews' then [4, :page]
                   when 'users' then [5, :identify]
                   else [2, :track]
                   end
        idx_type << table['name']
      }
      tables.sort_by!(&index_type_name)
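      # e.g. tables named user_migrations, purchases, sessions, pageviews and
      # users sort as [1, :alias], [2, :track], [3, :session], [4, :page],
      # [5, :identify], so aliases and sessions are cached before the
      # pageviews and users tables that consume those caches are processed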

      tables.each do |table|
        table['type'] = index_type_name.call(table)[1]
        logger.info "Order key #{index_type_name.call(table)}"
      end

      tables.each do |table|
        process_table(table)
      end
    end

    def process_table(table)
      event_name = table['name'].split('_').map(&:capitalize).join(' ')
      logger.info "Processing table(#{table['name']}) - \"#{event_name}\" event"

      files = table['files'].sort

      # users and user_migrations parts are numbered, so sort them numerically
      if ['users', 'user_migrations'].include?(table['name'])
        files.sort_by! do |path|
          filename = path.split('/').last
          filename.split('_').first.to_i
        end
      end

      files.each do |file|
        next if @skip_types.include?(table['type'])
        process_file(file, table['type'], event_name)
      end
    end

    def process_file(file, type, event_name)
      # example skip logic
      # heap_rs3.skip_file = ->(file) do
      #   if match = file.match(/pageviews\/part-(\d+)/)
      #     # skip files with part number outside of 10000..10999,
      #     # i.e. only process files that match pageviews/part-10???
      #     unless (10000..10999).include?(match[1].to_i)
      #       return true
      #     end
      #   end
      # end

      if @skip_file.is_a?(Proc) && @skip_file.call(file)
        logger.info "Skipping file(#{file})"
        return
      end

      logger.info "Processing file(#{file})"

      load_start_time = Time.now.utc
      s3_file = s3_get_file(file)
      reader = Avro::IO::DatumReader.new
      avro = Avro::DataFile::Reader.new(s3_file.body, reader)
      load_diff = Time.now.utc - load_start_time

      counter = 0
      skipped = 0
      start_time = Time.now.utc # we start the timer after the file is read from S3

      avro.each do |hash|
        # TODO sample raw logger
        # if counter % 10_000 == 0
        # if counter == 0
        #   logger.info hash.inspect
        # end

        result = case type
                 when :track
                   track(hash, event_name)
                 when :session
                   store_session(hash)
                 when :alias
                   store_alias(hash)
                 when :manual_alias # useful for manually processing user_migrations as aliases
                   aliaz(hash)
                 else
                   send(type, hash)
                 end

        skipped += 1 if result.nil?
        counter += 1
      end

      diff = Time.now.utc - start_time
      if diff > 0
        logger.info "Done. Loading #{load_diff.to_i}s, processing #{diff.to_i}s, #{counter} rows, #{skipped} skipped (#{(counter / diff).to_i} rows/sec)"
      end
    end

    def parse_time(time)
      Time.zone.parse(time).utc
    end

    def parse_heap_timestamp(value)
      return unless value
      # HEAP timestamps are microseconds since epoch
      Time.at(value / 1_000_000).utc.iso8601
    rescue
      value
    end

    def wrap_cookie(heap_user_id, resolve = true)
      return nil unless heap_user_id

      resolved_user_id = resolve ? resolve_heap_user(heap_user_id) : heap_user_id
      "#{@project_identifier}|#{resolved_user_id}"
    end

    def common_payload(hash)
      heap_user_id = hash.delete('user_id')
      {
        anonymous_id: wrap_cookie(heap_user_id),
        message_id: "HEAP|#{hash.delete('event_id')}",
        timestamp: parse_time(hash.delete('time')),
        context: {
          'ip' => (hash.delete('ip') || hash.delete('browser_ip')),
          'library' => {
            'name' => 'HeapIntegration',
            'version' => '1.0'
          }
        },
        properties: {
          'heap_user_id' => heap_user_id
        }
      }
    end

    def skip_before?(timestamp)
      @skip_before && timestamp < @skip_before
    end
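
    # Revenue extraction in #track: @revenue_mapping maps a Segment event name
    # to the HEAP column holding revenue, e.g. { 'Purchase' => 'order_value' }
    # (hypothetical names); @revenue_fallback lists columns to try when no
    # mapping exists, e.g. heap_rs3.revenue_fallback << 'revenue' (see README).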
    def track(hash, event_name)
      payload = common_payload(hash)
      return if skip_before?(payload[:timestamp])

      payload[:event] = event_name
      payload[:properties].merge!(hash.reject { |_, v| v.nil? || v.to_s.bytesize > 200 })

      if revenue_field = @revenue_mapping[event_name]
        payload[:properties]['revenue'] ||= hash.delete(revenue_field.to_s)
      elsif @revenue_fallback.any?
        payload[:properties]['revenue'] ||= hash.values_at(*@revenue_fallback).compact.first
      end

      @processor.track(payload)
    end

    def page(hash)
      payload = common_payload(hash)
      return if skip_before?(payload[:timestamp])

      payload[:name] = 'Loaded a Page'

      # TODO detect mobile and send screen event instead
      url = case hash['library']
            when 'web'
              'https://' + hash.values_at('domain', 'path', 'query', 'hash').join
            when 'ios', 'android'
              "#{hash['library']}-app://" + hash.values_at('app_name', 'view_controller').compact.join('/')
            else
              'unknown://' + hash['event_id'].to_s
            end

      # UPDATE 2025-04-18 probably that doesn't exist anymore or was custom:
      # if `session_time` is present, detect the session referrer in place
      referrer = if hash['session_time']
        session_time = parse_time(hash.delete('session_time'))
        if session_time == payload[:timestamp]
          hash.delete('referrer')
        end
      end

      # UPDATE 2025-04-18 new logic to detect session start
      referrer ||= if @session_cache.has_key?(hash['session_id']) && (@session_cache[hash['session_id']] == payload[:timestamp])
        hash.delete('referrer')
      end

      # build previous page if referrer was not found above
      previous_page = hash.delete('previous_page') || hash.delete('heap_previous_page')
      if !referrer && previous_page
        referrer = if hash['domain'].present?
          'https://' + hash['domain'] + previous_page
        else # weird cases when previous_page is a local path, e.g. "/C:/Users/bk221/OneDrive/Desktop/NFL%20Draft%20Big%20Board.html"
          'unknown://' + previous_page
        end
      end

      payload[:properties] = {
        'referrer' => referrer,
        'title' => hash.delete('title'),
        'url' => url
      }

      @processor.page(payload)
    end

    def identify(hash)
      heap_user_id = hash.delete('user_id')
      email = hash.delete('email') || hash.delete('_email')
      identity = hash.delete('identity')

      # common workaround for heap? email used as identity
      if email.nil? && identity && identity.include?('@')
        email = identity
      end

      # uses email as USER_ID or sets it to null (if email is empty)
      if @user_id_prop == 'email'
        identity = email
      end

      payload = {
        anonymous_id: wrap_cookie(heap_user_id),
        user_id: identity,
        traits: {
          'email' => email,
          'identity' => identity,
          'heap_user_id' => heap_user_id,
          'join_date' => parse_heap_timestamp(hash.delete('joindate')),
          'last_modified' => parse_heap_timestamp(hash.delete('last_modified'))
        }.reject { |_, v| v.nil? }
      }

      payload[:traits] = hash.reject { |_, v| v.nil? }.merge(payload[:traits])

      return if @identify_only_users && payload[:user_id].nil?

      if @alias_on_identify
        # OLD approach - works slower on big user migrations but consumes less memory:
        # @alias_cache.
        #   select { |_, v| v == heap_user_id }.
        #   keys.
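        # NEW approach - @alias_cache_reverse (built by store_alias below) maps
        # to_user_id => Set of from_user_ids, making this an O(1) lookup;
        # e.g. after store_alias('from_user_id' => 111, 'to_user_id' => 222),
        # identifying heap user 222 fires an alias from 111 to 222.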
        if @alias_cache_reverse.has_key?(heap_user_id)
          @alias_cache_reverse[heap_user_id].each do |from_user_id|
            alias_payload = {
              'from_user_id' => from_user_id,
              'to_user_id' => heap_user_id
            }
            aliaz(alias_payload)
          end
        end
      end

      @processor.identify(payload)
    end

    def aliaz(hash)
      payload = {
        previous_id: wrap_cookie(hash['from_user_id'], false),
        anonymous_id: wrap_cookie(hash['to_user_id'], false)
      }
      p payload if @prompt

      @processor.alias(payload)
    end

    def store_session(hash)
      @session_cache[hash['session_id']] = parse_time(hash['time'])
    end

    def store_alias(hash)
      if @alias_on_identify
        @alias_cache_reverse[hash['to_user_id']] ||= Set.new
        @alias_cache_reverse[hash['to_user_id']].add(hash['from_user_id'])
      end

      @alias_cache[hash['from_user_id']] ||= hash['to_user_id']
    end

    def resolve_heap_user(heap_user_id)
      @alias_cache[heap_user_id] || heap_user_id
    end

    # 's3://bucket/key' -> { bucket: 'bucket', key: 'key' }
    def s3uri_to_hash(s3uri)
      raise ArgumentError unless s3uri[0..4] == 's3://'

      bucket, key = s3uri[5..-1].split('/', 2)
      { bucket: bucket, key: key }
    end

    def s3_get_file(obj)
      hash = case obj
             when String
               s3uri_to_hash(obj)
             when Aws::S3::Types::Object
               { bucket: @aws_s3_bucket, key: obj.key }
             when Hash
               obj
             else
               {}
             end

      raise ArgumentError unless hash.has_key?(:bucket) && hash.has_key?(:key)

      @s3.get_object(hash)
    end

  end
end
--------------------------------------------------------------------------------