├── .rspec ├── Gemfile ├── lib ├── google_translate_diff │ ├── version.rb │ ├── redis_cache_store.rb │ ├── redis_rate_limiter.rb │ ├── linearizer.rb │ ├── spacing.rb │ ├── cache.rb │ ├── chunker.rb │ ├── tokenizer.rb │ └── request.rb └── google_translate_diff.rb ├── Rakefile ├── .gitignore ├── spec ├── google_translate_diff │ ├── google_translate_diff_spec.rb │ ├── spacing_spec.rb │ ├── linearizer_spec.rb │ ├── chunker_spec.rb │ ├── request_spec.rb │ └── tokenizer_spec.rb └── spec_helper.rb ├── .travis.yml ├── .rubocop.yml ├── google_translate_diff.gemspec └── README.md /.rspec: -------------------------------------------------------------------------------- 1 | --format documentation 2 | --color 3 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source "https://rubygems.org" 2 | 3 | gemspec 4 | -------------------------------------------------------------------------------- /lib/google_translate_diff/version.rb: -------------------------------------------------------------------------------- 1 | module GoogleTranslateDiff 2 | VERSION = "1.0.11".freeze 3 | end 4 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require "bundler/gem_tasks" 2 | require "rspec/core/rake_task" 3 | 4 | RSpec::Core::RakeTask.new(:spec) 5 | 6 | task default: :spec 7 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /.bundle/ 2 | /.yardoc 3 | /Gemfile.lock 4 | /_yardoc/ 5 | /coverage/ 6 | /doc/ 7 | /pkg/ 8 | /spec/reports/ 9 | /tmp/ 10 | 11 | # rspec failure tracking 12 | .rspec_status 13 | test.rb 14 | -------------------------------------------------------------------------------- 
/spec/google_translate_diff/google_translate_diff_spec.rb: -------------------------------------------------------------------------------- 1 | require "spec_helper" 2 | 3 | RSpec.describe GoogleTranslateDiff do 4 | it "has a version number" do 5 | expect(GoogleTranslateDiff::VERSION).not_to be nil 6 | end 7 | end 8 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: ruby 2 | rvm: 3 | - 2.3.3 4 | cache: bundler 5 | 6 | addons: 7 | code_climate: 8 | repo_token: "49b1afe0298c521f3d73377db0d20d1d4b9749ad533126bf3ed105fe9612eb0e" 9 | 10 | after_success: 11 | - bundle exec codeclimate-test-reporter 12 | -------------------------------------------------------------------------------- /.rubocop.yml: -------------------------------------------------------------------------------- 1 | Documentation: 2 | Enabled: false 3 | 4 | Style/StringLiterals: 5 | EnforcedStyle: double_quotes 6 | 7 | Style/ClassAndModuleChildren: 8 | EnforcedStyle: compact 9 | 10 | Metrics/BlockLength: 11 | Exclude: 12 | - spec/**/* 13 | 14 | Metrics/LineLength: 15 | Max: 120 16 | -------------------------------------------------------------------------------- /spec/google_translate_diff/spacing_spec.rb: -------------------------------------------------------------------------------- 1 | require "spec_helper" 2 | 3 | RSpec.describe GoogleTranslateDiff::Spacing do 4 | [ 5 | ["a ", "А", "А ", ""], 6 | [" b ", "Б", " Б ", ""] 7 | ].each do |(left, right, result)| 8 | it { expect(described_class.restore(left, right)).to eq(result) } 9 | end 10 | end 11 | -------------------------------------------------------------------------------- /spec/spec_helper.rb: -------------------------------------------------------------------------------- 1 | require "bundler/setup" 2 | require "simplecov" 3 | 4 | SimpleCov.start 5 | 6 | require "google_translate_diff" 7 | 8 | 
RSpec.configure do |config| 9 | # Enable flags like --only-failures and --next-failure 10 | config.example_status_persistence_file_path = ".rspec_status" 11 | 12 | config.expect_with :rspec do |c| 13 | c.syntax = :expect 14 | end 15 | 16 | config.filter_run :focus 17 | config.run_all_when_everything_filtered = true 18 | end 19 | -------------------------------------------------------------------------------- /lib/google_translate_diff/redis_cache_store.rb: -------------------------------------------------------------------------------- 1 | class GoogleTranslateDiff::RedisCacheStore 2 | extend Dry::Initializer 3 | 4 | param :connection_pool 5 | 6 | option :timeout, default: proc { 60 * 60 * 24 * 7 } 7 | option :namespace, default: proc { GoogleTranslateDiff::CACHE_NAMESPACE } 8 | 9 | def read_multi(keys) 10 | redis { |redis| redis.mget(*keys) } 11 | end 12 | 13 | def write(key, value) 14 | redis { |redis| redis.setex(key, timeout, value) } 15 | end 16 | 17 | private 18 | 19 | def redis 20 | connection_pool.with do |redis| 21 | yield Redis::Namespace.new(namespace, redis: redis) 22 | end 23 | end 24 | end 25 | -------------------------------------------------------------------------------- /lib/google_translate_diff/redis_rate_limiter.rb: -------------------------------------------------------------------------------- 1 | class GoogleTranslateDiff::RedisRateLimiter 2 | extend Dry::Initializer 3 | 4 | class RateLimitExceeded < StandardError; end 5 | 6 | param :connection_pool 7 | param :threshold, default: proc { 8000 } 8 | param :interval, default: proc { 60 } 9 | 10 | option :namespace, default: proc { GoogleTranslateDiff::CACHE_NAMESPACE } 11 | 12 | def check(size) 13 | connection_pool.with do |redis| 14 | rate_limit = Ratelimit.new(namespace, redis: redis) 15 | if rate_limit.exceeded?("call", threshold: threshold, interval: interval) 16 | raise RateLimitExceeded 17 | end 18 | rate_limit.add size 19 | end 20 | end 21 | end 22 | 
-------------------------------------------------------------------------------- /lib/google_translate_diff/linearizer.rb: -------------------------------------------------------------------------------- 1 | class GoogleTranslateDiff::Linearizer 2 | class << self 3 | def linearize(struct, array = []) 4 | case struct 5 | when Hash then 6 | struct.each { |_k, v| linearize(v, array) } 7 | when Array then 8 | struct.each { |v| linearize(v, array) } 9 | else 10 | array << struct 11 | end 12 | 13 | array 14 | end 15 | 16 | def restore(struct, array) 17 | case struct 18 | when Hash then 19 | struct.each_with_object({}) { |(k, v), h| h[k] = restore(v, array) } 20 | when Array then 21 | struct.map { |v| restore(v, array) } 22 | else 23 | array.shift 24 | end 25 | end 26 | end 27 | end 28 | -------------------------------------------------------------------------------- /spec/google_translate_diff/linearizer_spec.rb: -------------------------------------------------------------------------------- 1 | require "spec_helper" 2 | 3 | RSpec.describe GoogleTranslateDiff::Linearizer do 4 | let(:array) { described_class.linearize(value) } 5 | subject { described_class.restore(value, array) } 6 | 7 | shared_examples "linearizer" do 8 | it { expect(subject).to eq(value) } 9 | end 10 | 11 | context "with single value" do 12 | let(:value) { "Value" } 13 | 14 | it_behaves_like "linearizer" 15 | end 16 | 17 | context "with array" do 18 | let(:value) { [1, :two, "Three"] } 19 | 20 | it_behaves_like "linearizer" 21 | end 22 | 23 | context "with hash" do 24 | let(:value) { { a: "1", b: 2, c: { d: :three } } } 25 | 26 | it_behaves_like "linearizer" 27 | end 28 | end 29 | -------------------------------------------------------------------------------- /lib/google_translate_diff/spacing.rb: -------------------------------------------------------------------------------- 1 | # Adds same count leading-trailing spaces left has to the right 2 | class GoogleTranslateDiff::Spacing 3 | class << self 4 
| # GoogleTranslateDiff::Spacing.restore(" a ", "Z") # => " Z " 5 | def restore(left, right) 6 | leading(left) + right.strip + trailing(left) 7 | end 8 | 9 | private 10 | 11 | def spaces(count) 12 | ([" "] * count).join 13 | end 14 | 15 | def leading(value) 16 | pos = value =~ /[^[:space:]]+/ui 17 | return "" if pos.nil? || pos.zero? 18 | value[0..(pos - 1)] 19 | end 20 | 21 | def trailing(value) 22 | pos = value =~ /[[:space:]]+\z/ui 23 | return "" if pos.nil? 24 | value[pos..-1] 25 | end 26 | end 27 | end 28 | -------------------------------------------------------------------------------- /lib/google_translate_diff.rb: -------------------------------------------------------------------------------- 1 | require "ox" 2 | require "punkt-segmenter" 3 | require "dry/initializer" 4 | require "google/cloud/translate" 5 | 6 | require "google_translate_diff/version" 7 | require "google_translate_diff/tokenizer" 8 | require "google_translate_diff/linearizer" 9 | require "google_translate_diff/chunker" 10 | require "google_translate_diff/spacing" 11 | require "google_translate_diff/cache" 12 | require "google_translate_diff/redis_cache_store" 13 | require "google_translate_diff/redis_rate_limiter" 14 | require "google_translate_diff/request" 15 | 16 | module GoogleTranslateDiff 17 | class << self 18 | attr_accessor :api 19 | attr_accessor :cache_store 20 | attr_accessor :rate_limiter 21 | 22 | def translate(*args) 23 | Request.new(*args).call 24 | end 25 | end 26 | 27 | CACHE_NAMESPACE = "google-translate-diff".freeze 28 | end 29 | -------------------------------------------------------------------------------- /lib/google_translate_diff/cache.rb: -------------------------------------------------------------------------------- 1 | class GoogleTranslateDiff::Cache 2 | extend Dry::Initializer 3 | 4 | param :from 5 | param :to 6 | 7 | def cached_and_missing(values) 8 | keys = values.map { |v| key(v) } 9 | cached = cache_store.read_multi(keys) 10 | missing = 
values.map.with_index { |v, i| v if cached[i].nil? }.compact 11 | 12 | [cached, missing] 13 | end 14 | 15 | def store(values, cached, updates) 16 | cached.map.with_index do |value, index| 17 | value || store_value(values[index], updates.shift) 18 | end 19 | end 20 | 21 | private 22 | 23 | def store_value(value, translation) 24 | cache_store.write(key(value), translation) 25 | translation 26 | end 27 | 28 | def key(value) 29 | hash = Digest::MD5.hexdigest(value.strip) # No matter how much spaces 30 | "#{from}:#{to}:#{hash}" 31 | end 32 | 33 | def cache_store 34 | GoogleTranslateDiff.cache_store 35 | end 36 | end 37 | -------------------------------------------------------------------------------- /lib/google_translate_diff/chunker.rb: -------------------------------------------------------------------------------- 1 | class GoogleTranslateDiff::Chunker 2 | extend ::Dry::Initializer 3 | 4 | class Error < StandardError; end 5 | 6 | Chunk = Struct.new(:values, :size) 7 | 8 | param :values 9 | option :limit, default: proc { MAX_CHUNK_SIZE } 10 | option :count_limit, default: proc { COUNT_LIMIT } 11 | 12 | def call 13 | chunks.map(&:values) 14 | end 15 | 16 | def chunks 17 | values.each_with_object([]) do |value, chunks| 18 | validate_value_size(value) 19 | 20 | tail = chunks.last 21 | 22 | if next_chunk?(tail, value) 23 | chunks << Chunk.new([], 0) 24 | tail = chunks.last 25 | end 26 | 27 | update_chunk(tail, value) 28 | end 29 | end 30 | 31 | private 32 | 33 | def next_chunk?(tail, value) 34 | tail.nil? 
|| 35 | (size(value) + tail.size > limit) || 36 | tail.values.size > count_limit 37 | end 38 | 39 | def size(text) 40 | URI.encode(text).size 41 | end 42 | 43 | def update_chunk(chunk, value) 44 | chunk.values << value 45 | chunk.size = chunk.size + value.size 46 | end 47 | 48 | def validate_value_size(value) 49 | raise Error, "Too long part #{value.size} > #{limit}" if value.size > limit 50 | end 51 | 52 | MAX_CHUNK_SIZE = 1700 53 | COUNT_LIMIT = 120 54 | end 55 | -------------------------------------------------------------------------------- /spec/google_translate_diff/chunker_spec.rb: -------------------------------------------------------------------------------- 1 | require "spec_helper" 2 | 3 | RSpec.describe GoogleTranslateDiff::Chunker do 4 | subject { described_class.new(source, limit: 20, count_limit: 5).call } 5 | 6 | let(:a_word) { (["a"] * 10).join } 7 | let(:b_word) { (["a"] * 7).join } 8 | let(:z_word) { (["a"] * 30).join } 9 | let(:x_word) { "x" } 10 | 11 | shared_examples "chunker" do 12 | it { is_expected.to eq(chunks) } 13 | end 14 | 15 | context "not splits if fits" do 16 | let(:source) { %w[a b c] } 17 | let(:chunks) { [%w[a b c]] } 18 | 19 | it_behaves_like "chunker" 20 | end 21 | 22 | context "splits by rough borders" do 23 | let(:source) { [a_word, a_word, a_word] } 24 | let(:chunks) { [[a_word, a_word], [a_word]] } 25 | 26 | it_behaves_like "chunker" 27 | end 28 | 29 | context "splits by non-rogugh borders" do 30 | let(:source) { [b_word, b_word, b_word, a_word] } 31 | let(:chunks) { [[b_word, b_word], [b_word, a_word]] } 32 | 33 | it_behaves_like "chunker" 34 | end 35 | 36 | context "splits by count" do 37 | let(:source) { [x_word] * 10 } 38 | let(:chunks) { [[x_word] * 6, [x_word] * 4] } 39 | 40 | it_behaves_like "chunker" 41 | end 42 | 43 | context "raises if part is too long" do 44 | let(:source) { [z_word] } 45 | let(:chunks) { [[z_word]] } 46 | 47 | it { expect { subject }.to raise_error(/Too long part/) } 48 | end 49 | end 50 | 
-------------------------------------------------------------------------------- /google_translate_diff.gemspec: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | lib = File.expand_path("../lib", __FILE__) 4 | $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib) 5 | require "google_translate_diff/version" 6 | 7 | # rubocop:disable Metrics/BlockLength 8 | Gem::Specification.new do |spec| 9 | spec.name = "google_translate_diff" 10 | spec.version = GoogleTranslateDiff::VERSION 11 | spec.authors = ["Victor Sokolov"] 12 | spec.email = ["gzigzigzeo@evilmartians.com"] 13 | 14 | spec.summary = %( 15 | Google Translate API wrapper for Ruby which helps to translate only changes 16 | between revisions of long texts. 17 | 18 | ) 19 | spec.description = %( 20 | Google Translate API wrapper for Ruby which helps to translate only changes 21 | between revisions of long texts. 22 | ) 23 | spec.homepage = "https://github.com/gzigzigzeo/google_translate_diff" 24 | 25 | if spec.respond_to?(:metadata) 26 | spec.metadata["allowed_push_host"] = "https://rubygems.org" 27 | else 28 | raise "RubyGems 2.0 or newer is required to protect against " \ 29 | "public gem pushes." 
30 | end 31 | 32 | spec.files = `git ls-files -z`.split("\x0").reject do |f| 33 | f.match(%r{^(test|spec|features)/}) 34 | end 35 | spec.bindir = "exe" 36 | spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) } 37 | spec.require_paths = ["lib"] 38 | 39 | spec.add_development_dependency "bundler", "~> 1.14" 40 | spec.add_development_dependency "rake", "~> 10.0" 41 | spec.add_development_dependency "rspec", "~> 3.0" 42 | spec.add_development_dependency "rubocop" 43 | spec.add_development_dependency "codeclimate-test-reporter", "~> 1.0.0" 44 | spec.add_development_dependency "simplecov" 45 | 46 | spec.add_dependency "google-cloud-translate" 47 | spec.add_dependency "ox" 48 | spec.add_dependency "dry-initializer" 49 | spec.add_dependency "punkt-segmenter" 50 | end 51 | # rubocop:enable Metrics/BlockLength 52 | -------------------------------------------------------------------------------- /spec/google_translate_diff/request_spec.rb: -------------------------------------------------------------------------------- 1 | require "spec_helper" 2 | 3 | RSpec.describe GoogleTranslateDiff::Request do 4 | subject { described_class.new(values, options).call } 5 | 6 | let(:api) { double("API") } 7 | let(:cache_store) { double("Cache store") } 8 | let(:api_response_wrap) { api_response.map { |v| OpenStruct.new(text: v) } } 9 | let(:cache_response) { nil } 10 | 11 | before do 12 | GoogleTranslateDiff.api = api 13 | GoogleTranslateDiff.cache_store = cache_store 14 | 15 | allow(cache_store).to receive(:read_multi) do |keys| 16 | cache_response || ([nil] * keys.size) 17 | end 18 | 19 | allow(cache_store).to receive(:write) { |_, value| value } 20 | 21 | allow(api).to receive(:translate).with(*api_request, options).and_return( 22 | api_response_wrap 23 | ) 24 | end 25 | 26 | context "simple case" do 27 | let(:values) { "Some string" } 28 | let(:options) { { from: :en, to: :ru } } 29 | let(:api_request) { ["Some string"] } 30 | let(:api_response) { ["Какая-то строка"] } 
31 | 32 | it { is_expected.to eq("Какая-то строка") } 33 | end 34 | 35 | context "complex structure, simple case" do 36 | let(:values) { { title: "One", description: "Two" } } 37 | let(:options) { { from: :en, to: :ru } } 38 | let(:api_request) { %w[One Two] } 39 | let(:api_response) { %w[Один Два] } 40 | 41 | it { is_expected.to eq(title: "Один", description: "Два") } 42 | end 43 | 44 | context "complex structure" do 45 | let(:values) { { title: "One", more: { description: "Two" }, skip: nil } } 46 | let(:options) { { from: :en, to: :ru } } 47 | let(:api_request) { %w[One Two] } 48 | let(:api_response) { %w[Один Два] } 49 | 50 | it do 51 | is_expected.to eq(title: "Один", more: { description: "Два" }, skip: "") 52 | end 53 | end 54 | 55 | context "HTML" do 56 | let(:values) do 57 | { 58 | title: "One", 59 | more: { 60 | description: "Black", 61 | color: %(So Red 62 | that) 63 | } 64 | } 65 | end 66 | let(:options) { { from: :en, to: :ru } } 67 | let(:api_request) { %w[One Black So Red that] } 68 | let(:api_response) { %w[Один Черный Что Кра что] } 69 | 70 | it do 71 | is_expected.to eq( 72 | title: "Один", 73 | more: { 74 | description: "Черный", 75 | color: %(Что Кра 76 | что) 77 | } 78 | ) 79 | end 80 | end 81 | end 82 | -------------------------------------------------------------------------------- /lib/google_translate_diff/tokenizer.rb: -------------------------------------------------------------------------------- 1 | class GoogleTranslateDiff::Tokenizer < ::Ox::Sax 2 | def initialize(source) 3 | @pos = nil 4 | @source = source 5 | @tokens = nil 6 | @context = [] 7 | @sequence = [] 8 | @indicies = [] 9 | end 10 | 11 | def instruct(target) 12 | start_markup(target) 13 | end 14 | 15 | def end_instruct(target) 16 | end_markup(target) 17 | end 18 | 19 | def start_element(name) 20 | start_markup(name) 21 | end 22 | 23 | def end_element(name) 24 | end_markup(name) 25 | end 26 | 27 | def attr(name, value) 28 | return unless @context.last == :span 29 | return 
unless name == :class && value == "notranslate" 30 | return if notranslate? 31 | 32 | @sequence[-1] = :notranslate 33 | end 34 | 35 | def text(value) 36 | return if value == "" 37 | @sequence << (SKIP.include?(@context.last) ? :markup : :text) 38 | @indicies << @pos - 1 39 | end 40 | 41 | def tokens 42 | @tokens ||= token_sequences_joined 43 | .tap { |tokens| make_sentences_from_last_token(tokens) } 44 | end 45 | 46 | private 47 | 48 | def token_sequences_joined 49 | raw_tokens.each_with_object([]) do |token, tokens| 50 | if tokens.empty? # Initial state 51 | tokens << token 52 | elsif tokens.last[1] == token[1] 53 | # Join series of tokens of the same type into one 54 | tokens.last[0].concat(token[0]) 55 | else 56 | # If token before :markup is :text we need to split it into sentences 57 | make_sentences_from_last_token(tokens) 58 | tokens << token 59 | end 60 | end 61 | end 62 | 63 | def make_sentences_from_last_token(tokens) 64 | return if tokens.empty? 65 | tokens.concat(sentences(tokens.pop[0])) if tokens.last[1] == :text 66 | end 67 | 68 | # rubocop: disable Metrics/MethodLength 69 | def sentences(value) 70 | return [] if value.strip.empty? 71 | 72 | boundaries = 73 | Punkt::SentenceTokenizer 74 | .new(value) 75 | .sentences_from_text(value) 76 | 77 | return [[value, :text]] if boundaries.size == 1 78 | 79 | boundaries.map.with_index do |(left, right), index| 80 | next_boundary = boundaries[index + 1] 81 | right = next_boundary[0] - 1 if next_boundary 82 | 83 | [value[left..right], :text] 84 | end 85 | end 86 | # rubocop:enable Metrics/MethodLength 87 | 88 | # Whether the sequence is between `:notranslate` and `:end_notranslate` 89 | def notranslate? 90 | @sequence.select { |item| item[/notranslate/] }.last == :notranslate 91 | end 92 | 93 | # Returns the item for last opened span 94 | def end_span 95 | return :markup unless notranslate? 
96 | opened_spans = @sequence 97 | .reverse 98 | .take_while { |item| item != :notranslate } 99 | .map { |item| { span: 1, end_span: -1 }.fetch(item, 0) } 100 | .reduce(0, :+) 101 | 102 | opened_spans.positive? ? :end_span : :end_notranslate 103 | end 104 | 105 | def raw_tokens 106 | @raw_tokens ||= @indicies.map.with_index do |i, n| 107 | first = i 108 | last = (@indicies[n + 1] || 0) - 1 109 | value = fix_utf(@source.byteslice(first..last)) 110 | type = @sequence[n] 111 | type = :text if INNER_SPANS.include?(type) 112 | [value, type] 113 | end 114 | end 115 | 116 | def fix_utf(value) 117 | value.encode("UTF-8", undef: :replace, invalid: :replace, replace: " ") 118 | end 119 | 120 | def start_markup(name) 121 | @context << name 122 | @sequence << (notranslate? ? (name == :span ? :span : :text) : :markup) 123 | @indicies << @pos - 1 124 | end 125 | 126 | def end_markup(name) 127 | @context.pop 128 | @sequence << (notranslate? ? (name == :span ? end_span : :text) : :markup) 129 | @indicies << @pos - 1 unless @pos == @source.bytesize 130 | end 131 | 132 | class << self 133 | def tokenize(value) 134 | return [] if value.nil? 
135 | tokenizer = new(value).tap do |h| 136 | Ox.sax_parse(h, StringIO.new(value), HTML_OPTIONS) 137 | end 138 | tokenizer.tokens 139 | end 140 | end 141 | 142 | SKIP = %i[script style].freeze 143 | INNER_SPANS = %i[notranslate span end_span end_notranslate].freeze 144 | HTML_OPTIONS = { smart: true, skip: :skip_none }.freeze 145 | end 146 | -------------------------------------------------------------------------------- /lib/google_translate_diff/request.rb: -------------------------------------------------------------------------------- 1 | class GoogleTranslateDiff::Request 2 | extend Dry::Initializer 3 | extend Forwardable 4 | 5 | param :values 6 | param :options 7 | 8 | def_delegators :GoogleTranslateDiff, :api, :cache_store, :rate_limiter 9 | def_delegators :"GoogleTranslateDiff::Linearizer", :linearize, :restore 10 | 11 | def call 12 | validate_globals 13 | 14 | return values if from == to || values.empty? 15 | 16 | translation 17 | end 18 | 19 | private 20 | 21 | def from 22 | @from ||= options.fetch(:from) 23 | end 24 | 25 | def to 26 | @to ||= options.fetch(:to) 27 | end 28 | 29 | def validate_globals 30 | raise "Set GoogleTranslateDiff.api before calling ::translate" unless api 31 | return if cache_store 32 | raise "Set GoogleTranslateDiff.cache_store before calling ::translate" 33 | end 34 | 35 | # Extracts flat text array 36 | # => "Name", "Good boy" 37 | # 38 | # #values might be something like { name: "Name", bio: "Good boy" } 39 | def texts 40 | @texts ||= linearize(values) 41 | end 42 | 43 | # Converts each array item to token list 44 | # => [..., [["", :markup], ["Good", :text], ...]] 45 | def tokens 46 | @tokens ||= texts.map do |value| 47 | GoogleTranslateDiff::Tokenizer.tokenize(value) 48 | end 49 | end 50 | 51 | # Extracts text tokens from token list 52 | # => { ..., "1_1" => "Good", 1_3 => "Boy", ... 
} 53 | def text_tokens 54 | @text_tokens ||= extract_text_tokens.to_h 55 | end 56 | 57 | def extract_text_tokens 58 | tokens.each_with_object([]).with_index do |(group, result), group_index| 59 | group.each_with_index do |(value, type), index| 60 | result << ["#{group_index}_#{index}", value] if type == :text 61 | end 62 | end 63 | end 64 | 65 | # Extracts values from text tokens 66 | # => [ ..., "Good", "Boy", ... ] 67 | def text_tokens_texts 68 | @text_tokens_texts ||= linearize(text_tokens).map(&:to_s).map(&:strip) 69 | end 70 | 71 | # Splits things requires translations to per-request chunks 72 | # (groups less 2k sym) 73 | # => [[ ..., "Good", "Boy", ... ]] 74 | def chunks 75 | @chunks ||= GoogleTranslateDiff::Chunker.new(text_tokens_texts).call 76 | end 77 | 78 | # Translates/loads from cache values from each chunk 79 | # => [[ ..., "Horoshiy", "Malchik", ... ]] 80 | def chunks_translated 81 | @chunks_translated ||= chunks.map do |chunk| 82 | cached, missing = cache.cached_and_missing(chunk) 83 | if missing.empty? 84 | cached 85 | else 86 | cache.store(chunk, cached, call_api(missing)) 87 | end 88 | end 89 | end 90 | 91 | # Restores indexes for translated tokens 92 | # => { ..., "1_1" => "Horoshiy", 1_3 => "Malchik", ... 
} 93 | def text_tokens_translated 94 | @text_tokens_texts_translated ||= 95 | restore(text_tokens, chunks_translated.flatten) 96 | end 97 | 98 | # Restores tokens translated + adds same spacing as in source token 99 | # => [[..., [ "Horoshiy", :text ], ...]] 100 | # rubocop:disable Metrics/AbcSize 101 | def tokens_translated 102 | @tokens_translated ||= tokens.dup.tap do |tokens| 103 | text_tokens_translated.each do |index, value| 104 | group_index, index = index.split("_") 105 | tokens[group_index.to_i][index.to_i][0] = 106 | restore_spacing(tokens[group_index.to_i][index.to_i][0], value) 107 | end 108 | end 109 | end 110 | # rubocop:enable Metrics/AbcSize 111 | 112 | def restore_spacing(source_value, value) 113 | GoogleTranslateDiff::Spacing.restore(source_value, value) 114 | end 115 | 116 | # Restores texts from tokens 117 | # [..., "Horoshiy Malchik", ...] 118 | def texts_translated 119 | @texts_translated ||= tokens_translated.map do |group| 120 | group.map { |value, type| type == :text ? value : fix_ascii(value) }.join 121 | end 122 | end 123 | 124 | # Final result 125 | def translation 126 | @translation ||= restore(values, texts_translated) 127 | end 128 | 129 | def call_api(values) 130 | check_rate_limit(values) 131 | [api.translate(*values, **options)].flatten.map(&:text) 132 | end 133 | 134 | def cache 135 | @cache ||= GoogleTranslateDiff::Cache.new(from, to) 136 | end 137 | 138 | def check_rate_limit(values) 139 | return if rate_limiter.nil? 
140 | size = values.map(&:size).inject(0) { |sum, x| sum + x } 141 | rate_limiter.check(size) 142 | end 143 | 144 | # Markup should not contain control characters 145 | def fix_ascii(value) 146 | value.gsub(/[\u0000-\u001F]/, " ") 147 | end 148 | end 149 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # GoogleTranslateDiff 2 | 3 | Google Translate API wrapper helps to translate only changes between revisions of long texts. 4 | 5 | 6 | Sponsored by Evil Martians 7 | 8 | 9 | [![Build Status](https://travis-ci.org/gzigzigzeo/google_translate_diff.svg?branch=master)](https://travis-ci.org/gzigzigzeo/google_translate_diff) [![Code Climate](https://codeclimate.com/github/gzigzigzeo/google_translate_diff/badges/gpa.svg)](https://codeclimate.com/github/gzigzigzeo/google_translate_diff) [![Test Coverage](https://codeclimate.com/github/gzigzigzeo/google_translate_diff/badges/coverage.svg)](https://codeclimate.com/github/gzigzigzeo/google_translate_diff/coverage) 10 | 11 | ## Use case 12 | 13 | Assume your project contains a significant amount of products descriptions which: 14 | - Require retranslation each time user edits them. 15 | - Have a lot of equal parts (like return policy). 16 | - Change frequently. 17 | 18 | If your user changes a single word within the long description, you will be charged for the retranslation of the whole text. 19 | 20 | Much better approach is to try to translate every repeated structural element (sentence) in your texts array just once to save money. This gem helps to make it done. 
21 | 22 | ## Installation 23 | 24 | Add this line to your application's Gemfile: 25 | 26 | ```ruby 27 | gem 'google_translate_diff' 28 | ``` 29 | 30 | And then execute: 31 | 32 | $ bundle 33 | 34 | Or install it yourself as: 35 | 36 | $ gem install google_translate_diff 37 | 38 | ## Usage 39 | 40 | ```ruby 41 | require "google_translate_diff" 42 | 43 | # These dependencies are not included, as you might need to roll your own cache based on a different store 44 | require "redis" 45 | require "connection_pool" 46 | require "redis-namespace" 47 | require "ratelimit" # Optional, only needed if you use the rate limiter 48 | 49 | # Setup https://github.com/GoogleCloudPlatform/google-cloud-ruby/tree/master/google-cloud-translate 50 | ENV["TRANSLATE_KEY"] = "foobarkey" 51 | 52 | # I always use a connection pool for Redis 53 | pool = ConnectionPool.new(size: 10, timeout: 5) { Redis.new } 54 | 55 | # Pass any options (like app id) 56 | GoogleTranslateDiff.api = Google::Cloud::Translate.new 57 | 58 | GoogleTranslateDiff.cache_store = 59 | GoogleTranslateDiff::RedisCacheStore.new(pool, timeout: 7.days, namespace: "t") 60 | 61 | # Optional 62 | GoogleTranslateDiff.rate_limiter = 63 | GoogleTranslateDiff::RedisRateLimiter.new( 64 | pool, 8000, 60, namespace: "t" # threshold and interval are positional params 65 | ) 66 | 67 | GoogleTranslateDiff.translate("test translations", from: "en", to: "es") 68 | ``` 69 | 70 | ## How it works 71 | 72 | - Text nodes are extracted from HTML. 73 | - Every text node is split into sentences (using the `punkt-segmenter` gem). 74 | - The cache is checked for the presence of each sentence (using the language pair and a hash of the string). 75 | - Missing sentences are translated via the API and cached. 76 | - The original HTML is recombined from translations and cache data. 77 | 78 | *NOTE:* `:from` is a required param. The cache cannot be checked without specifying the exact language pair; that is a limitation. 79 | 80 | ## Input 81 | 82 | `::translate` can receive a string, an array or a deeply nested hash and will return the same structure, translated. 
83 | 84 | ```ruby 85 | GoogleTranslateDiff.translate("test", from: "en", to: "es") 86 | GoogleTranslateDiff.translate(["test", "language"], from: "en", to: "es") 87 | GoogleTranslateDiff.translate( 88 | { title: "test", values: { type: "frequent" } }, from: "en", to: "es" 89 | ) 90 | ``` 91 | 92 | See `GoogleTranslateDiff::Linearizer` for details. 93 | 94 | ## HTML 95 | 96 | You can pass HTML just like plain text: 97 | 98 | ```ruby 99 | GoogleTranslateDiff.translate("<b>Black</b>", from: "en", to: "es") 100 | ``` 101 | 102 | ## Very long texts 103 | 104 | The Google API has a limitation: a query cannot be longer than approximately 4 KB. If your text is longer than that, multiple queries will be used automatically to translate it. 105 | 106 | ## Development 107 | 108 | After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment. 109 | 110 | To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org). 111 | 112 | ## Contributing 113 | 114 | Bug reports and pull requests are welcome on GitHub at https://github.com/gzigzigzeo/google_translate_diff. 
115 | -------------------------------------------------------------------------------- /spec/google_translate_diff/tokenizer_spec.rb: -------------------------------------------------------------------------------- 1 | require "spec_helper" 2 | 3 | RSpec.describe GoogleTranslateDiff::Tokenizer do 4 | subject do 5 | described_class.tokenize(source) 6 | end 7 | 8 | shared_examples "tokenizer" do 9 | it { expect(subject).to eq(tokens) } 10 | end 11 | 12 | context "empty" do 13 | let(:source) { "" } 14 | let(:tokens) { [] } 15 | 16 | it_behaves_like "tokenizer" 17 | end 18 | 19 | context "pure crlf" do 20 | let(:source) { "
\n
" } 21 | let(:tokens) { [["
", :markup], ["
", :markup]] } 22 | 23 | it_behaves_like "tokenizer" 24 | end 25 | 26 | context "pure text" do 27 | let(:source) { "test\nphrase" } 28 | let(:tokens) { [[source, :text]] } 29 | 30 | it_behaves_like "tokenizer" 31 | end 32 | 33 | context "with some markup ending with text" do 34 | let(:source) { "alfabravokilo" } 35 | let(:tokens) do 36 | [ 37 | ["alfa", :text], 38 | ["", :markup], 39 | ["bravo", :text], 40 | ["", :markup], 41 | ["kilo", :text] 42 | ] 43 | end 44 | 45 | it_behaves_like "tokenizer" 46 | end 47 | 48 | context "with some markup ending with tag" do 49 | let(:source) { "alfabravo" } 50 | let(:tokens) do 51 | [ 52 | ["alfa", :text], 53 | ["", :markup], 54 | ["bravo", :text], 55 | ["", :markup] 56 | ] 57 | end 58 | 59 | it_behaves_like "tokenizer" 60 | end 61 | 62 | context "with some markup ending with script and non-ascii" do 63 | let(:source) { "альбракил" } 64 | let(:tokens) do 65 | [ 66 | ["аль", :text], 67 | ["", :markup], 68 | ["бра", :text], 69 | ["", :markup], 70 | ["кил", :text], 71 | ["", :markup] 72 | ] 73 | end 74 | 75 | it_behaves_like "tokenizer" 76 | end 77 | 78 | context "sentences" do 79 | let(:source) do 80 | "! Киловольт. Смеркалось. Ворчало. Кричало." 81 | end 82 | 83 | let(:tokens) do 84 | [ 85 | ["! ", :text], 86 | ["Киловольт. ", :text], 87 | ["", :markup], 88 | ["Смеркалось. ", :text], 89 | ["Ворчало. ", :text], 90 | ["Кричало.", :text], 91 | ["", :markup] 92 | ] 93 | end 94 | 95 | it_behaves_like "tokenizer" 96 | end 97 | 98 | context "notranslate" do 99 | let(:source) do 100 | "test 101 | xy" 102 | end 103 | 104 | let(:tokens) do 105 | [ 106 | ["test", :text], 107 | ["", :markup], 108 | ["\nxy", :text], 109 | ["", :markup] 110 | ] 111 | end 112 | 113 | it_behaves_like "tokenizer" 114 | end 115 | 116 | context "notranslate inside another span" do 117 | let(:source) do 118 | "foobar
baz
" 119 | end 120 | 121 | let(:tokens) do 122 | [ 123 | ["", :markup], 124 | ["foobar
baz
", :text], 125 | ["
", :markup] 126 | ] 127 | end 128 | 129 | it_behaves_like "tokenizer" 130 | end 131 | 132 | context "notranslate inside another notranslate" do 133 | let(:source) do 134 | "foo" \ 135 | "barbaz" \ 136 | "" 137 | end 138 | 139 | let(:tokens) do 140 | [ 141 | [source, :text] 142 | ] 143 | end 144 | 145 | it_behaves_like "tokenizer" 146 | end 147 | 148 | context "with
tag before closing tag" do 149 | let(:source) do 150 | "Смеркалось.
" 151 | end 152 | 153 | let(:tokens) do 154 | [ 155 | ["", :markup], 156 | ["Смеркалось.", :text], 157 | ["
", :markup] 158 | ] 159 | end 160 | 161 | it_behaves_like "tokenizer" 162 | end 163 | 164 | context "with tag" do 165 | let(:source) do 166 | "Hey!
Look!" \ 167 | "" 168 | end 169 | 170 | let(:tokens) do 171 | [ 172 | ["Hey!", :text], 173 | ["
", :markup], 174 | ["Look!", :text], 175 | ["", :markup] 176 | ] 177 | end 178 | 179 | it_behaves_like "tokenizer" 180 | end 181 | 182 | context "bizarre sentences" do 183 | let(:source) do 184 | "Набор «Солнечная механика» от 4М — это 6 экспериментов." \ 185 | "\n\nЮному изобретателю предстоит воочию посмотреть на чудеса." 186 | end 187 | 188 | let(:tokens) do 189 | [ 190 | ["Набор «Солнечная механика» от 4М — это 6 экспериментов.\n\n", :text], 191 | ["Юному изобретателю предстоит воочию посмотреть на чудеса.", :text] 192 | ] 193 | end 194 | 195 | it_behaves_like "tokenizer" 196 | end 197 | end 198 | --------------------------------------------------------------------------------