├── .rspec
├── Gemfile
├── lib
├── google_translate_diff
│ ├── version.rb
│ ├── redis_cache_store.rb
│ ├── redis_rate_limiter.rb
│ ├── linearizer.rb
│ ├── spacing.rb
│ ├── cache.rb
│ ├── chunker.rb
│ ├── tokenizer.rb
│ └── request.rb
└── google_translate_diff.rb
├── Rakefile
├── .gitignore
├── spec
├── google_translate_diff
│ ├── google_translate_diff_spec.rb
│ ├── spacing_spec.rb
│ ├── linearizer_spec.rb
│ ├── chunker_spec.rb
│ ├── request_spec.rb
│ └── tokenizer_spec.rb
└── spec_helper.rb
├── .travis.yml
├── .rubocop.yml
├── google_translate_diff.gemspec
└── README.md
/.rspec:
--------------------------------------------------------------------------------
1 | --format documentation
2 | --color
3 |
--------------------------------------------------------------------------------
/Gemfile:
--------------------------------------------------------------------------------
1 | source "https://rubygems.org"
2 |
3 | gemspec
4 |
--------------------------------------------------------------------------------
/lib/google_translate_diff/version.rb:
--------------------------------------------------------------------------------
# Top-level namespace of the gem. This file only declares the version so the
# gemspec can load it without pulling in any runtime dependency.
module GoogleTranslateDiff
  # Released gem version; the gemspec assigns it to spec.version.
  VERSION = "1.0.11".freeze
end
4 |
--------------------------------------------------------------------------------
/Rakefile:
--------------------------------------------------------------------------------
1 | require "bundler/gem_tasks"
2 | require "rspec/core/rake_task"
3 |
4 | RSpec::Core::RakeTask.new(:spec)
5 |
6 | task default: :spec
7 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /.bundle/
2 | /.yardoc
3 | /Gemfile.lock
4 | /_yardoc/
5 | /coverage/
6 | /doc/
7 | /pkg/
8 | /spec/reports/
9 | /tmp/
10 |
11 | # rspec failure tracking
12 | .rspec_status
13 | test.rb
14 |
--------------------------------------------------------------------------------
/spec/google_translate_diff/google_translate_diff_spec.rb:
--------------------------------------------------------------------------------
1 | require "spec_helper"
2 |
3 | RSpec.describe GoogleTranslateDiff do
4 | it "has a version number" do
5 | expect(GoogleTranslateDiff::VERSION).not_to be nil
6 | end
7 | end
8 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: ruby
2 | rvm:
3 | - 2.3.3
4 | cache: bundler
5 |
6 | addons:
7 | code_climate:
8 | repo_token: "49b1afe0298c521f3d73377db0d20d1d4b9749ad533126bf3ed105fe9612eb0e"
9 |
10 | after_success:
11 | - bundle exec codeclimate-test-reporter
12 |
--------------------------------------------------------------------------------
/.rubocop.yml:
--------------------------------------------------------------------------------
1 | Documentation:
2 | Enabled: false
3 |
4 | Style/StringLiterals:
5 | EnforcedStyle: double_quotes
6 |
7 | Style/ClassAndModuleChildren:
8 | EnforcedStyle: compact
9 |
10 | Metrics/BlockLength:
11 | Exclude:
12 | - spec/**/*
13 |
14 | Metrics/LineLength:
15 | Max: 120
16 |
--------------------------------------------------------------------------------
/spec/google_translate_diff/spacing_spec.rb:
--------------------------------------------------------------------------------
1 | require "spec_helper"
2 |
3 | RSpec.describe GoogleTranslateDiff::Spacing do
4 | [
5 | ["a ", "А", "А ", ""],
6 | [" b ", "Б", " Б ", ""]
7 | ].each do |(left, right, result)|
8 | it { expect(described_class.restore(left, right)).to eq(result) }
9 | end
10 | end
11 |
--------------------------------------------------------------------------------
/spec/spec_helper.rb:
--------------------------------------------------------------------------------
1 | require "bundler/setup"
2 | require "simplecov"
3 |
4 | SimpleCov.start
5 |
6 | require "google_translate_diff"
7 |
8 | RSpec.configure do |config|
9 | # Enable flags like --only-failures and --next-failure
10 | config.example_status_persistence_file_path = ".rspec_status"
11 |
12 | config.expect_with :rspec do |c|
13 | c.syntax = :expect
14 | end
15 |
16 | config.filter_run :focus
17 | config.run_all_when_everything_filtered = true
18 | end
19 |
--------------------------------------------------------------------------------
/lib/google_translate_diff/redis_cache_store.rb:
--------------------------------------------------------------------------------
# Redis-backed cache store for translated fragments.
#
# Every command runs on a connection checked out of +connection_pool+ and
# wrapped in a Redis::Namespace, so all keys live under +namespace+.
class GoogleTranslateDiff::RedisCacheStore
  extend Dry::Initializer

  # Pool responding to #with and yielding a Redis connection.
  param :connection_pool

  # Expiry handed to SETEX for every written key (default: 60 * 60 * 24 * 7).
  option :timeout, default: proc { 60 * 60 * 24 * 7 }
  # Key prefix applied through Redis::Namespace.
  option :namespace, default: proc { GoogleTranslateDiff::CACHE_NAMESPACE }

  # Reads several keys with one MGET.
  #
  # @param keys [Array<String>] cache keys
  # @return [Array] whatever MGET returns for +keys+, in the same order
  #   (GoogleTranslateDiff::Cache treats a nil entry as a miss)
  def read_multi(keys)
    # MGET needs at least one key; an empty lookup has an empty answer.
    return [] if keys.empty?

    redis { |redis| redis.mget(*keys) }
  end

  # Stores +value+ under +key+ with the configured expiry.
  #
  # @param key [String]
  # @param value [String]
  def write(key, value)
    redis { |redis| redis.setex(key, timeout, value) }
  end

  private

  # Checks a connection out of the pool and yields it wrapped in the namespace.
  def redis
    connection_pool.with do |redis|
      yield Redis::Namespace.new(namespace, redis: redis)
    end
  end
end
25 |
--------------------------------------------------------------------------------
/lib/google_translate_diff/redis_rate_limiter.rb:
--------------------------------------------------------------------------------
# Optional guard that limits how much text is sent to the translation API
# within a time window. Counting is delegated to the Ratelimit gem on a
# connection taken from +connection_pool+.
class GoogleTranslateDiff::RedisRateLimiter
  extend Dry::Initializer

  # Raised by #check when the limit is already exceeded.
  class RateLimitExceeded < StandardError; end

  # Subject under which all usage is counted and checked.
  SUBJECT = "call".freeze

  # Pool responding to #with and yielding a Redis connection.
  param :connection_pool
  # Positional: threshold passed to Ratelimit#exceeded? (default 8000).
  param :threshold, default: proc { 8000 }
  # Positional: interval passed to Ratelimit#exceeded? (default 60).
  param :interval, default: proc { 60 }

  # Bucket key given to Ratelimit.new.
  option :namespace, default: proc { GoogleTranslateDiff::CACHE_NAMESPACE }

  # Verifies the limit has not been exceeded, then records +size+ more units.
  #
  # @param size [Integer] amount to add to the counter (Request passes the
  #   summed length of the strings about to be translated)
  # @raise [RateLimitExceeded] when the counter is already over the threshold
  def check(size)
    connection_pool.with do |redis|
      rate_limit = Ratelimit.new(namespace, redis: redis)
      if rate_limit.exceeded?(SUBJECT, threshold: threshold, interval: interval)
        raise RateLimitExceeded
      end

      # Ratelimit#add takes (subject, count). The usage must be recorded under
      # the same subject that exceeded? inspects, otherwise the limit can
      # never trip.
      rate_limit.add(SUBJECT, size)
    end
  end
end
22 |
--------------------------------------------------------------------------------
/lib/google_translate_diff/linearizer.rb:
--------------------------------------------------------------------------------
# Flattens a nested structure of hashes and arrays into a flat list of leaves
# and rebuilds a structure of the same shape from such a list.
class GoogleTranslateDiff::Linearizer
  class << self
    # Collects every leaf of +struct+ depth-first.
    #
    # @param struct [Hash, Array, Object] hashes contribute their values,
    #   arrays their items, anything else is a leaf
    # @param array [Array] accumulator the leaves are appended to
    # @return [Array] +array+ with all leaves appended
    def linearize(struct, array = [])
      case struct
      when Hash
        struct.each_value { |v| linearize(v, array) }
      when Array
        struct.each { |v| linearize(v, array) }
      else
        array << struct
      end

      array
    end

    # Rebuilds a structure shaped like +struct+, taking the leaves from
    # +array+ in order.
    #
    # @param struct [Hash, Array, Object] template structure
    # @param array [Array] replacement leaves, in linearize order
    # @return [Hash, Array, Object] a new structure; hashes keep their keys
    def restore(struct, array)
      # Work on a copy: leaves are consumed with #shift, and the caller's
      # array (possibly a memoized one) must stay intact.
      rebuild(struct, array.dup)
    end

    private

    # Recursive worker for #restore; consumes +queue+ destructively.
    def rebuild(struct, queue)
      case struct
      when Hash
        struct.each_with_object({}) { |(k, v), h| h[k] = rebuild(v, queue) }
      when Array
        struct.map { |v| rebuild(v, queue) }
      else
        queue.shift
      end
    end
  end
end
28 |
--------------------------------------------------------------------------------
/spec/google_translate_diff/linearizer_spec.rb:
--------------------------------------------------------------------------------
1 | require "spec_helper"
2 |
3 | RSpec.describe GoogleTranslateDiff::Linearizer do
4 | let(:array) { described_class.linearize(value) }
5 | subject { described_class.restore(value, array) }
6 |
7 | shared_examples "linearizer" do
8 | it { expect(subject).to eq(value) }
9 | end
10 |
11 | context "with single value" do
12 | let(:value) { "Value" }
13 |
14 | it_behaves_like "linearizer"
15 | end
16 |
17 | context "with array" do
18 | let(:value) { [1, :two, "Three"] }
19 |
20 | it_behaves_like "linearizer"
21 | end
22 |
23 | context "with hash" do
24 | let(:value) { { a: "1", b: 2, c: { d: :three } } }
25 |
26 | it_behaves_like "linearizer"
27 | end
28 | end
29 |
--------------------------------------------------------------------------------
/lib/google_translate_diff/spacing.rb:
--------------------------------------------------------------------------------
# Transfers the whitespace surrounding one string onto another string.
class GoogleTranslateDiff::Spacing
  class << self
    # Strips +right+ and wraps it in the leading and trailing whitespace
    # found on +left+.
    #
    #   GoogleTranslateDiff::Spacing.restore(" a ", "Z") # => " Z "
    def restore(left, right)
      prefix = leading(left)
      core = right.strip
      suffix = trailing(left)

      prefix + core + suffix
    end

    private

    # String made of +count+ spaces (not called anywhere in this class).
    def spaces(count)
      ([" "] * count).join
    end

    # Whitespace in front of the first non-space character of +value+;
    # "" when there is none or when +value+ has no non-space character.
    def leading(value)
      offset = value =~ /[^[:space:]]+/ui
      if offset.nil? || offset.zero?
        ""
      else
        value[0...offset]
      end
    end

    # Whitespace run at the very end of +value+, or "" when there is none.
    def trailing(value)
      offset = value =~ /[[:space:]]+\z/ui
      offset ? value[offset..-1] : ""
    end
  end
end
28 |
--------------------------------------------------------------------------------
/lib/google_translate_diff.rb:
--------------------------------------------------------------------------------
# Standard library pieces used by the files required below; load them
# explicitly instead of relying on another gem having done so.
require "digest/md5"  # Digest::MD5 in Cache#key
require "forwardable" # Forwardable in Request
require "stringio"    # StringIO in Tokenizer.tokenize
require "uri"         # URI in Chunker#size

require "ox"
require "punkt-segmenter"
require "dry/initializer"
require "google/cloud/translate"

require "google_translate_diff/version"
require "google_translate_diff/tokenizer"
require "google_translate_diff/linearizer"
require "google_translate_diff/chunker"
require "google_translate_diff/spacing"
require "google_translate_diff/cache"
require "google_translate_diff/redis_cache_store"
require "google_translate_diff/redis_rate_limiter"
require "google_translate_diff/request"

# Entry point of the gem: holds the globally configured collaborators and
# exposes ::translate.
module GoogleTranslateDiff
  class << self
    # Translation API client; Request calls #translate on it. Required.
    attr_accessor :api
    # Cache store; Cache calls #read_multi and #write on it. Required.
    attr_accessor :cache_store
    # Optional limiter; Request calls #check on it when it is set.
    attr_accessor :rate_limiter

    # Builds a Request from the given arguments and runs it.
    #
    # @param args [Array] forwarded to Request.new (values, then options)
    # @return whatever Request#call returns
    def translate(*args)
      Request.new(*args).call
    end
  end

  # Default namespace used by the Redis cache store and rate limiter.
  CACHE_NAMESPACE = "google-translate-diff".freeze
end
29 |
--------------------------------------------------------------------------------
/lib/google_translate_diff/cache.rb:
--------------------------------------------------------------------------------
# Translation cache for one language pair, backed by
# GoogleTranslateDiff.cache_store.
class GoogleTranslateDiff::Cache
  extend Dry::Initializer

  # Source language; part of every cache key.
  param :from
  # Target language; part of every cache key.
  param :to

  # Looks +values+ up in the cache store.
  #
  # @param values [Array<String>] source strings
  # @return [Array(Array, Array<String>)] the store's answers aligned with
  #   +values+ (nil marks a miss), and the source strings that were missed
  def cached_and_missing(values)
    keys = values.map { |v| key(v) }
    cached = cache_store.read_multi(keys)
    missing = values.select.with_index { |_v, i| cached[i].nil? }

    [cached, missing]
  end

  # Fills the gaps in +cached+ with fresh translations and caches them.
  #
  # @param values [Array<String>] source strings, aligned with +cached+
  # @param cached [Array] result of the lookup; nil marks a miss
  # @param updates [Array] translations for the misses, in order
  # @return [Array] translations aligned with +values+
  def store(values, cached, updates)
    # Consume a copy so the caller's array is left untouched.
    pending = updates.dup

    cached.map.with_index do |value, index|
      value || store_value(values[index], pending.shift)
    end
  end

  private

  # Writes one translation to the store and returns it.
  def store_value(value, translation)
    # A missing translation must not be cached, or the gap would be served
    # from the cache until the entry expires.
    cache_store.write(key(value), translation) unless translation.nil?
    translation
  end

  # Cache key: language pair plus MD5 of the stripped text, so surrounding
  # whitespace does not affect the key.
  def key(value)
    hash = Digest::MD5.hexdigest(value.strip)
    "#{from}:#{to}:#{hash}"
  end

  def cache_store
    GoogleTranslateDiff.cache_store
  end
end
37 |
--------------------------------------------------------------------------------
/lib/google_translate_diff/chunker.rb:
--------------------------------------------------------------------------------
# Groups strings into chunks that each fit into a single API request.
class GoogleTranslateDiff::Chunker
  extend ::Dry::Initializer

  # Raised when a single value is longer than +limit+.
  class Error < StandardError; end

  # A group of values plus the accumulated URI-escaped size of those values.
  Chunk = Struct.new(:values, :size)

  # Default for +limit+.
  MAX_CHUNK_SIZE = 1700
  # Default for +count_limit+.
  COUNT_LIMIT = 120

  # Strings to distribute over chunks.
  param :values
  # Maximum accumulated (URI-escaped) size of a chunk.
  option :limit, default: proc { MAX_CHUNK_SIZE }
  # A new chunk is started once the current one holds more than this many values.
  option :count_limit, default: proc { COUNT_LIMIT }

  # @return [Array<Array<String>>] the values, grouped per chunk
  # @raise [Error] when a value is longer than +limit+
  def call
    chunks.map(&:values)
  end

  # @return [Array<Chunk>] chunks in input order
  # @raise [Error] when a value is longer than +limit+
  def chunks
    values.each_with_object([]) do |value, chunks|
      validate_value_size(value)

      tail = chunks.last

      if next_chunk?(tail, value)
        chunks << Chunk.new([], 0)
        tail = chunks.last
      end

      update_chunk(tail, value)
    end
  end

  private

  # Whether +value+ must open a new chunk instead of joining +tail+.
  # The count check runs before the value is appended, so a chunk can end up
  # holding count_limit + 1 values (the spec relies on this).
  def next_chunk?(tail, value)
    tail.nil? ||
      (size(value) + tail.size > limit) ||
      tail.values.size > count_limit
  end

  # Length of +text+ after URI escaping. URI.encode was removed in Ruby 3.0;
  # URI::DEFAULT_PARSER.escape is the method it used to delegate to.
  def size(text)
    URI::DEFAULT_PARSER.escape(text).size
  end

  # Appends +value+ to +chunk+, tracking the size in the same (escaped) unit
  # that next_chunk? compares against the limit.
  def update_chunk(chunk, value)
    chunk.values << value
    chunk.size += size(value)
  end

  # Rejects values whose raw length alone exceeds the limit.
  def validate_value_size(value)
    raise Error, "Too long part #{value.size} > #{limit}" if value.size > limit
  end
end
55 |
--------------------------------------------------------------------------------
/spec/google_translate_diff/chunker_spec.rb:
--------------------------------------------------------------------------------
require "spec_helper"

RSpec.describe GoogleTranslateDiff::Chunker do
  subject { described_class.new(source, limit: 20, count_limit: 5).call }

  let(:a_word) { (["a"] * 10).join }
  let(:b_word) { (["a"] * 7).join }
  let(:z_word) { (["a"] * 30).join }
  let(:x_word) { "x" }

  shared_examples "chunker" do
    it { is_expected.to eq(chunks) }
  end

  context "not splits if fits" do
    let(:source) { %w[a b c] }
    let(:chunks) { [%w[a b c]] }

    it_behaves_like "chunker"
  end

  context "splits by rough borders" do
    let(:source) { [a_word, a_word, a_word] }
    let(:chunks) { [[a_word, a_word], [a_word]] }

    it_behaves_like "chunker"
  end

  context "splits by non-rough borders" do
    let(:source) { [b_word, b_word, b_word, a_word] }
    let(:chunks) { [[b_word, b_word], [b_word, a_word]] }

    it_behaves_like "chunker"
  end

  # A chunk is closed only once it already holds more than count_limit
  # values, hence six items in the first chunk for count_limit: 5.
  context "splits by count" do
    let(:source) { [x_word] * 10 }
    let(:chunks) { [[x_word] * 6, [x_word] * 4] }

    it_behaves_like "chunker"
  end

  context "raises if part is too long" do
    let(:source) { [z_word] }

    it { expect { subject }.to raise_error(/Too long part/) }
  end
end
50 |
--------------------------------------------------------------------------------
/google_translate_diff.gemspec:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | lib = File.expand_path("../lib", __FILE__)
4 | $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
5 | require "google_translate_diff/version"
6 |
7 | # rubocop:disable Metrics/BlockLength
8 | Gem::Specification.new do |spec|
9 | spec.name = "google_translate_diff"
10 | spec.version = GoogleTranslateDiff::VERSION
11 | spec.authors = ["Victor Sokolov"]
12 | spec.email = ["gzigzigzeo@evilmartians.com"]
13 |
14 | spec.summary = %(
15 | Google Translate API wrapper for Ruby which helps to translate only changes
16 | between revisions of long texts.
17 |
18 | )
19 | spec.description = %(
20 | Google Translate API wrapper for Ruby which helps to translate only changes
21 | between revisions of long texts.
22 | )
23 | spec.homepage = "https://github.com/gzigzigzeo/google_translate_diff"
24 |
25 | if spec.respond_to?(:metadata)
26 | spec.metadata["allowed_push_host"] = "https://rubygems.org"
27 | else
28 | raise "RubyGems 2.0 or newer is required to protect against " \
29 | "public gem pushes."
30 | end
31 |
32 | spec.files = `git ls-files -z`.split("\x0").reject do |f|
33 | f.match(%r{^(test|spec|features)/})
34 | end
35 | spec.bindir = "exe"
36 | spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
37 | spec.require_paths = ["lib"]
38 |
39 | spec.add_development_dependency "bundler", "~> 1.14"
40 | spec.add_development_dependency "rake", "~> 10.0"
41 | spec.add_development_dependency "rspec", "~> 3.0"
42 | spec.add_development_dependency "rubocop"
43 | spec.add_development_dependency "codeclimate-test-reporter", "~> 1.0.0"
44 | spec.add_development_dependency "simplecov"
45 |
46 | spec.add_dependency "google-cloud-translate"
47 | spec.add_dependency "ox"
48 | spec.add_dependency "dry-initializer"
49 | spec.add_dependency "punkt-segmenter"
50 | end
51 | # rubocop:enable Metrics/BlockLength
52 |
--------------------------------------------------------------------------------
/spec/google_translate_diff/request_spec.rb:
--------------------------------------------------------------------------------
1 | require "spec_helper"
2 |
3 | RSpec.describe GoogleTranslateDiff::Request do
4 | subject { described_class.new(values, options).call }
5 |
6 | let(:api) { double("API") }
7 | let(:cache_store) { double("Cache store") }
8 | let(:api_response_wrap) { api_response.map { |v| OpenStruct.new(text: v) } }
9 | let(:cache_response) { nil }
10 |
11 | before do
12 | GoogleTranslateDiff.api = api
13 | GoogleTranslateDiff.cache_store = cache_store
14 |
15 | allow(cache_store).to receive(:read_multi) do |keys|
16 | cache_response || ([nil] * keys.size)
17 | end
18 |
19 | allow(cache_store).to receive(:write) { |_, value| value }
20 |
21 | allow(api).to receive(:translate).with(*api_request, options).and_return(
22 | api_response_wrap
23 | )
24 | end
25 |
26 | context "simple case" do
27 | let(:values) { "Some string" }
28 | let(:options) { { from: :en, to: :ru } }
29 | let(:api_request) { ["Some string"] }
30 | let(:api_response) { ["Какая-то строка"] }
31 |
32 | it { is_expected.to eq("Какая-то строка") }
33 | end
34 |
35 | context "complex structure, simple case" do
36 | let(:values) { { title: "One", description: "Two" } }
37 | let(:options) { { from: :en, to: :ru } }
38 | let(:api_request) { %w[One Two] }
39 | let(:api_response) { %w[Один Два] }
40 |
41 | it { is_expected.to eq(title: "Один", description: "Два") }
42 | end
43 |
44 | context "complex structure" do
45 | let(:values) { { title: "One", more: { description: "Two" }, skip: nil } }
46 | let(:options) { { from: :en, to: :ru } }
47 | let(:api_request) { %w[One Two] }
48 | let(:api_response) { %w[Один Два] }
49 |
50 | it do
51 | is_expected.to eq(title: "Один", more: { description: "Два" }, skip: "")
52 | end
53 | end
54 |
55 | context "HTML" do
56 | let(:values) do
57 | {
58 | title: "One",
59 | more: {
60 | description: "Black",
61 | color: %(So Red
62 | that)
63 | }
64 | }
65 | end
66 | let(:options) { { from: :en, to: :ru } }
67 | let(:api_request) { %w[One Black So Red that] }
68 | let(:api_response) { %w[Один Черный Что Кра что] }
69 |
70 | it do
71 | is_expected.to eq(
72 | title: "Один",
73 | more: {
74 | description: "Черный",
75 | color: %(Что Кра
76 | что)
77 | }
78 | )
79 | end
80 | end
81 | end
82 |
--------------------------------------------------------------------------------
/lib/google_translate_diff/tokenizer.rb:
--------------------------------------------------------------------------------
# Ox SAX handler that cuts a source string into [value, type] tokens, where
# type is :text or :markup. Each SAX callback records the kind of the piece
# that starts at the current parser position; the pieces are sliced out of
# the source afterwards and runs of text are split into sentences.
class GoogleTranslateDiff::Tokenizer < ::Ox::Sax
  # Elements whose text content is treated as markup.
  SKIP = %i[script style].freeze
  # Sequence kinds that are emitted as :text tokens.
  INNER_SPANS = %i[notranslate span end_span end_notranslate].freeze
  # Options handed to Ox.sax_parse.
  HTML_OPTIONS = { smart: true, skip: :skip_none }.freeze

  # Parses +value+ and returns its tokens; nil yields no tokens.
  def self.tokenize(value)
    return [] if value.nil?

    handler = new(value)
    Ox.sax_parse(handler, StringIO.new(value), HTML_OPTIONS)
    handler.tokens
  end

  def initialize(source)
    @source = source
    @pos = nil       # read in the callbacks as the current parser position
    @tokens = nil
    @context = []    # stack of currently open element names
    @sequence = []   # kind of every recorded piece, in document order
    @indicies = []   # start offset (@pos - 1) of every recorded piece
  end

  def instruct(target)
    start_markup(target)
  end

  def end_instruct(target)
    end_markup(target)
  end

  def start_element(name)
    start_markup(name)
  end

  def end_element(name)
    end_markup(name)
  end

  # Re-labels the piece recorded last as :notranslate when a span carries
  # class="notranslate" and no notranslate section is open yet.
  def attr(name, value)
    inside_span = @context.last == :span
    marker = name == :class && value == "notranslate"
    return unless inside_span && marker && !notranslate?

    @sequence[-1] = :notranslate
  end

  # Records a text piece; text inside SKIP elements counts as markup.
  def text(value)
    return if value == ""

    kind = SKIP.include?(@context.last) ? :markup : :text
    @sequence << kind
    @indicies << @pos - 1
  end

  # Final token list: adjacent pieces of one type merged, text split into
  # sentences.
  def tokens
    @tokens ||= begin
      joined = token_sequences_joined
      make_sentences_from_last_token(joined)
      joined
    end
  end

  private

  def token_sequences_joined
    joined = []

    raw_tokens.each do |token|
      previous = joined.last

      if previous && previous[1] == token[1]
        # Same type as the token before it: glue the values together
        previous[0].concat(token[0])
      else
        # Type changes (or first token): a finished :text token is split
        # into sentences before the next token is appended
        make_sentences_from_last_token(joined)
        joined << token
      end
    end

    joined
  end

  # Replaces a trailing :text token with its sentences.
  def make_sentences_from_last_token(tokens)
    last_token = tokens.last
    return if last_token.nil? || last_token[1] != :text

    tokens.pop
    tokens.concat(sentences(last_token[0]))
  end

  # rubocop: disable Metrics/MethodLength
  # Splits +value+ into sentence tokens; blank text produces no tokens.
  def sentences(value)
    return [] if value.strip.empty?

    boundaries =
      Punkt::SentenceTokenizer
      .new(value)
      .sentences_from_text(value)

    return [[value, :text]] if boundaries.size == 1

    boundaries.each_with_index.map do |(left, right), index|
      following = boundaries[index + 1]
      # Stretch the sentence up to the start of the next one
      right = following[0] - 1 if following

      [value[left..right], :text]
    end
  end
  # rubocop:enable Metrics/MethodLength

  # Whether the sequence is between `:notranslate` and `:end_notranslate`
  def notranslate?
    marker = @sequence.reverse_each.find { |item| item[/notranslate/] }
    marker == :notranslate
  end

  # Returns the item for last opened span
  def end_span
    return :markup unless notranslate?

    balance = 0
    @sequence.reverse_each do |item|
      break if item == :notranslate

      balance += 1 if item == :span
      balance -= 1 if item == :end_span
    end

    balance.positive? ? :end_span : :end_notranslate
  end

  # Slices the source between consecutive recorded offsets and pairs every
  # slice with its kind (inner span kinds become :text).
  def raw_tokens
    @raw_tokens ||= @indicies.each_with_index.map do |start, n|
      stop = (@indicies[n + 1] || 0) - 1
      piece = fix_utf(@source.byteslice(start..stop))
      kind = @sequence[n]

      [piece, INNER_SPANS.include?(kind) ? :text : kind]
    end
  end

  def fix_utf(value)
    value.encode("UTF-8", undef: :replace, invalid: :replace, replace: " ")
  end

  def start_markup(name)
    @context << name

    kind =
      if !notranslate?
        :markup
      elsif name == :span
        :span
      else
        :text
      end

    @sequence << kind
    @indicies << @pos - 1
  end

  def end_markup(name)
    @context.pop

    kind =
      if !notranslate?
        :markup
      elsif name == :span
        end_span
      else
        :text
      end

    @sequence << kind
    @indicies << @pos - 1 unless @pos == @source.bytesize
  end
end
146 |
--------------------------------------------------------------------------------
/lib/google_translate_diff/request.rb:
--------------------------------------------------------------------------------
# Translates a string, array or nested hash: texts are flattened, tokenized
# into markup and text parts, text parts are served from the cache or sent
# to the API in chunks, and the result is reassembled in the original shape.
class GoogleTranslateDiff::Request
  extend Dry::Initializer
  extend Forwardable

  # String, array or (nested) hash of texts to translate.
  param :values
  # Hash with :from and :to; it is also passed on to api.translate.
  param :options

  def_delegators :GoogleTranslateDiff, :api, :cache_store, :rate_limiter
  def_delegators :"GoogleTranslateDiff::Linearizer", :linearize, :restore

  # @return the translated structure, or +values+ untouched when source and
  #   target language are equal or +values+ is empty
  # @raise [RuntimeError] when api or cache_store is not configured
  # @raise [KeyError] when :from or :to is missing from the options
  def call
    validate_globals

    return values if from == to || values.empty?

    translation
  end

  private

  def from
    @from ||= options.fetch(:from)
  end

  def to
    @to ||= options.fetch(:to)
  end

  def validate_globals
    raise "Set GoogleTranslateDiff.api before calling ::translate" unless api
    return if cache_store
    raise "Set GoogleTranslateDiff.cache_store before calling ::translate"
  end

  # Extracts flat text array
  # => "Name", "Good boy"
  #
  # #values might be something like { name: "Name", bio: "Good boy" }
  def texts
    @texts ||= linearize(values)
  end

  # Converts each array item to token list
  # => [..., [["", :markup], ["Good", :text], ...]]
  def tokens
    @tokens ||= texts.map do |value|
      GoogleTranslateDiff::Tokenizer.tokenize(value)
    end
  end

  # Extracts text tokens from token list, keyed by "<group>_<index>"
  # => { ..., "1_1" => "Good", "1_3" => "Boy", ... }
  def text_tokens
    @text_tokens ||= extract_text_tokens.to_h
  end

  def extract_text_tokens
    tokens.each_with_object([]).with_index do |(group, result), group_index|
      group.each_with_index do |(value, type), index|
        result << ["#{group_index}_#{index}", value] if type == :text
      end
    end
  end

  # Extracts stripped values from text tokens
  # => [ ..., "Good", "Boy", ... ]
  def text_tokens_texts
    @text_tokens_texts ||= linearize(text_tokens).map(&:to_s).map(&:strip)
  end

  # Splits the texts that need translation into per-request chunks
  # => [[ ..., "Good", "Boy", ... ]]
  def chunks
    @chunks ||= GoogleTranslateDiff::Chunker.new(text_tokens_texts).call
  end

  # Translates/loads from cache values from each chunk
  # => [[ ..., "Horoshiy", "Malchik", ... ]]
  def chunks_translated
    @chunks_translated ||= chunks.map do |chunk|
      cached, missing = cache.cached_and_missing(chunk)
      if missing.empty?
        cached
      else
        cache.store(chunk, cached, call_api(missing))
      end
    end
  end

  # Restores indexes for translated tokens
  # => { ..., "1_1" => "Horoshiy", "1_3" => "Malchik", ... }
  def text_tokens_translated
    @text_tokens_translated ||=
      restore(text_tokens, chunks_translated.flatten)
  end

  # Restores tokens translated + adds same spacing as in source token
  # => [[..., [ "Horoshiy", :text ], ...]]
  def tokens_translated
    @tokens_translated ||= copy_tokens.tap do |copy|
      text_tokens_translated.each do |key, value|
        group_index, token_index = key.split("_").map(&:to_i)
        token = copy[group_index][token_index]
        token[0] = restore_spacing(token[0], value)
      end
    end
  end

  # Copies the token list down to the individual [value, type] pairs, so
  # writing translated values does not alter the memoized source tokens
  # (a plain #dup would share the inner arrays).
  def copy_tokens
    tokens.map { |group| group.map(&:dup) }
  end

  def restore_spacing(source_value, value)
    GoogleTranslateDiff::Spacing.restore(source_value, value)
  end

  # Restores texts from tokens
  # [..., "Horoshiy Malchik", ...]
  def texts_translated
    @texts_translated ||= tokens_translated.map do |group|
      group.map { |value, type| type == :text ? value : fix_ascii(value) }.join
    end
  end

  # Final result: translated texts put back into the shape of #values
  def translation
    @translation ||= restore(values, texts_translated)
  end

  # Sends +values+ to the API (after the optional rate limit check) and
  # returns the #text of every result.
  def call_api(values)
    check_rate_limit(values)
    [api.translate(*values, **options)].flatten.map(&:text)
  end

  def cache
    @cache ||= GoogleTranslateDiff::Cache.new(from, to)
  end

  # Reports the summed length of +values+ to the rate limiter, if one is set.
  def check_rate_limit(values)
    return if rate_limiter.nil?
    size = values.map(&:size).reduce(0, :+)
    rate_limiter.check(size)
  end

  # Markup should not contain control characters
  def fix_ascii(value)
    value.gsub(/[\u0000-\u001F]/, " ")
  end
end
149 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # GoogleTranslateDiff
2 |
3 | A Google Translate API wrapper that helps translate only the changes between revisions of long texts.
4 |
5 |
6 |
7 |
8 |
9 | [Build Status](https://travis-ci.org/gzigzigzeo/google_translate_diff) [Code Climate](https://codeclimate.com/github/gzigzigzeo/google_translate_diff) [Test Coverage](https://codeclimate.com/github/gzigzigzeo/google_translate_diff/coverage)
10 |
11 | ## Use case
12 |
13 | Assume your project contains a significant number of product descriptions which:
14 | - Require retranslation each time user edits them.
15 | - Have a lot of equal parts (like return policy).
16 | - Change frequently.
17 |
18 | If your user changes a single word within the long description, you will be charged for the retranslation of the whole text.
19 |
20 | A much better approach is to translate every repeated structural element (sentence) in your texts just once to save money. This gem helps you do exactly that.
21 |
22 | ## Installation
23 |
24 | Add this line to your application's Gemfile:
25 |
26 | ```ruby
27 | gem 'google_translate_diff'
28 | ```
29 |
30 | And then execute:
31 |
32 | $ bundle
33 |
34 | Or install it yourself as:
35 |
36 | $ gem install google_translate_diff
37 |
38 | ## Usage
39 |
40 | ```ruby
41 | require "google_translate_diff"
42 |
43 | # These dependencies are not included, as you might want to roll your own cache based on a different store
44 | require "redis"
45 | require "connection_pool"
46 | require "redis-namespace"
47 | require "ratelimit" # Optional, only needed if you use RedisRateLimiter
48 |
49 | # Setup https://github.com/GoogleCloudPlatform/google-cloud-ruby/tree/master/google-cloud-translate
50 | ENV["TRANSLATE_KEY"] = "foobarkey"
51 |
52 | # I always use pool for redis
53 | pool = ConnectionPool.new(size: 10, timeout: 5) { Redis.new }
54 |
55 | # Pass any options (like app id)
56 | GoogleTranslateDiff.api = Google::Cloud::Translate.new
57 |
58 | GoogleTranslateDiff.cache_store =
59 | GoogleTranslateDiff::RedisCacheStore.new(pool, timeout: 7.days, namespace: "t")
60 |
61 | # Optional
62 | GoogleTranslateDiff.rate_limiter =
63 | GoogleTranslateDiff::RedisRateLimiter.new(
64 | pool, 8000, 60, namespace: "t" # threshold and interval are positional
65 | )
66 |
67 | GoogleTranslateDiff.translate("test translations", from: "en", to: "es")
68 | ```
69 |
70 | ## How it works
71 |
72 | - Text nodes are extracted from HTML.
73 | - Every text node is split into sentences (using `punkt-segmenter` gem).
74 | - Cache is checked for the presence of each sentence (using the language pair and a hash of the string).
75 | - Missing sentences are translated via API and cached.
76 | - Original HTML is recombined from translations and cache data.
77 |
78 | *NOTE:* `:from` is a required param. The cache cannot be checked without specifying the exact language pair; that is the limitation.
79 |
80 | ## Input
81 |
82 | `::translate` can receive string, array or deep hash and will return the same, but translated.
83 |
84 | ```ruby
85 | GoogleTranslateDiff.translate("test", from: "en", to: "es")
86 | GoogleTranslateDiff.translate(["test", "language"], from: "en", to: "es")
87 | GoogleTranslateDiff.translate(
88 | { title: "test", values: { type: "frequent" } }, from: "en", to: "es"
89 | )
90 | ```
91 |
92 | See `GoogleTranslateDiff::Linearizer` for details.
93 |
94 | ## HTML
95 |
96 | You can pass HTML just like plain text:
97 |
98 | ```ruby
99 | GoogleTranslateDiff.translate("<b>Black</b>", from: "en", to: "es")
100 | ```
101 |
102 | ## Very long texts
103 |
104 | The Google API has a limitation: a query cannot be longer than approximately 4 KB. If your text is longer than that, it is automatically translated using multiple queries.
105 |
106 | ## Development
107 |
108 | After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
109 |
110 | To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
111 |
112 | ## Contributing
113 |
114 | Bug reports and pull requests are welcome on GitHub at https://github.com/gzigzigzeo/google_translate_diff.
115 |
--------------------------------------------------------------------------------
/spec/google_translate_diff/tokenizer_spec.rb:
--------------------------------------------------------------------------------
1 | require "spec_helper"
2 |
3 | RSpec.describe GoogleTranslateDiff::Tokenizer do
4 | subject do
5 | described_class.tokenize(source)
6 | end
7 |
8 | shared_examples "tokenizer" do
9 | it { expect(subject).to eq(tokens) }
10 | end
11 |
12 | context "empty" do
13 | let(:source) { "" }
14 | let(:tokens) { [] }
15 |
16 | it_behaves_like "tokenizer"
17 | end
18 |
19 | context "pure crlf" do
20 | let(:source) { "
\n
" }
21 | let(:tokens) { [["", :markup], ["
", :markup]] }
22 |
23 | it_behaves_like "tokenizer"
24 | end
25 |
26 | context "pure text" do
27 | let(:source) { "test\nphrase" }
28 | let(:tokens) { [[source, :text]] }
29 |
30 | it_behaves_like "tokenizer"
31 | end
32 |
33 | context "with some markup ending with text" do
34 | let(:source) { "alfabravokilo" }
35 | let(:tokens) do
36 | [
37 | ["alfa", :text],
38 | ["", :markup],
39 | ["bravo", :text],
40 | ["", :markup],
41 | ["kilo", :text]
42 | ]
43 | end
44 |
45 | it_behaves_like "tokenizer"
46 | end
47 |
48 | context "with some markup ending with tag" do
49 | let(:source) { "alfabravo" }
50 | let(:tokens) do
51 | [
52 | ["alfa", :text],
53 | ["", :markup],
54 | ["bravo", :text],
55 | ["", :markup]
56 | ]
57 | end
58 |
59 | it_behaves_like "tokenizer"
60 | end
61 |
62 | context "with some markup ending with script and non-ascii" do
63 | let(:source) { "альбракил" }
64 | let(:tokens) do
65 | [
66 | ["аль", :text],
67 | ["", :markup],
68 | ["бра", :text],
69 | ["", :markup],
70 | ["кил", :text],
71 | ["", :markup]
72 | ]
73 | end
74 |
75 | it_behaves_like "tokenizer"
76 | end
77 |
78 | context "sentences" do
79 | let(:source) do
80 | "! Киловольт. Смеркалось. Ворчало. Кричало."
81 | end
82 |
83 | let(:tokens) do
84 | [
85 | ["! ", :text],
86 | ["Киловольт. ", :text],
87 | ["", :markup],
88 | ["Смеркалось. ", :text],
89 | ["Ворчало. ", :text],
90 | ["Кричало.", :text],
91 | ["", :markup]
92 | ]
93 | end
94 |
95 | it_behaves_like "tokenizer"
96 | end
97 |
98 | context "notranslate" do
99 | let(:source) do
100 | "test
101 | xy"
102 | end
103 |
104 | let(:tokens) do
105 | [
106 | ["test", :text],
107 | ["", :markup],
108 | ["\nxy", :text],
109 | ["", :markup]
110 | ]
111 | end
112 |
113 | it_behaves_like "tokenizer"
114 | end
115 |
116 | context "notranslate inside another span" do
117 | let(:source) do
118 | "foobar
baz"
119 | end
120 |
121 | let(:tokens) do
122 | [
123 | ["", :markup],
124 | ["foobar
baz", :text],
125 | ["", :markup]
126 | ]
127 | end
128 |
129 | it_behaves_like "tokenizer"
130 | end
131 |
132 | context "notranslate inside another notranslate" do
133 | let(:source) do
134 | "foo" \
135 | "barbaz" \
136 | ""
137 | end
138 |
139 | let(:tokens) do
140 | [
141 | [source, :text]
142 | ]
143 | end
144 |
145 | it_behaves_like "tokenizer"
146 | end
147 |
148 | context "with
tag before closing tag" do
149 | let(:source) do
150 | "Смеркалось.
"
151 | end
152 |
153 | let(:tokens) do
154 | [
155 | ["", :markup],
156 | ["Смеркалось.", :text],
157 | ["
", :markup]
158 | ]
159 | end
160 |
161 | it_behaves_like "tokenizer"
162 | end
163 |
164 | context "with tag" do
165 | let(:source) do
166 | "Hey!
Look!" \
167 | ""
168 | end
169 |
170 | let(:tokens) do
171 | [
172 | ["Hey!", :text],
173 | ["
", :markup],
174 | ["Look!", :text],
175 | ["", :markup]
176 | ]
177 | end
178 |
179 | it_behaves_like "tokenizer"
180 | end
181 |
182 | context "bizarre sentences" do
183 | let(:source) do
184 | "Набор «Солнечная механика» от 4М — это 6 экспериментов." \
185 | "\n\nЮному изобретателю предстоит воочию посмотреть на чудеса."
186 | end
187 |
188 | let(:tokens) do
189 | [
190 | ["Набор «Солнечная механика» от 4М — это 6 экспериментов.\n\n", :text],
191 | ["Юному изобретателю предстоит воочию посмотреть на чудеса.", :text]
192 | ]
193 | end
194 |
195 | it_behaves_like "tokenizer"
196 | end
197 | end
198 |
--------------------------------------------------------------------------------