├── .gitignore ├── .formatter.exs ├── test ├── lz_string_test.exs ├── test_helper.exs └── lz_string │ └── reference_test.exs ├── LICENSE ├── mix.exs ├── README.md ├── mix.lock └── lib ├── lz_string └── base64.ex └── lz_string.ex /.gitignore: -------------------------------------------------------------------------------- 1 | /_build 2 | /cover 3 | /deps 4 | erl_crash.dump 5 | *.ez 6 | node_modules -------------------------------------------------------------------------------- /.formatter.exs: -------------------------------------------------------------------------------- 1 | [ 2 | inputs: ["{mix,.formatter}.exs", "{config,lib,test}/**/*.{ex,exs}"] 3 | ] 4 | -------------------------------------------------------------------------------- /test/lz_string_test.exs: -------------------------------------------------------------------------------- 1 | defmodule LzStringTest do 2 | use ExUnit.Case, async: true 3 | import TestHelper 4 | import LZString 5 | doctest LZString 6 | 7 | test "roundtrip repeated single-byte strings" do 8 | Enum.each(1..2000, &assert_roundtrip(String.pad_trailing("", &1, "a"))) 9 | end 10 | 11 | test "roundtrip repeated multi-byte char strings" do 12 | Enum.each(1..2000, &assert_roundtrip(String.pad_trailing("", &1, "猫"))) 13 | end 14 | 15 | test "roundtrip random high entropy strings" do 16 | Enum.each(1..1000, fn _ -> 17 | 1000 18 | |> random_string 19 | |> assert_roundtrip 20 | end) 21 | end 22 | 23 | test "roundtrip random large low entropy string" do 24 | 1_000_000 25 | |> :crypto.strong_rand_bytes() 26 | |> Base.encode16() 27 | |> assert_roundtrip 28 | end 29 | 30 | test "compress/1 should be able to handle every valid utf8 character that fits in two bytes" do 31 | valid_utf8_char_ranges() 32 | |> Enum.flat_map(fn range -> Enum.map(range, &<<&1::utf8>>) end) 33 | |> :erlang.list_to_binary() 34 | |> assert_roundtrip 35 | end 36 | end 37 | -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Michael Shapiro 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /mix.exs: -------------------------------------------------------------------------------- 1 | defmodule LzString.Mixfile do 2 | use Mix.Project 3 | 4 | @version "0.0.8" 5 | 6 | def project do 7 | [ 8 | app: :lz_string, 9 | version: @version, 10 | elixir: "~> 1.2", 11 | package: package(), 12 | docs: docs(), 13 | deps: deps(), 14 | description: "Elixir implementation of pieroxy's lz-string compression algorithm." 
15 | ] 16 | end 17 | 18 | # Configuration for the OTP application 19 | # 20 | # Type "mix help compile.app" for more information 21 | def application do 22 | [] 23 | end 24 | 25 | # Dependencies can be Hex packages: 26 | # 27 | # {:mydep, "~> 0.3.0"} 28 | # 29 | # Or git/path repositories: 30 | # 31 | # {:mydep, git: "https://github.com/elixir-lang/mydep.git", tag: "0.1.0"} 32 | # 33 | # Type "mix help deps" for more examples and options 34 | defp deps() do 35 | [ 36 | {:ex_doc, ">= 0.0.0", only: :dev, runtime: false} 37 | ] 38 | end 39 | 40 | defp package do 41 | [ 42 | maintainers: ["Michael Shapiro"], 43 | licenses: ["MIT"], 44 | links: %{GitHub: "https://github.com/koudelka/elixir-lz-string"} 45 | ] 46 | end 47 | 48 | defp docs do 49 | [extras: ["README.md"], 50 | source_url: "https://github.com/koudelka/elixir-lz-string", 51 | source_ref: @version, 52 | assets: "assets", 53 | main: "readme"] 54 | end 55 | end 56 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # LZString 2 | 3 | An Elixir implementation of [pieroxy/lz-string](https://github.com/pieroxy/lz-string), an LZ-based compression algorithm. 
4 | 5 | ```elixir 6 | iex> LZString.compress("hello, i am a 猫") 7 | <<5, 133, 48, 54, 96, 246, 3, 64, 4, 9, 107, 2, 24, 22, 217, 180, 53, 51, 144, 0>> 8 | 9 | iex> LZString.decompress(<<5, 133, 48, 54, 96, 246, 3, 64, 4, 9, 107, 2, 24, 22, 217, 180, 53, 51, 144, 0>>) 10 | "hello, i am a 猫" 11 | 12 | iex> LZString.compress_base64("hello, i am a 猫") 13 | "BYUwNmD2A0AECWsCGBbZtDUzkAA=" 14 | 15 | iex> LZString.decompress_base64("BYUwNmD2A0AECWsCGBbZtDUzkA==") 16 | "hello, i am a 猫" 17 | ``` 18 | 19 | ## Installation 20 | 21 | Add lz_string to your list of dependencies in `mix.exs`: 22 | 23 | ```elixir 24 | def deps do 25 | [{:lz_string, "~> 0.0.8"}] 26 | end 27 | ``` 28 | 29 | ## Running Tests 30 | The tests compare LZString's output against that produced by the JS reference implementation (by way of a janky node.js `Port`). You'll need to install the node module in the root directory of the project beforehand: 31 | 32 | ``` 33 | $ npm install lz-string 34 | ``` 35 | 36 | Depending on how the port output is flushed, you'll occasionally get an error that complains about something like `:erlang.binary_to_integer("64'\n>")`. Those errors are safe to ignore; just re-run the tests. I'll gladly accept a PR to switch to https://github.com/awetzel/node_erlastic <3. 37 | 38 | ## Base64 39 | The Base64 that the reference library produces is not valid Base64. It is still usable, because the end of a message is signalled by a dictionary marker rather than by the actual end of the input, so the difference is largely academic. This library properly decompresses the reference library's invalid Base64, and produces valid Base64 output during compression. 
40 | -------------------------------------------------------------------------------- /mix.lock: -------------------------------------------------------------------------------- 1 | %{ 2 | "earmark_parser": {:hex, :earmark_parser, "1.4.26", "f4291134583f373c7d8755566122908eb9662df4c4b63caa66a0eabe06569b0a", [:mix], [], "hexpm", "48d460899f8a0c52c5470676611c01f64f3337bad0b26ddab43648428d94aabc"}, 3 | "ex_doc": {:hex, :ex_doc, "0.28.4", "001a0ea6beac2f810f1abc3dbf4b123e9593eaa5f00dd13ded024eae7c523298", [:mix], [{:earmark_parser, "~> 1.4.19", [hex: :earmark_parser, repo: "hexpm", optional: false]}, {:makeup_elixir, "~> 0.14", [hex: :makeup_elixir, repo: "hexpm", optional: false]}, {:makeup_erlang, "~> 0.1", [hex: :makeup_erlang, repo: "hexpm", optional: false]}], "hexpm", "bf85d003dd34911d89c8ddb8bda1a958af3471a274a4c2150a9c01c78ac3f8ed"}, 4 | "makeup": {:hex, :makeup, "1.1.0", "6b67c8bc2882a6b6a445859952a602afc1a41c2e08379ca057c0f525366fc3ca", [:mix], [{:nimble_parsec, "~> 1.2.2 or ~> 1.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "0a45ed501f4a8897f580eabf99a2e5234ea3e75a4373c8a52824f6e873be57a6"}, 5 | "makeup_elixir": {:hex, :makeup_elixir, "0.16.0", "f8c570a0d33f8039513fbccaf7108c5d750f47d8defd44088371191b76492b0b", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}, {:nimble_parsec, "~> 1.2.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "28b2cbdc13960a46ae9a8858c4bebdec3c9a6d7b4b9e7f4ed1502f8159f338e7"}, 6 | "makeup_erlang": {:hex, :makeup_erlang, "0.1.1", "3fcb7f09eb9d98dc4d208f49cc955a34218fc41ff6b84df7c75b3e6e533cc65f", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}], "hexpm", "174d0809e98a4ef0b3309256cbf97101c6ec01c4ab0b23e926a9e17df2077cbb"}, 7 | "nimble_parsec": {:hex, :nimble_parsec, "1.2.3", "244836e6e3f1200c7f30cb56733fd808744eca61fd182f731eac4af635cc6d0b", [:mix], [], "hexpm", "c8d789e39b9131acf7b99291e93dae60ab48ef14a7ee9d58c6964f59efb570b0"}, 8 | 
"parallel": {:git, "https://github.com/eproxus/parallel.git", "08337182573befc55f4aea835b0f68c686d57002", []}, 9 | } 10 | -------------------------------------------------------------------------------- /lib/lz_string/base64.ex: -------------------------------------------------------------------------------- 1 | defmodule LZString.Base64 do 2 | "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=" 3 | |> String.to_charlist() 4 | |> Enum.with_index() 5 | |> Enum.each(fn {c, i} -> 6 | def base64_to_bitstring(unquote(c)), do: <> 7 | end) 8 | 9 | defmacro __using__(_env) do 10 | quote do 11 | @doc ~S""" 12 | Compresses the given string and base64 encodes it. 13 | 14 | iex> LZString.compress_base64("hello, i am a 猫") 15 | "BYUwNmD2A0AECWsCGBbZtDUzkAA=" 16 | """ 17 | def compress_base64(str) do 18 | str |> compress |> Base.encode64() 19 | end 20 | 21 | @doc ~S""" 22 | Decompresses the given string after decoding lz-string's non-standard base64. 23 | 24 | iex> LZString.decompress_base64("BYUwNmD2A0AECWsCGBbZtDUzkA==") 25 | "hello, i am a 猫" 26 | """ 27 | def decompress_base64(str) do 28 | str |> decode_base64 |> decompress 29 | end 30 | 31 | @doc ~S""" 32 | Decodes the given "base64" string, giving a naked lz-string bitstring. 33 | 34 | iex> LZString.decode_base64("BYUwNmD2A0AECWsCGBbZtDUzkA==") 35 | <<5, 133, 48, 54, 96, 246, 3, 64, 4, 9, 107, 2, 24, 22, 217, 180, 53, 51, 144, 0, 0>> 36 | """ 37 | def decode_base64(str) do 38 | for <>, into: <<>> do 39 | LZString.Base64.base64_to_bitstring(c) 40 | end 41 | end 42 | 43 | @doc ~S""" 44 | Compresses the given string and base64 encodes it, substituting uri-unsafe characters. 
45 | 46 | iex> LZString.compress_uri_encoded("hello, i am a 猫") 47 | "BYUwNmD2A0AECWsCGBbZtDUzkAA$" 48 | """ 49 | def compress_uri_encoded(str) do 50 | str 51 | |> compress_base64 52 | |> String.replace("/", "-") 53 | |> String.replace("=", "$") 54 | end 55 | 56 | @doc ~S""" 57 | Decompresses the given "uri encoded" base64 compressed string. 58 | 59 | iex> LZString.decompress_uri_encoded("BYUwNmD2A0AECWsCGBbZtDUzkAA$") 60 | "hello, i am a 猫" 61 | """ 62 | def decompress_uri_encoded(str) do 63 | str 64 | |> String.replace("-", "/") 65 | |> String.replace("$", "=") 66 | |> decompress_base64 67 | end 68 | 69 | @doc ~S""" 70 | Decodes the given "uri encoded" base64 string, giving a naked lz-string bitstring. 71 | 72 | iex> LZString.decode_uri_encoded("BYUwNmD2A0AECWsCGBbZtDUzkA$$") 73 | <<5, 133, 48, 54, 96, 246, 3, 64, 4, 9, 107, 2, 24, 22, 217, 180, 53, 51, 144, 0, 0>> 74 | """ 75 | def decode_uri_encoded(str) do 76 | str 77 | |> String.replace("-", "/") 78 | |> String.replace("$", "=") 79 | |> decode_base64 80 | end 81 | end 82 | end 83 | end 84 | -------------------------------------------------------------------------------- /test/test_helper.exs: -------------------------------------------------------------------------------- 1 | defmodule TestHelper do 2 | # chars in the "surrogate pair" range are invalid on their own 3 | @surrogate_pair_start "D800" |> :erlang.binary_to_integer(16) |> Kernel.-(1) 4 | @surrogate_pair_stop "DFFF" |> :erlang.binary_to_integer(16) |> Kernel.+(1) 5 | # UCS-2 is limited to 16 bits 6 | @max_two_byte :math.pow(2, 16) |> trunc |> Kernel.-(1) 7 | 8 | @valid_utf8_char_ranges [0..@surrogate_pair_start, @surrogate_pair_stop..@max_two_byte] 9 | 10 | defmacro assert_roundtrip(str) do 11 | quote do 12 | str = unquote(str) 13 | roundtrip_str = str |> compress |> decompress 14 | assert roundtrip_str == str 15 | end 16 | end 17 | 18 | defmacro assert_same_as_node_compress(str, port) do 19 | quote do 20 | str = unquote(str) 21 | assert 
compress(str) == compress_to_binary_with_node(unquote(port), str), str 22 | end 23 | end 24 | 25 | def valid_utf8_char_ranges do 26 | @valid_utf8_char_ranges 27 | end 28 | 29 | def random_string(size) do 30 | chars = Stream.repeatedly(&random_utf8_char/0) 31 | chars |> Enum.take(size) |> :erlang.list_to_binary() 32 | end 33 | 34 | def random_utf8_char do 35 | <<random_int_in_range()::utf8>> 36 | end 37 | 38 | def random_int_in_range do 39 | int = :rand.uniform(@max_two_byte) 40 | 41 | if int_in_range?(int) do 42 | int 43 | else 44 | random_int_in_range() 45 | end 46 | end 47 | 48 | defp int_in_range?(i) do 49 | i <= @surrogate_pair_start || (@surrogate_pair_stop <= i && i <= @max_two_byte) 50 | end 51 | 52 | # node reference implementation interactions 53 | # this isn't perfect, but it's ok for our purposes 54 | 55 | def compress_to_base64_with_node(port, str) do 56 | str = String.replace(str, "'", "\\'") 57 | 58 | repl_eval(port, "LZString.compressToBase64('#{str}')") 59 | |> String.trim("'") 60 | end 61 | 62 | def decompress_base64_with_node(str, port) do 63 | repl_eval(port, "LZString.decompressFromBase64('#{str}')") 64 | |> String.trim("'") 65 | end 66 | 67 | def compress_to_binary_with_node(port, str) do 68 | str = String.replace(str, "'", "\\'") 69 | 70 | repl_eval(port, "LZString.compressToUint8Array('#{str}').join(',')") 71 | |> String.trim("'") 72 | |> String.split(",") 73 | |> Enum.map(&String.to_integer/1) 74 | |> :erlang.list_to_binary() 75 | end 76 | 77 | def lz_string_node_port do 78 | port = Port.open({:spawn, "node -i"}, [:binary]) 79 | repl_eval(port, "LZString = require('lz-string')") 80 | # Clear buffer of output from require() 81 | wait_for_result(port) 82 | port 83 | end 84 | 85 | defp repl_eval(port, str) do 86 | Port.command(port, str) 87 | Port.command(port, "\n") 88 | wait_for_result(port) 89 | end 90 | 91 | defp wait_for_result(port) do 92 | receive do 93 | {^port, {:data, "> "}} -> wait_for_result(port) 94 | {^port, {:data, "> " <> rest}} -> rest 95 | {^port, {:data, result}} -> result 96 | 
end 97 | |> String.replace(~r/\s*\>?\s*$/, "") 98 | end 99 | end 100 | 101 | ExUnit.start() 102 | -------------------------------------------------------------------------------- /test/lz_string/reference_test.exs: -------------------------------------------------------------------------------- 1 | defmodule LzStringTest.Reference.Test do 2 | use ExUnit.Case, async: true 3 | import TestHelper 4 | import LZString 5 | 6 | setup do 7 | {:ok, port: TestHelper.lz_string_node_port()} 8 | end 9 | 10 | test "compress/1 should match output from the reference implementation for random strings", %{ 11 | port: port 12 | } do 13 | Enum.each(1..2000, fn _ -> 14 | 1000 15 | |> :crypto.strong_rand_bytes() 16 | |> Base.encode16() 17 | |> assert_same_as_node_compress(port) 18 | end) 19 | end 20 | 21 | test "compress/1 should match output from the reference implementation for repeated ascii", %{ 22 | port: port 23 | } do 24 | Enum.each(1..2000, &assert_same_as_node_compress(String.pad_trailing("", &1, "a"), port)) 25 | end 26 | 27 | test "compress/1 should match output from the reference implementation for repeated multibyte char strings", 28 | %{port: port} do 29 | Enum.each(1..2000, &assert_same_as_node_compress(String.pad_trailing("", &1, "猫"), port)) 30 | end 31 | 32 | # compress using node, and decompress using elixir 33 | 34 | test "decompress/1 should be able to decompress random strings from the reference implementation's compressToUint8Array/1", 35 | %{port: port} do 36 | Enum.each(1..2000, fn _ -> 37 | str = 38 | 1000 39 | |> :crypto.strong_rand_bytes() 40 | |> Base.encode16() 41 | 42 | assert str == compress_to_binary_with_node(port, str) |> decompress 43 | end) 44 | end 45 | 46 | test "decompress/1 should be able to decompress multi-byte utf8 from the reference implementation's compressToUint8Array/1", 47 | %{port: port} do 48 | str = "今日は 今日は 今日は 今日は 今日は 今日は" 49 | assert str == compress_to_binary_with_node(port, str) |> decompress 50 | end 51 | 52 | test "decompress/1 
should be able to handle utf-16 characters with surrogate pairs", %{ 53 | port: port 54 | } do 55 | str = "abc💔abc" 56 | assert str == compress_to_binary_with_node(port, str) |> decompress 57 | end 58 | 59 | # compress to lz-string pseudo-base64 using node, and decompress using elixir 60 | 61 | test "decompress_base64/1 should be able to decompress random strings from the reference implementation's compressToBase64/1", 62 | %{port: port} do 63 | Enum.each(1..2000, fn _ -> 64 | str = 65 | 1000 66 | |> :crypto.strong_rand_bytes() 67 | |> Base.encode16() 68 | 69 | assert str == compress_to_base64_with_node(port, str) |> decompress_base64 70 | end) 71 | end 72 | 73 | test "decompress_base64/1 should be able to decompress multi-byte utf8 from the reference implementation's compressToBase64/1", 74 | %{port: port} do 75 | str = "今日は 今日は 今日は 今日は 今日は 今日は" 76 | assert str == compress_to_base64_with_node(port, str) |> decompress_base64 77 | end 78 | 79 | # compress to base64, and decompress using node 80 | 81 | test "the reference implementation's decompressFromBase64/1 should be able to decompress random strings from compress_base64/1", 82 | %{port: port} do 83 | Enum.each(1..2000, fn _ -> 84 | str = 85 | 1000 86 | |> :crypto.strong_rand_bytes() 87 | |> Base.encode16() 88 | 89 | assert str == compress_base64(str) |> decompress_base64_with_node(port) 90 | end) 91 | end 92 | 93 | test "the reference implementation's decompressFromBase64/1 should be able to decompress multi-byte utf8 from compress_base64/1", 94 | %{port: port} do 95 | str = "今日は 今日は 今日は 今日は 今日は 今日は" 96 | assert str == compress_base64(str) |> decompress_base64_with_node(port) 97 | end 98 | end 99 | -------------------------------------------------------------------------------- /lib/lz_string.ex: -------------------------------------------------------------------------------- 1 | defmodule LZString do 2 | use LZString.Base64 3 | 4 | @compress_dict %{ 5 | size_8: 0, 6 | size_16: 1, 7 | eof: 2 8 | } 9 | 10 | @decompress_dict 
Enum.into(@compress_dict, %{}, fn {k, v} -> {v, k} end) 11 | 12 | @size_8 @compress_dict[:size_8] 13 | @size_16 @compress_dict[:size_16] 14 | @eof @compress_dict[:eof] 15 | 16 | @doc ~S""" 17 | Compresses the given String with the lz-string algorithm. 18 | 19 | iex> LZString.compress("hello, i am a 猫") 20 | <<5, 133, 48, 54, 96, 246, 3, 64, 4, 9, 107, 2, 24, 22, 217, 180, 53, 51, 144, 0>> 21 | """ 22 | 23 | @spec compress(String.t()) :: binary 24 | def compress(""), do: "" 25 | 26 | def compress(str) do 27 | output = compress("", str, @compress_dict) |> :erlang.list_to_bitstring() 28 | 29 | # the js implementation incorrectly adds padding when none is needed, so we do too. 30 | padding_bits = 16 - (output |> bit_size |> rem(16)) 31 | # padding_bits = 32 | # case 16 - (output |> bit_size |> rem(16)) do 33 | # 16 -> 0 34 | # n -> n 35 | # end 36 | <> 37 | end 38 | 39 | def compress(w, <> <> rest, dict) do 40 | c = <> 41 | 42 | char_just_added = !Map.has_key?(dict, c) 43 | 44 | dict = 45 | if char_just_added do 46 | Map.put(dict, c, {:first_time, map_size(dict)}) 47 | else 48 | dict 49 | end 50 | 51 | wc = w <> c 52 | 53 | if Map.has_key?(dict, wc) do 54 | compress(wc, rest, dict) 55 | else 56 | {dict, output} = w_output(w, dict, char_just_added) 57 | dict = Map.put(dict, wc, map_size(dict)) 58 | [output | compress(c, rest, dict)] 59 | end 60 | end 61 | 62 | def compress(w, "", dict) do 63 | size = num_bits(map_size(dict) - 1) 64 | {_dict, output} = w_output(w, dict, false) 65 | [output, reverse(<>)] 66 | end 67 | 68 | defp w_output("", dict, _char_just_added), do: {dict, <<>>} 69 | 70 | defp w_output(w, dict, char_just_added) do 71 | case Map.fetch(dict, w) do 72 | {:ok, {:first_time, dict_index}} -> 73 | dict = Map.put(dict, w, dict_index) 74 | marker_size = num_bits(dict_index) 75 | <> = w 76 | 77 | {size_marker, char_size} = 78 | if num_bits(char_val) <= 8 do 79 | {dict[:size_8], 8} 80 | else 81 | {dict[:size_16], 16} 82 | end 83 | 84 | size_marker_bits = reverse(<>) 85 
| char_bits = reverse(<>) 86 | {dict, <>} 87 | 88 | {:ok, dict_index} -> 89 | map_size = map_size(dict) - 1 90 | # a char just being added to the dict may cause us to add an extra bit 91 | # to the dict_index output where one isn't strictly needed yet 92 | map_size = 93 | if char_just_added do 94 | map_size - 1 95 | else 96 | map_size 97 | end 98 | 99 | size = num_bits(map_size) 100 | {dict, reverse(<>)} 101 | end 102 | end 103 | 104 | @doc ~S""" 105 | Decompresses the given binary with the lz-string algorithm. 106 | 107 | iex> LZString.decompress(<<5, 133, 48, 54, 96, 246, 3, 64, 4, 9, 107, 2, 24, 22, 217, 180, 53, 51, 144, 0>>) 108 | "hello, i am a 猫" 109 | """ 110 | 111 | @spec decompress(binary) :: String.t() 112 | def decompress(""), do: "" 113 | 114 | def decompress(str) do 115 | {:char, c, rest, dict} = decode_next_segment(str, @decompress_dict) 116 | decompress(c, rest, dict) |> :erlang.list_to_binary() |> :unicode.characters_to_binary(:utf16) 117 | end 118 | 119 | def decompress(w, str, dict) do 120 | case decode_next_segment(str, dict) do 121 | {:char, c, rest, dict} -> 122 | dict = Map.put(dict, map_size(dict), w <> c) 123 | [w | decompress(c, rest, dict)] 124 | 125 | {:seq, seq, rest} -> 126 | c = 127 | case Map.fetch(dict, seq) do 128 | {:ok, decompressed} -> 129 | decompressed 130 | 131 | :error -> 132 | unless map_size(dict) == seq, do: raise("unknown sequence index #{seq}") 133 | w <> first_utf16(w) 134 | end 135 | 136 | dict = Map.put(dict, map_size(dict), w <> first_utf16(c)) 137 | [w | decompress(c, rest, dict)] 138 | 139 | :eof -> 140 | [w] 141 | end 142 | end 143 | 144 | defp decode_next_segment(str, dict) do 145 | size = dict |> map_size |> num_bits 146 | <> = str 147 | # dict_entry is in LSB format, bring it back to MSB 148 | <> = reverse(<>) 149 | 150 | case dict_entry do 151 | @size_8 -> 152 | <> = rest 153 | <> = reverse(<>) 154 | char = <> 155 | dict = Map.put(dict, map_size(dict), char) 156 | {:char, char, rest, dict} 157 | 158 | 
@size_16 -> 159 | <> = rest 160 | <> = reverse(<>) 161 | char = <> 162 | dict = Map.put(dict, map_size(dict), char) 163 | {:char, char, rest, dict} 164 | 165 | @eof -> 166 | :eof 167 | 168 | index -> 169 | {:seq, index, rest} 170 | end 171 | end 172 | 173 | defp first_utf16(<> <> _) do 174 | <> 175 | end 176 | 177 | defp num_bits(0), do: 1 178 | 179 | defp num_bits(int) do 180 | int 181 | |> :math.log2() 182 | |> trunc 183 | |> Kernel.+(1) 184 | end 185 | 186 | # http://erlang.org/euc/07/papers/1700Gustafsson.pdf 187 | defp reverse(<<>>), do: <<>> 188 | 189 | defp reverse(<>) do 190 | <> 191 | end 192 | 193 | def debug(bitstring) do 194 | for <>, do: bit 195 | end 196 | end 197 | --------------------------------------------------------------------------------