├── LICENSE ├── README.md ├── charsetutil.go ├── charsetutil_test.go ├── go.mod └── go.sum /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016, Yusuke Inuzuka 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 15 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 18 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 20 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 21 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 22 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 23 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## charsetutil - The easiest way to convert character set encodings in Go 2 | 3 | charsetutil provides the easiest way to convert character set encodings in Go. 
4 | 5 | ## Install 6 | 7 | ```bash 8 | go get github.com/yuin/charsetutil 9 | ``` 10 | 11 | ## Utilities 12 | 13 | - `Decode*` : Converts from the specified charset to UTF-8. 14 | - `Encode*` : Converts from UTF-8 to the specified charset. 15 | - `Guess*` : Guesses a character set. 16 | 17 | - `MustDecode*` : Same as `Decode*`, but panics when errors occur 18 | - `MustEncode*` : Same as `Encode*`, but panics when errors occur 19 | 20 | ```go 21 | b, err = EncodeString("こんにちわ", "Windows-31J") 22 | b, err = Encode("こんにちわ", "Windows-31J") 23 | b, err = EncodeBytes([]byte("こんにちわ"), "Windows-31J") 24 | b, err = EncodeReader(strings.NewReader("こんにちわ"), "Windows-31J") 25 | b = MustEncodeString("こんにちわ", "Windows-31J") 26 | b = MustEncode("こんにちわ", "Windows-31J") 27 | b = MustEncodeBytes([]byte("こんにちわ"), "Windows-31J") 28 | b = MustEncodeReader(strings.NewReader("こんにちわ"), "Windows-31J") 29 | 30 | s, err = DecodeString(string(source), "Windows-31J") 31 | s, err = Decode(source, "Windows-31J") 32 | s, err = DecodeBytes(source, "Windows-31J") 33 | s, err = DecodeReader(bytes.NewReader(source), "Windows-31J") 34 | s = MustDecodeString(string(source), "Windows-31J") 35 | s = MustDecode(source, "Windows-31J") 36 | s = MustDecodeBytes(source, "Windows-31J") 37 | s = MustDecodeReader(bytes.NewReader(source), "Windows-31J") 38 | 39 | cs, err := GuessString(string(source)) 40 | cs, err := GuessBytes(source) 41 | cs, err := GuessReader(bytes.NewReader(source)) 42 | cs, err := Guess(source) 43 | ``` 44 | 45 | ## Supported character sets 46 | 47 | See [Encoding spec on WHATWG](https://encoding.spec.whatwg.org/#names-and-labels) 48 | 49 | ## Author 50 | 51 | Yusuke Inuzuka 52 | 53 | ## License 54 | 55 | [BSD License](http://opensource.org/licenses/BSD-2-Clause) 56 | 57 | -------------------------------------------------------------------------------- /charsetutil.go: -------------------------------------------------------------------------------- 1 | package charsetutil 2 | 3 | import ( 4 
| "bytes" 5 | "fmt" 6 | "io" 7 | "io/ioutil" 8 | "strings" 9 | 10 | "github.com/gogs/chardet" 11 | 12 | "golang.org/x/net/html/charset" 13 | "golang.org/x/text/transform" 14 | ) 15 | 16 | func panicIfError(err error) { 17 | if err != nil { 18 | panic(err) 19 | } 20 | } 21 | 22 | // CharsetGuess is a guessd charcter set 23 | type CharsetGuess interface { 24 | // Charset returns a guessed charcter set 25 | Charset() string 26 | 27 | // Language returns a guessed language 28 | Language() string 29 | 30 | // Confidence returns a confidence of this guess 31 | Confidence() int 32 | } 33 | 34 | type charsetGuess struct { 35 | *chardet.Result 36 | } 37 | 38 | func (g *charsetGuess) Charset() string { 39 | return g.Result.Charset 40 | } 41 | 42 | func (g *charsetGuess) Language() string { 43 | return g.Result.Language 44 | } 45 | 46 | func (g *charsetGuess) Confidence() int { 47 | return g.Result.Confidence 48 | } 49 | 50 | // GuessBytes guesses a character set of given bytes 51 | func GuessBytes(s []byte) (CharsetGuess, error) { 52 | detector := chardet.NewTextDetector() 53 | result, err := detector.DetectBest(s) 54 | if err != nil { 55 | return nil, err 56 | } 57 | return &charsetGuess{result}, err 58 | } 59 | 60 | // Guess guesses a character set of given bytes 61 | func Guess(s []byte) (CharsetGuess, error) { 62 | return GuessBytes(s) 63 | } 64 | 65 | // GuessBytes guesses a character set of given Reader 66 | func GuessReader(s io.Reader) (CharsetGuess, error) { 67 | detector := chardet.NewTextDetector() 68 | buf := make([]byte, 128) 69 | if _, err := s.Read(buf); err != nil { 70 | return nil, err 71 | } 72 | result, err := detector.DetectBest(buf) 73 | if err != nil { 74 | return nil, err 75 | } 76 | return &charsetGuess{result}, err 77 | } 78 | 79 | // GuessBytes guesses a character set of given string 80 | func GuessString(s string) (CharsetGuess, error) { 81 | detector := chardet.NewTextDetector() 82 | result, err := detector.DetectBest([]byte(s)) 83 | if err != nil 
{ 84 | return nil, err 85 | } 86 | return &charsetGuess{result}, err 87 | } 88 | 89 | // DecodeReader converts given Reader to a UTF-8 string 90 | func DecodeReader(s io.Reader, enc string) (string, error) { 91 | reader, err := charset.NewReaderLabel(enc, s) 92 | if err != nil { 93 | return "", err 94 | } 95 | bytes, err := ioutil.ReadAll(reader) 96 | if err != nil { 97 | return "", err 98 | } 99 | return string(bytes), nil 100 | } 101 | 102 | // MustDecodeReader converts given Reader to a UTF-8 string and panics if errros occur. 103 | func MustDecodeReader(s io.Reader, enc string) string { 104 | ret, err := DecodeReader(s, enc) 105 | panicIfError(err) 106 | return ret 107 | } 108 | 109 | // DecodeBytes converts given bytes to a UTF-8 string 110 | func DecodeBytes(s []byte, enc string) (string, error) { 111 | return DecodeReader(bytes.NewReader(s), enc) 112 | } 113 | 114 | // MustDecodeBytes converts given bytes to a UTF-8 string and panics if errros occur. 115 | func MustDecodeBytes(s []byte, enc string) string { 116 | ret, err := DecodeReader(bytes.NewReader(s), enc) 117 | panicIfError(err) 118 | return ret 119 | } 120 | 121 | // DecodeString converts given string to a UTF-8 string 122 | func DecodeString(s, enc string) (string, error) { 123 | return DecodeReader(strings.NewReader(s), enc) 124 | } 125 | 126 | // MustDecodeString converts given string to a UTF-8 string and panics if errros occur. 127 | func MustDecodeString(s, enc string) string { 128 | ret, err := DecodeReader(strings.NewReader(s), enc) 129 | panicIfError(err) 130 | return ret 131 | } 132 | 133 | // DecodeBytes converts given bytes to a UTF-8 string 134 | func Decode(s []byte, enc string) (string, error) { 135 | return DecodeReader(bytes.NewReader(s), enc) 136 | } 137 | 138 | // MustDecodeBytes converts given bytes to a UTF-8 string and panics if errros occur. 
139 | func MustDecode(s []byte, enc string) string { 140 | ret, err := DecodeReader(bytes.NewReader(s), enc) 141 | panicIfError(err) 142 | return ret 143 | } 144 | 145 | // EncodeReader converts a Reader to bytes encoded with given encoding 146 | func EncodeReader(s io.Reader, enc string) ([]byte, error) { 147 | e, _ := charset.Lookup(enc) 148 | if e == nil { 149 | return nil, fmt.Errorf("unsupported charset: %q", enc) 150 | } 151 | var buf bytes.Buffer 152 | writer := transform.NewWriter(&buf, e.NewEncoder()) 153 | _, err := io.Copy(writer, s) 154 | if err != nil { 155 | return nil, err 156 | } 157 | return buf.Bytes(), nil 158 | } 159 | 160 | // MustEncodeReader converts a Reader to bytes encoded with given encoding and panics if errors occur 161 | func MustEncodeReader(s io.Reader, enc string) []byte { 162 | ret, err := EncodeReader(s, enc) 163 | panicIfError(err) 164 | return ret 165 | } 166 | 167 | // EncodeBytes converts bytes to bytes encoded with given encoding 168 | func EncodeBytes(s []byte, enc string) ([]byte, error) { 169 | return EncodeReader(bytes.NewReader(s), enc) 170 | } 171 | 172 | // MustEncodeBytes converts a bytes to bytes encoded with given encoding and panics if errors occur 173 | func MustEncodeBytes(s []byte, enc string) []byte { 174 | ret, err := EncodeReader(bytes.NewReader(s), enc) 175 | panicIfError(err) 176 | return ret 177 | } 178 | 179 | // EncodeString converts a string to bytes encoded with given encoding 180 | func EncodeString(s, enc string) ([]byte, error) { 181 | return EncodeReader(strings.NewReader(s), enc) 182 | } 183 | 184 | // MustEncodeString converts a bytes to bytes encoded with given encoding and panics if errors occur 185 | func MustEncodeString(s, enc string) []byte { 186 | ret, err := EncodeReader(strings.NewReader(s), enc) 187 | panicIfError(err) 188 | return ret 189 | } 190 | 191 | // Encode converts a string to bytes encoded with given encoding 192 | func Encode(s string, enc string) ([]byte, error) { 193 | 
return EncodeReader(strings.NewReader(s), enc) 194 | } 195 | 196 | // MustEncode converts a bytes to bytes encoded with given encoding and panics if errors occur 197 | func MustEncode(s string, enc string) []byte { 198 | ret, err := EncodeReader(strings.NewReader(s), enc) 199 | panicIfError(err) 200 | return ret 201 | 202 | } 203 | -------------------------------------------------------------------------------- /charsetutil_test.go: -------------------------------------------------------------------------------- 1 | package charsetutil 2 | 3 | import ( 4 | "bytes" 5 | "strings" 6 | "testing" 7 | ) 8 | 9 | func TestEncodeOk(t *testing.T) { 10 | expected := []byte{'\x82', '\xb1', '\x82', '\xf1', '\x82', '\xc9', '\x82', '\xbf', '\x82', '\xed'} 11 | assert := func(b []byte, err error) { 12 | if err != nil { 13 | t.Errorf("Failed: %s", err.Error()) 14 | } 15 | if string(b) != string(expected) { 16 | t.Error("Failed") 17 | } 18 | } 19 | 20 | b, err := EncodeString("こんにちわ", "Windows-31J") 21 | assert(b, err) 22 | 23 | b, err = EncodeBytes([]byte("こんにちわ"), "Windows-31J") 24 | assert(b, err) 25 | 26 | b, err = Encode("こんにちわ", "Windows-31J") 27 | assert(b, err) 28 | 29 | b, err = EncodeReader(strings.NewReader("こんにちわ"), "Windows-31J") 30 | assert(b, err) 31 | 32 | b = MustEncodeString("こんにちわ", "Windows-31J") 33 | assert(b, nil) 34 | 35 | b = MustEncodeBytes([]byte("こんにちわ"), "Windows-31J") 36 | assert(b, nil) 37 | 38 | b = MustEncode("こんにちわ", "Windows-31J") 39 | assert(b, nil) 40 | 41 | b = MustEncodeReader(strings.NewReader("こんにちわ"), "Windows-31J") 42 | assert(b, nil) 43 | } 44 | 45 | func TestEncodeError(t *testing.T) { 46 | assert := func(b []byte, err error) { 47 | if b != nil || err == nil { 48 | t.Error("Failed") 49 | } 50 | } 51 | 52 | assertPanic := func(f func() []byte) { 53 | defer func() { 54 | if recover() == nil { 55 | t.Error("Should be failed") 56 | } 57 | }() 58 | b := f() 59 | if b != nil { 60 | t.Error("Failed") 61 | } 62 | } 63 | 64 | b, err := 
EncodeString("こんにちわ", "unknown") 65 | assert(b, err) 66 | 67 | b, err = EncodeBytes([]byte("こんにちわ"), "unknown") 68 | assert(b, err) 69 | 70 | b, err = Encode("こんにちわ", "unknown") 71 | assert(b, err) 72 | 73 | b, err = EncodeReader(strings.NewReader("こんにちわ"), "unknown") 74 | assert(b, err) 75 | 76 | assertPanic(func() []byte { return MustEncodeString("こんにちわ", "unknown") }) 77 | 78 | assertPanic(func() []byte { return MustEncodeBytes([]byte("こんにちわ"), "unknown") }) 79 | 80 | assertPanic(func() []byte { return MustEncode("こんにちわ", "unknown") }) 81 | 82 | assertPanic(func() []byte { return MustEncodeReader(strings.NewReader("こんにちわ"), "unknown") }) 83 | } 84 | 85 | func TestDecodeOk(t *testing.T) { 86 | source := []byte{'\x82', '\xb1', '\x82', '\xf1', '\x82', '\xc9', '\x82', '\xbf', '\x82', '\xed'} 87 | expected := "こんにちわ" 88 | 89 | assert := func(b string, err error) { 90 | if err != nil { 91 | t.Errorf("Failed: %s", err.Error()) 92 | } 93 | if b != expected { 94 | t.Error("Failed") 95 | } 96 | } 97 | 98 | b, err := DecodeString(string(source), "Windows-31J") 99 | assert(b, err) 100 | 101 | b, err = DecodeBytes(source, "Windows-31J") 102 | assert(b, err) 103 | 104 | b, err = Decode(source, "Windows-31J") 105 | assert(b, err) 106 | 107 | b, err = DecodeReader(bytes.NewReader(source), "Windows-31J") 108 | assert(b, err) 109 | 110 | b = MustDecodeString(string(source), "Windows-31J") 111 | assert(b, nil) 112 | 113 | b = MustDecodeBytes(source, "Windows-31J") 114 | assert(b, nil) 115 | 116 | b = MustDecode(source, "Windows-31J") 117 | assert(b, nil) 118 | 119 | b = MustDecodeReader(bytes.NewReader(source), "Windows-31J") 120 | assert(b, nil) 121 | } 122 | 123 | func TestDecodeError(t *testing.T) { 124 | source := []byte{'\x82', '\xb1', '\x82', '\xf1', '\x82', '\xc9', '\x82', '\xbf', '\x82', '\xed'} 125 | assert := func(s string, err error) { 126 | if s != "" || err == nil { 127 | t.Error("Failed") 128 | } 129 | } 130 | 131 | assertPanic := func(f func() string) { 132 | defer 
func() { 133 | if recover() == nil { 134 | t.Error("Should be failed") 135 | } 136 | }() 137 | s := f() 138 | if s != "" { 139 | t.Error("Failed") 140 | } 141 | } 142 | 143 | b, err := DecodeString(string(source), "unknown") 144 | assert(b, err) 145 | 146 | b, err = DecodeBytes(source, "unknown") 147 | assert(b, err) 148 | 149 | b, err = Decode(source, "unknown") 150 | assert(b, err) 151 | 152 | b, err = DecodeReader(bytes.NewReader(source), "unknown") 153 | assert(b, err) 154 | 155 | assertPanic(func() string { return MustDecodeString(string(source), "unknown") }) 156 | 157 | assertPanic(func() string { return MustDecodeBytes(source, "unknown") }) 158 | 159 | assertPanic(func() string { return MustDecode(source, "unknown") }) 160 | 161 | assertPanic(func() string { return MustDecodeReader(bytes.NewReader(source), "unknown") }) 162 | } 163 | 164 | func TestGuess(t *testing.T) { 165 | sourceEuc := []byte{'\xa4', '\xa2', '\xa4', '\xa4', '\xa4', '\xa6', '\xa4', '\xa8', '\xa4', '\xaa', '\x0d', '\x0a', '\xa5', '\xbd', '\xc7', '\xbd', '\x0d', '\x0a', '\x74', '\x65', '\x73', '\x74', '\x0d', '\x0a', '\x8e', '\xb6', '\x8e', '\xb7', '\x8e', '\xb8', '\x8e', '\xb9', '\x8e', '\xba'} 166 | // sourceSjis := []byte{'\x82', '\xa0', '\x82', '\xa2', '\x82', '\xa4', '\x82', '\xa6', '\x82', '\xa8', '\x0d', '\x0a', '\x83', '\x5c', '\x94', '\x5c', '\x0d', '\x0a', '\x74', '\x65', '\x73', '\x74', '\x0d', '\x0a', '\xb6', '\xb7', '\xb8', '\xb9', '\xba'} 167 | 168 | assert := func(r CharsetGuess, charset, language string, err error) { 169 | if err != nil { 170 | t.Errorf("Failed:%+v", err) 171 | } 172 | if r.Charset() != charset { 173 | t.Errorf("'%s' expected, but got '%s'", charset, r.Charset()) 174 | } 175 | if r.Language() != language { 176 | t.Errorf("'%s' expected, but got '%s'", language, r.Language()) 177 | } 178 | } 179 | 180 | result, err := Guess(sourceEuc) 181 | assert(result, "EUC-JP", "ja", err) 182 | 183 | result, err = GuessBytes(sourceEuc) 184 | assert(result, "EUC-JP", "ja", 
err) 185 | 186 | result, err = GuessReader(bytes.NewReader(sourceEuc)) 187 | assert(result, "EUC-JP", "ja", err) 188 | 189 | result, err = GuessString("ああイイ”haa") 190 | assert(result, "UTF-8", "", err) 191 | 192 | } 193 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/yuin/charsetutil 2 | 3 | require ( 4 | github.com/gogs/chardet v0.0.0-20150115103509-2404f7772561 5 | github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca // indirect 6 | golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e 7 | golang.org/x/text v0.3.0 8 | ) 9 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/gogs/chardet v0.0.0-20150115103509-2404f7772561 h1:aBzukfDxQlCTVS0NBUjI5YA3iVeaZ9Tb5PxNrrIP1xs= 2 | github.com/gogs/chardet v0.0.0-20150115103509-2404f7772561/go.mod h1:Pcatq5tYkCW2Q6yrR2VRHlbHpZ/R4/7qyL1TCF7vl14= 3 | github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca h1:NugYot0LIVPxTvN8n+Kvkn6TrbMyxQiuvKdEwFdR9vI= 4 | github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU= 5 | golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e h1:bRhVy7zSSasaqNksaRZiA5EEI+Ei4I1nO5Jh72wfHlg= 6 | golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= 7 | golang.org/x/text v0.3.0 h1:g61tztE5qeGQ89tm6NTjjM9VPIm088od1l6aSorWRWg= 8 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= 9 | --------------------------------------------------------------------------------