├── .github └── workflows │ ├── platforms.yml │ ├── strawberry.yml │ └── versions.yml ├── .gitignore ├── AUTHORS ├── Byte ├── Byte.pm └── Makefile.PL ├── CN ├── CN.pm └── Makefile.PL ├── Changes ├── EBCDIC ├── EBCDIC.pm └── Makefile.PL ├── Encode.pm ├── Encode.xs ├── Encode ├── Changes.e2x ├── ConfigLocal_PM.e2x ├── Makefile_PL.e2x ├── README.e2x ├── _PM.e2x ├── _T.e2x └── encode.h ├── JP ├── JP.pm └── Makefile.PL ├── KR ├── KR.pm └── Makefile.PL ├── MANIFEST ├── Makefile.PL ├── README ├── README.md ├── Symbol ├── Makefile.PL └── Symbol.pm ├── TW ├── Makefile.PL └── TW.pm ├── Unicode ├── Makefile.PL ├── Unicode.pm └── Unicode.xs ├── bin ├── enc2xs ├── encguess ├── piconv ├── ucm2table ├── ucmlint ├── ucmsort └── unidump ├── encengine.c ├── encoding.pm ├── lib └── Encode │ ├── Alias.pm │ ├── CJKConstants.pm │ ├── CN │ └── HZ.pm │ ├── Config.pm │ ├── Encoder.pm │ ├── Encoding.pm │ ├── GSM0338.pm │ ├── Guess.pm │ ├── JP │ ├── H2Z.pm │ └── JIS7.pm │ ├── KR │ └── 2022_KR.pm │ ├── MIME │ ├── Header.pm │ ├── Header │ │ └── ISO_2022_JP.pm │ └── Name.pm │ ├── PerlIO.pod │ ├── Supported.pod │ └── Unicode │ └── UTF7.pm ├── t ├── Aliases.t ├── CJKT.t ├── Encode.t ├── Encoder.t ├── Mod_EUCJP.pm ├── Unicode.t ├── Unicode_trailing_nul.t ├── at-cn.t ├── at-tw.t ├── big5-eten.enc ├── big5-eten.utf ├── big5-hkscs.enc ├── big5-hkscs.utf ├── cow.t ├── decode.t ├── enc_data.t ├── enc_eucjp.t ├── enc_module.enc ├── enc_module.t ├── enc_utf8.t ├── encoding-locale.t ├── encoding.t ├── fallback.t ├── from_to.t ├── gb2312.enc ├── gb2312.utf ├── grow.t ├── gsm0338.t ├── guess.t ├── isa.t ├── jis7-fallback.t ├── jisx0201.enc ├── jisx0201.utf ├── jisx0208.enc ├── jisx0208.utf ├── jisx0212.enc ├── jisx0212.utf ├── jperl.t ├── ksc5601.enc ├── ksc5601.utf ├── magic.t ├── mime-header.t ├── mime-name.t ├── mime_header_iso2022jp.t ├── perlio.t ├── piconv.t ├── rt.pl ├── rt113164.t ├── rt65541.t ├── rt76824.t ├── rt85489.t ├── rt86327.t ├── taint.t ├── truncated_utf8.t ├── undef.t ├── unibench.pl ├── use-Encode-Alias.t ├── utf32warnings.t ├── utf8ref.t ├── utf8strict.t ├── utf8warnings.t ├── whatwg-aliases.json ├── whatwg-aliases.t └── xml.t └── ucm ├── 8859-1.ucm ├── 8859-10.ucm ├── 8859-11.ucm ├── 8859-13.ucm ├── 8859-14.ucm ├── 8859-15.ucm ├── 8859-16.ucm ├── 8859-2.ucm ├── 8859-3.ucm ├── 8859-4.ucm ├── 8859-5.ucm ├── 8859-6.ucm ├── 8859-7.ucm ├── 8859-8.ucm ├── 8859-9.ucm ├── adobeStdenc.ucm ├── adobeSymbol.ucm ├── adobeZdingbat.ucm ├── ascii.ucm ├── big5-eten.ucm ├── big5-hkscs.ucm ├── cp037.ucm ├── cp1006.ucm ├── cp1026.ucm ├── cp1047.ucm ├── cp1250.ucm ├── cp1251.ucm ├── cp1252.ucm ├── cp1253.ucm ├── cp1254.ucm ├── cp1255.ucm ├── cp1256.ucm ├── cp1257.ucm ├── cp1258.ucm ├── cp424.ucm ├── cp437.ucm ├── cp500.ucm ├── cp737.ucm ├── cp775.ucm ├── cp850.ucm ├── cp852.ucm ├── cp855.ucm ├── cp856.ucm ├── cp857.ucm ├── cp858.ucm ├── cp860.ucm ├── cp861.ucm ├── cp862.ucm ├── cp863.ucm ├── cp864.ucm ├── cp865.ucm ├── cp866.ucm ├── cp869.ucm ├── cp874.ucm ├── cp875.ucm ├── cp932.ucm ├── cp936.ucm ├── cp949.ucm ├── cp950.ucm ├── ctrl.ucm ├── dingbats.ucm ├── euc-cn.ucm ├── euc-jp.ucm ├── euc-kr.ucm ├── gb12345.ucm ├── gb2312.ucm ├── hp-roman8.ucm ├── ir-165.ucm ├── jis0201.ucm ├── jis0208.ucm ├── jis0212.ucm ├── johab.ucm ├── koi8-f.ucm ├── koi8-r.ucm ├── koi8-u.ucm ├── ksc5601.ucm ├── macArabic.ucm ├── macCentEuro.ucm ├── macChinsimp.ucm ├── macChintrad.ucm ├── macCroatian.ucm ├── macCyrillic.ucm ├── macDingbats.ucm ├── macFarsi.ucm ├── macGreek.ucm ├── macHebrew.ucm ├── macIceland.ucm ├── macJapanese.ucm ├── macKorean.ucm ├── macROMnn.ucm ├── macRUMnn.ucm ├── macRoman.ucm ├── macSami.ucm ├── macSymbol.ucm ├── macThai.ucm ├── macTurkish.ucm ├── macUkraine.ucm ├── nextstep.ucm ├── null.ucm ├── posix-bc.ucm ├── shiftjis.ucm ├── symbol.ucm └── viscii.ucm /.github/workflows/platforms.yml: -------------------------------------------------------------------------------- 1 | name: CI on Perl 5.{40,38,36} on {Linux,macOS} 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | tags: [ '*.*' ] 7 | pull_request: 8 | branches: [ main ] 9 | 10 | jobs: 11 | build: 12 | runs-on: ${{ matrix.os }} 13 | strategy: 14 | matrix: 15 | os: ['ubuntu-latest', 'macos-latest'] 16 | perl: [ '5.40', '5.38', '5.36' ] 17 | name: Perl ${{ matrix.perl }} on ${{ matrix.os }} 18 | steps: 19 | - uses: actions/checkout@v4 20 | - name: Set up perl 21 | uses: shogo82148/actions-setup-perl@v1 22 | with: 23 | perl-version: ${{ matrix.perl }} 24 | - run: perl -V 25 | - run: cpanm --installdeps . 26 | - run: perl Makefile.PL 27 | - run: make test 28 | -------------------------------------------------------------------------------- /.github/workflows/strawberry.yml: -------------------------------------------------------------------------------- 1 | name: CI on Perl 5.{38,36} on Strawberry Perl 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | tags: [ '*.*' ] 7 | 8 | jobs: 9 | build: 10 | runs-on: ${{ matrix.os }} 11 | strategy: 12 | matrix: 13 | os: ['windows-latest'] 14 | perl: [ '5.38', '5.36' ] 15 | name: Perl ${{ matrix.perl }} on ${{ matrix.os }} 16 | steps: 17 | - uses: actions/checkout@v4 18 | - name: Set up perl 19 | uses: shogo82148/actions-setup-perl@v1 20 | with: 21 | perl-version: ${{ matrix.perl }} 22 | distribution: strawberry 23 | - run: perl -V 24 | - run: perl Makefile.PL 25 | - run: gmake test 26 | -------------------------------------------------------------------------------- /.github/workflows/versions.yml: -------------------------------------------------------------------------------- 1 | name: CI on Perl 5.8-34 on Linux 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | tags: [ '*.*' ] 7 | 8 | jobs: 9 | build: 10 | runs-on: ${{ matrix.os }} 11 | strategy: 12 | matrix: 13 | os: 14 | - 'ubuntu-latest' 15 | perl: 16 | - "5.34" 17 | - "5.32" 18 | - "5.30" 19 | - "5.28" 20 | - "5.26" 21 | - "5.24" 22 | - "5.22" 23 | - "5.20" 24 | - "5.18" 25 | - "5.16" 26 | - "5.14" 27 | - "5.12" 28 | - "5.10" 29 | - "5.8" 30 | name: Perl ${{ matrix.perl }} on ${{ matrix.os }} 31 | steps: 32 | - uses: actions/checkout@v4 33 | - name: Set up perl 34 | uses: shogo82148/actions-setup-perl@v1 35 | with: 36 | perl-version: ${{ matrix.perl }} 37 | - run: perl -V 38 | - run: cpanm --installdeps . 39 | - run: perl Makefile.PL 40 | - run: make test 41 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /blib/ 2 | /.build/ 3 | _build/ 4 | cover_db/ 5 | inc/ 6 | Build 7 | !Build/ 8 | Build.bat 9 | .last_cover_stats 10 | /Makefile 11 | /Makefile.old 12 | /MANIFEST.bak 13 | /META.yml 14 | /META.json 15 | /MYMETA.* 16 | nytprof.out 17 | /pm_to_blib 18 | *.o 19 | *.bs 20 | /_eumm/ 21 | Byte/Byte.c 22 | Byte/Byte.xs 23 | Byte/MYMETA.json 24 | Byte/MYMETA.yml 25 | Byte/Makefile 26 | Byte/Makefile.old 27 | Byte/byte_t.c 28 | Byte/byte_t.exh 29 | Byte/byte_t.fnm 30 | Byte/byte_t.h 31 | Byte/pm_to_blib 32 | CN/CN.c 33 | CN/CN.xs 34 | CN/MYMETA.json 35 | CN/MYMETA.yml 36 | CN/Makefile 37 | CN/Makefile.old 38 | CN/cp_00_t.c 39 | CN/cp_00_t.exh 40 | CN/cp_00_t.fnm 41 | CN/cp_00_t.h 42 | CN/eu_01_t.c 43 | CN/eu_01_t.exh 44 | CN/eu_01_t.fnm 45 | CN/eu_01_t.h 46 | CN/gb_02_t.c 47 | CN/gb_02_t.exh 48 | CN/gb_02_t.fnm 49 | CN/gb_02_t.h 50 | CN/gb_03_t.c 51 | CN/gb_03_t.exh 52 | CN/gb_03_t.fnm 53 | CN/gb_03_t.h 54 | CN/ir_04_t.c 55 | CN/ir_04_t.exh 56 | CN/ir_04_t.fnm 57 | CN/ir_04_t.h 58 | CN/ma_05_t.c 59 | CN/ma_05_t.exh 60 | CN/ma_05_t.fnm 61 | CN/ma_05_t.h 62 | CN/pm_to_blib 63 | EBCDIC/EBCDIC.c 64 | EBCDIC/EBCDIC.xs 65 | EBCDIC/MYMETA.json 66 | EBCDIC/MYMETA.yml 67 | EBCDIC/Makefile 68 | EBCDIC/Makefile.old 69 | EBCDIC/ebcdic_t.c 70 | EBCDIC/ebcdic_t.exh 71 | EBCDIC/ebcdic_t.fnm 72 | EBCDIC/ebcdic_t.h 73 | EBCDIC/pm_to_blib 74 | Encode.c 75 | JP/JP.c 76 | JP/JP.xs 77 | JP/MYMETA.json 78 | JP/MYMETA.yml 79 | JP/Makefile 80 | JP/Makefile.old 81 | JP/cp_00_t.c 82 | JP/cp_00_t.exh 83 | JP/cp_00_t.fnm 84 | JP/cp_00_t.h 85 | JP/eu_01_t.c 86 | JP/eu_01_t.exh 87 | JP/eu_01_t.fnm 88 | JP/eu_01_t.h 89 | JP/ji_02_t.c 90 | JP/ji_02_t.exh 91 | JP/ji_02_t.fnm 92 | JP/ji_02_t.h 93 | JP/ji_03_t.c 94 | JP/ji_03_t.exh 95 | JP/ji_03_t.fnm 96 | JP/ji_03_t.h 97 | JP/ji_04_t.c 98 | JP/ji_04_t.exh 99 | JP/ji_04_t.fnm 100 | JP/ji_04_t.h 101 | JP/ma_05_t.c 102 | JP/ma_05_t.exh 103 | JP/ma_05_t.fnm 104 | JP/ma_05_t.h 105 | JP/pm_to_blib 106 | JP/sh_06_t.c 107 | JP/sh_06_t.exh 108 | JP/sh_06_t.fnm 109 | JP/sh_06_t.h 110 | KR/KR.c 111 | KR/KR.xs 112 | KR/MYMETA.json 113 | KR/MYMETA.yml 114 | KR/Makefile 115 | KR/Makefile.old 116 | KR/cp_00_t.c 117 | KR/cp_00_t.exh 118 | KR/cp_00_t.fnm 119 | KR/cp_00_t.h 120 | KR/eu_01_t.c 121 | KR/eu_01_t.exh 122 | KR/eu_01_t.fnm 123 | KR/eu_01_t.h 124 | KR/jo_02_t.c 125 | KR/jo_02_t.exh 126 | KR/jo_02_t.fnm 127 | KR/jo_02_t.h 128 | KR/ks_03_t.c 129 | KR/ks_03_t.exh 130 | KR/ks_03_t.fnm 131 | KR/ks_03_t.h 132 | KR/ma_04_t.c 133 | KR/ma_04_t.exh 134 | KR/ma_04_t.fnm 135 | KR/ma_04_t.h 136 | KR/pm_to_blib 137 | Makefile~ 138 | Symbol/MYMETA.json 139 | Symbol/MYMETA.yml 140 | Symbol/Makefile 141 | Symbol/Makefile.old 142 | Symbol/Symbol.c 143 | Symbol/Symbol.xs 144 | Symbol/pm_to_blib 145 | Symbol/symbol_t.c 146 | Symbol/symbol_t.exh 147 | Symbol/symbol_t.fnm 148 | Symbol/symbol_t.h 149 | TW/MYMETA.json 150 | TW/MYMETA.yml 151 | TW/Makefile 152 | TW/Makefile.old 153 | TW/TW.c 154 | TW/TW.xs 155 | TW/bi_00_t.c 156 | TW/bi_00_t.exh 157 | TW/bi_00_t.fnm 158 | TW/bi_00_t.h 159 | TW/bi_01_t.c 160 | TW/bi_01_t.exh 161 | TW/bi_01_t.fnm 162 | TW/bi_01_t.h 163 | TW/cp_02_t.c 164 | TW/cp_02_t.exh 165 | TW/cp_02_t.fnm 166 | TW/cp_02_t.h 167 | TW/ma_03_t.c 168 | TW/ma_03_t.exh 169 | TW/ma_03_t.fnm 170 | TW/ma_03_t.h 171 | TW/pm_to_blib 172 | Unicode/MYMETA.json 173 | Unicode/MYMETA.yml 174 | Unicode/Makefile 175 | Unicode/Makefile.old 176 | Unicode/Unicode.c 177 | Unicode/pm_to_blib 178 | def_t.c 179 | def_t.exh 180 | def_t.fnm 181 | def_t.h 182 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | # To give due honour to those who have made the Encode module what it 2 | # is today, here are easily-from-changelogs-extractable people and their 3 | # (hopefully) current and preferred email addresses (as of early 2002, 4 | # if known). 5 | # 6 | # The use of this database for anything else than Encode and/or Perl 7 | # development is strictly forbidden. (Passive distribution with the Perl 8 | # source code kit or CPAN is, of course, allowed.) 9 | # 10 | # This list is in alphabetical order. 11 | -- 12 | Alex Davies 13 | Andreas J. Koenig 14 | Anton Tagunov 15 | Autrijus Tang 16 | Benjamin Goldberg 17 | Bjoern Hoehrmann 18 | Bjoern Jacke 19 | Chris Nandor 20 | Craig A. Berry 21 | Curtis Jewell 22 | Dan Kogai 23 | Dave Evans 24 | Deng Liu 25 | Dominic Dunlop 26 | Elizabeth Mattijsen 27 | Gerrit P. Haase 28 | Gisle Aas 29 | Graham Barr 30 | Gurusamy Sarathy 31 | H.Merijn Brand 32 | Hugo van der Sanden 33 | Inaba Hiroto 34 | Jarkko Hietaniemi 35 | Jesse Vincent 36 | Jungshik Shin 37 | KONNO Hiroharu 38 | Laszlo Molnar 39 | MATSUNO Tokuhiro 40 | MORIYAMA Masayuki 41 | Makamaka 42 | Mark-Jason Dominus 43 | Mattia Barbon 44 | Michael G Schwern 45 | Miron Cuperman 46 | Nicholas Clark 47 | Nick Ing-Simmons 48 | Paul Marquess 49 | Peter Prymmer 50 | Philip Newton 51 | Piotr Fusik 52 | Rafael Garcia-Suarez 53 | Robin Barker 54 | SADAHIRO Tomoyuki 55 | SUGAWARA Hajime 56 | SUZUKI Norio 57 | Simon Cozens 58 | Slaven Rezic 59 | Spider Boardman 60 | Steve Hay 61 | Steve Peters 62 | Tatsuhiko Miyagawa 63 | Tels 64 | Tony Cook 65 | Vadim Konovalov 66 | Yitzchak Scott-Thoennes 67 | -------------------------------------------------------------------------------- /Byte/Byte.pm: -------------------------------------------------------------------------------- 1 | package Encode::Byte; 2 | use strict; 3 | use warnings; 4 | use Encode; 5 | our $VERSION = do { my @r = ( q$Revision: 2.4 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; 6 | 7 | use XSLoader; 8 | XSLoader::load( __PACKAGE__, $VERSION ); 9 | 10 | 1; 11 | __END__ 12 | 13 | =head1 NAME 14 | 15 | Encode::Byte - Single Byte Encodings 16 | 17 | =head1 SYNOPSIS 18 | 19 | use Encode qw/encode decode/; 20 | $greek = encode("iso-8859-7", $utf8); # loads Encode::Byte implicitly 21 | $utf8 = decode("iso-8859-7", $greek); # ditto 22 | 23 | =head1 ABSTRACT 24 | 25 | This module implements various single byte encodings. For most cases it uses 26 | \x80-\xff (upper half) to map non-ASCII characters. Encodings 27 | supported are as follows. 28 | 29 | Canonical Alias Description 30 | -------------------------------------------------------------------- 31 | # ISO 8859 series 32 | (iso-8859-1 is in built-in) 33 | iso-8859-2 latin2 [ISO] 34 | iso-8859-3 latin3 [ISO] 35 | iso-8859-4 latin4 [ISO] 36 | iso-8859-5 [ISO] 37 | iso-8859-6 [ISO] 38 | iso-8859-7 [ISO] 39 | iso-8859-8 [ISO] 40 | iso-8859-9 latin5 [ISO] 41 | iso-8859-10 latin6 [ISO] 42 | iso-8859-11 43 | (iso-8859-12 is nonexistent) 44 | iso-8859-13 latin7 [ISO] 45 | iso-8859-14 latin8 [ISO] 46 | iso-8859-15 latin9 [ISO] 47 | iso-8859-16 latin10 [ISO] 48 | 49 | # Cyrillic 50 | koi8-f 51 | koi8-r cp878 [RFC1489] 52 | koi8-u [RFC2319] 53 | 54 | # Vietnamese 55 | viscii 56 | 57 | # all cp* are also available as ibm-*, ms-*, and windows-* 58 | # also see L 59 | 60 | cp424 61 | cp437 62 | cp737 63 | cp775 64 | cp850 65 | cp852 66 | cp855 67 | cp856 68 | cp857 69 | cp860 70 | cp861 71 | cp862 72 | cp863 73 | cp864 74 | cp865 75 | cp866 76 | cp869 77 | cp874 78 | cp1006 79 | cp1250 WinLatin2 80 | cp1251 WinCyrillic 81 | cp1252 WinLatin1 82 | cp1253 WinGreek 83 | cp1254 WinTurkish 84 | cp1255 WinHebrew 85 | cp1256 WinArabic 86 | cp1257 WinBaltic 87 | cp1258 WinVietnamese 88 | 89 | # Macintosh 90 | # Also see L 91 | MacArabic 92 | MacCentralEurRoman 93 | MacCroatian 94 | MacCyrillic 95 | MacFarsi 96 | MacGreek 97 | MacHebrew 98 | MacIcelandic 99 | MacRoman 100 | MacRomanian 101 | MacRumanian 102 | MacSami 103 | MacThai 104 | MacTurkish 105 | MacUkrainian 106 | 107 | # More vendor encodings 108 | AdobeStandardEncoding 109 | nextstep 110 | hp-roman8 111 | 112 | =head1 DESCRIPTION 113 | 114 | To find how to use this module in detail, see L. 115 | 116 | =head1 SEE ALSO 117 | 118 | L 119 | 120 | =cut 121 | -------------------------------------------------------------------------------- /Byte/Makefile.PL: -------------------------------------------------------------------------------- 1 | use 5.7.2; 2 | use strict; 3 | use ExtUtils::MakeMaker; 4 | use File::Spec::Functions; 5 | 6 | my $name = 'Byte'; 7 | my %tables = ( 8 | byte_t => 9 | [ 10 | # misc. vendors 11 | # 'gsm0338.ucm', now in Encode::GSM0338 12 | 'nextstep.ucm', 13 | 'hp-roman8.ucm', 14 | 'viscii.ucm', 15 | 'adobeStdenc.ucm', 16 | # koi8 17 | 'koi8-f.ucm', 'koi8-r.ucm', 'koi8-u.ucm', 18 | # Mac 19 | qw( 20 | macArabic.ucm 21 | macCentEuro.ucm 22 | macCroatian.ucm 23 | macCyrillic.ucm 24 | macFarsi.ucm 25 | macGreek.ucm 26 | macHebrew.ucm 27 | macIceland.ucm 28 | macRoman.ucm 29 | macROMnn.ucm 30 | macRUMnn.ucm 31 | macSami.ucm 32 | macThai.ucm 33 | macTurkish.ucm 34 | macUkraine.ucm 35 | ), 36 | ], 37 | ); 38 | 39 | my %not_here = 40 | map {$_ => 1} 41 | ( 42 | '8859-1.ucm', 'cp1252.ucm', # default 43 | qw(cp037.ucm cp1026.ucm cp1047.ucm cp500.ucm cp875.ucm), # EBCDIC 44 | qw(cp932.ucm cp936.ucm cp949.ucm cp950.ucm), # CJK 45 | ); 46 | 47 | opendir(ENC,catdir(updir(),'ucm')) or die $!; 48 | while (defined(my $file = readdir(ENC))) 49 | { 50 | $file =~ /^(8859|cp).*\.ucm$/io or next; 51 | $not_here{$file} and next; 52 | push(@{$tables{byte_t}},$file); 53 | } 54 | closedir(ENC); 55 | 56 | WriteMakefile( 57 | INC => "-I../Encode", 58 | NAME => 'Encode::'.$name, 59 | VERSION_FROM => "$name.pm", 60 | OBJECT => '$(O_FILES)', 61 | 'dist' => { 62 | COMPRESS => 'gzip -9f', 63 | SUFFIX => 'gz', 64 | DIST_DEFAULT => 'all tardist', 65 | }, 66 | MAN3PODS => {}, 67 | # OS 390 winges about line numbers > 64K ??? 68 | XSOPT => '-nolinenumbers', 69 | ); 70 | 71 | package MY; 72 | 73 | sub post_initialize 74 | { 75 | my ($self) = @_; 76 | my %o; 77 | my $x = $self->{'OBJ_EXT'}; 78 | # Add the table O_FILES 79 | foreach my $e (keys %tables) 80 | { 81 | $o{$e.$x} = 1; 82 | } 83 | $o{"$name$x"} = 1; 84 | $self->{'O_FILES'} = [sort keys %o]; 85 | my @files = ("$name.xs"); 86 | $self->{'C'} = ["$name.c"]; 87 | $self->{SOURCE} .= " $name.c" 88 | if $^O eq 'MacOS' && $self->{SOURCE} !~ /\b$name\.c\b/; 89 | $self->{'H'} = [$self->catfile($self->updir,'Encode', 'encode.h')]; 90 | my %xs; 91 | foreach my $table (sort keys %tables) { 92 | push (@{$self->{'C'}},"$table.c"); 93 | # Do NOT add $table.h etc. to H_FILES unless we own up as to how they 94 | # get built. 95 | foreach my $ext (qw($(OBJ_EXT) .c .h .exh .fnm)) { 96 | push (@files,$table.$ext); 97 | } 98 | $self->{SOURCE} .= " $table.c" 99 | if $^O eq 'MacOS' && $self->{SOURCE} !~ /\b$table\.c\b/; 100 | } 101 | $self->{'XS'} = { "$name.xs" => "$name.c" }; 102 | $self->{'clean'}{'FILES'} .= join(' ',@files); 103 | open(XS,">$name.xs") || die "Cannot open $name.xs:$!"; 104 | print XS <<'END'; 105 | #define PERL_NO_GET_CONTEXT 106 | #include 107 | #include 108 | #include 109 | #include "encode.h" 110 | END 111 | foreach my $table (sort keys %tables) { 112 | print XS qq[#include "${table}.h"\n]; 113 | } 114 | print XS <<"END"; 115 | 116 | static void 117 | Encode_XSEncoding(pTHX_ encode_t *enc) 118 | { 119 | dSP; 120 | HV *stash = gv_stashpv("Encode::XS", TRUE); 121 | SV *iv = newSViv(PTR2IV(enc)); 122 | SV *sv = sv_bless(newRV_noinc(iv),stash); 123 | int i = 0; 124 | /* with the SvLEN() == 0 hack, PVX won't be freed. We cast away name's 125 | constness, in the hope that perl won't mess with it. */ 126 | assert(SvTYPE(iv) >= SVt_PV); assert(SvLEN(iv) == 0); 127 | SvFLAGS(iv) |= SVp_POK; 128 | SvPVX(iv) = (char*) enc->name[0]; 129 | PUSHMARK(sp); 130 | XPUSHs(sv); 131 | while (enc->name[i]) 132 | { 133 | const char *name = enc->name[i++]; 134 | XPUSHs(sv_2mortal(newSVpvn(name,strlen(name)))); 135 | } 136 | PUTBACK; 137 | call_pv("Encode::define_encoding",G_DISCARD); 138 | SvREFCNT_dec(sv); 139 | } 140 | 141 | MODULE = Encode::$name PACKAGE = Encode::$name 142 | PROTOTYPES: DISABLE 143 | BOOT: 144 | { 145 | END 146 | foreach my $table (sort keys %tables) { 147 | print XS qq[#include "${table}.exh"\n]; 148 | } 149 | print XS "}\n"; 150 | close(XS); 151 | return "# Built $name.xs\n\n"; 152 | } 153 | 154 | sub postamble 155 | { 156 | my $self = shift; 157 | my $dir = $self->catdir($self->updir,'ucm'); 158 | my $str = "# $name\$(OBJ_EXT) depends on .h and .exh files not .c files - but all written by enc2xs\n"; 159 | $str .= "$name.c : $name.xs "; 160 | foreach my $table (sort keys %tables) 161 | { 162 | $str .= " $table.c"; 163 | } 164 | $str .= "\n\n"; 165 | $str .= "$name\$(OBJ_EXT) : $name.c\n\n"; 166 | 167 | my $enc2xs = $self->catfile($self->updir,'bin', 'enc2xs'); 168 | foreach my $table (sort keys %tables) 169 | { 170 | my $numlines = 1; 171 | my $lengthsofar = length($str); 172 | my $continuator = ''; 173 | $str .= "$table.c : $enc2xs Makefile.PL"; 174 | foreach my $file (sort (@{$tables{$table}})) 175 | { 176 | $str .= $continuator.' '.$self->catfile($dir,$file); 177 | if ( length($str)-$lengthsofar > 128*$numlines ) 178 | { 179 | $continuator .= " \\\n\t"; 180 | $numlines++; 181 | } else { 182 | $continuator = ''; 183 | } 184 | } 185 | my $plib = $self->{PERL_CORE} ? '"-I$(PERL_LIB)"' : ''; 186 | $plib .= " -MCross=$::Cross::platform" if defined $::Cross::platform; 187 | my $ucopts = '-"Q" -"O"'; 188 | $str .= 189 | qq{\n\t\$(PERL) $plib $enc2xs $ucopts -o \$\@ -f $table.fnm\n\n}; 190 | open (FILELIST, ">$table.fnm") 191 | || die "Could not open $table.fnm: $!"; 192 | foreach my $file (sort (@{$tables{$table}})) 193 | { 194 | print FILELIST $self->catfile($dir,$file) . "\n"; 195 | } 196 | close(FILELIST); 197 | } 198 | return $str; 199 | } 200 | 201 | -------------------------------------------------------------------------------- /CN/CN.pm: -------------------------------------------------------------------------------- 1 | package Encode::CN; 2 | BEGIN { 3 | if ( ord("A") == 193 ) { 4 | die "Encode::CN not supported on EBCDIC\n"; 5 | } 6 | } 7 | use strict; 8 | use warnings; 9 | use Encode; 10 | our $VERSION = do { my @r = ( q$Revision: 2.3 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; 11 | use XSLoader; 12 | XSLoader::load( __PACKAGE__, $VERSION ); 13 | 14 | # Relocated from Encode.pm 15 | 16 | use Encode::CN::HZ; 17 | 18 | # use Encode::CN::2022_CN; 19 | 20 | 1; 21 | __END__ 22 | 23 | =head1 NAME 24 | 25 | Encode::CN - China-based Chinese Encodings 26 | 27 | =head1 SYNOPSIS 28 | 29 | use Encode qw/encode decode/; 30 | $euc_cn = encode("euc-cn", $utf8); # loads Encode::CN implicitly 31 | $utf8 = decode("euc-cn", $euc_cn); # ditto 32 | 33 | =head1 DESCRIPTION 34 | 35 | This module implements China-based Chinese charset encodings. 36 | Encodings supported are as follows. 37 | 38 | Canonical Alias Description 39 | -------------------------------------------------------------------- 40 | euc-cn /\beuc.*cn$/i EUC (Extended Unix Character) 41 | /\bcn.*euc$/i 42 | /\bGB[-_ ]?2312(?:\D.*$|$)/i (see below) 43 | gb2312-raw The raw (low-bit) GB2312 character map 44 | gb12345-raw Traditional chinese counterpart to 45 | GB2312 (raw) 46 | iso-ir-165 GB2312 + GB6345 + GB8565 + additions 47 | MacChineseSimp GB2312 + Apple Additions 48 | cp936 Code Page 936, also known as GBK 49 | (Extended GuoBiao) 50 | hz 7-bit escaped GB2312 encoding 51 | -------------------------------------------------------------------- 52 | 53 | To find how to use this module in detail, see L. 54 | 55 | =head1 NOTES 56 | 57 | Due to size concerns, C (an extension to C) is distributed 58 | separately on CPAN, under the name L. That module 59 | also contains extra Taiwan-based encodings. 60 | 61 | =head1 BUGS 62 | 63 | When you see C on mails and web pages, they really 64 | mean C encodings. To fix that, C is aliased to C. 65 | Use C when you really mean it. 66 | 67 | The ASCII region (0x00-0x7f) is preserved for all encodings, even though 68 | this conflicts with mappings by the Unicode Consortium. 69 | 70 | =head1 SEE ALSO 71 | 72 | L 73 | 74 | =cut 75 | -------------------------------------------------------------------------------- /CN/Makefile.PL: -------------------------------------------------------------------------------- 1 | use 5.7.2; 2 | use strict; 3 | use ExtUtils::MakeMaker; 4 | use strict; 5 | 6 | my %tables = (euc_cn_t => ['euc-cn.ucm', 7 | 'cp936.ucm', 8 | 'macChinsimp.ucm', 9 | ], 10 | '2312_t' => ['gb2312.ucm'], 11 | '12345_t' => ['gb12345.ucm'], 12 | ir_165_t => ['ir-165.ucm'], 13 | ); 14 | 15 | unless ($ENV{AGGREGATE_TABLES}){ 16 | my @ucm; 17 | for my $k (keys %tables){ 18 | push @ucm, @{$tables{$k}}; 19 | } 20 | %tables = (); 21 | my $seq = 0; 22 | for my $ucm (sort @ucm){ 23 | # 8.3 compliance ! 24 | my $t = sprintf ("%s_%02d_t", substr($ucm, 0, 2), $seq++); 25 | $tables{$t} = [ $ucm ]; 26 | } 27 | } 28 | 29 | my $name = 'CN'; 30 | 31 | WriteMakefile( 32 | INC => "-I../Encode", 33 | NAME => 'Encode::'.$name, 34 | VERSION_FROM => "$name.pm", 35 | OBJECT => '$(O_FILES)', 36 | 'dist' => { 37 | COMPRESS => 'gzip -9f', 38 | SUFFIX => 'gz', 39 | DIST_DEFAULT => 'all tardist', 40 | }, 41 | MAN3PODS => {}, 42 | # OS 390 winges about line numbers > 64K ??? 43 | XSOPT => '-nolinenumbers', 44 | XSPROTOARG => '-noprototypes', 45 | ); 46 | 47 | package MY; 48 | 49 | sub post_initialize 50 | { 51 | my ($self) = @_; 52 | my %o; 53 | my $x = $self->{'OBJ_EXT'}; 54 | # Add the table O_FILES 55 | foreach my $e (keys %tables) 56 | { 57 | $o{$e.$x} = 1; 58 | } 59 | $o{"$name$x"} = 1; 60 | $self->{'O_FILES'} = [sort keys %o]; 61 | my @files = ("$name.xs"); 62 | $self->{'C'} = ["$name.c"]; 63 | $self->{SOURCE} .= " $name.c" 64 | if $^O eq 'MacOS' && $self->{SOURCE} !~ /\b$name\.c\b/; 65 | $self->{'H'} = [$self->catfile($self->updir,'Encode', 'encode.h')]; 66 | my %xs; 67 | foreach my $table (sort keys %tables) { 68 | push (@{$self->{'C'}},"$table.c"); 69 | # Do NOT add $table.h etc. to H_FILES unless we own up as to how they 70 | # get built. 71 | foreach my $ext (qw($(OBJ_EXT) .c .h .exh .fnm)) { 72 | push (@files,$table.$ext); 73 | } 74 | $self->{SOURCE} .= " $table.c" 75 | if $^O eq 'MacOS' && $self->{SOURCE} !~ /\b$table\.c\b/; 76 | } 77 | $self->{'XS'} = { "$name.xs" => "$name.c" }; 78 | $self->{'clean'}{'FILES'} .= join(' ',@files); 79 | open(XS,">$name.xs") || die "Cannot open $name.xs:$!"; 80 | print XS <<'END'; 81 | #define PERL_NO_GET_CONTEXT 82 | #include 83 | #include 84 | #include 85 | #include "encode.h" 86 | END 87 | foreach my $table (sort keys %tables) { 88 | print XS qq[#include "${table}.h"\n]; 89 | } 90 | print XS <<"END"; 91 | 92 | static void 93 | Encode_XSEncoding(pTHX_ encode_t *enc) 94 | { 95 | dSP; 96 | HV *stash = gv_stashpv("Encode::XS", TRUE); 97 | SV *iv = newSViv(PTR2IV(enc)); 98 | SV *sv = sv_bless(newRV_noinc(iv),stash); 99 | int i = 0; 100 | /* with the SvLEN() == 0 hack, PVX won't be freed. We cast away name's 101 | constness, in the hope that perl won't mess with it. */ 102 | assert(SvTYPE(iv) >= SVt_PV); assert(SvLEN(iv) == 0); 103 | SvFLAGS(iv) |= SVp_POK; 104 | SvPVX(iv) = (char*) enc->name[0]; 105 | PUSHMARK(sp); 106 | XPUSHs(sv); 107 | while (enc->name[i]) 108 | { 109 | const char *name = enc->name[i++]; 110 | XPUSHs(sv_2mortal(newSVpvn(name,strlen(name)))); 111 | } 112 | PUTBACK; 113 | call_pv("Encode::define_encoding",G_DISCARD); 114 | SvREFCNT_dec(sv); 115 | } 116 | 117 | MODULE = Encode::$name PACKAGE = Encode::$name 118 | PROTOTYPES: DISABLE 119 | BOOT: 120 | { 121 | END 122 | foreach my $table (sort keys %tables) { 123 | print XS qq[#include "${table}.exh"\n]; 124 | } 125 | print XS "}\n"; 126 | close(XS); 127 | return "# Built $name.xs\n\n"; 128 | } 129 | 130 | sub postamble 131 | { 132 | my $self = shift; 133 | my $dir = $self->catdir($self->updir,'ucm'); 134 | my $str = "# $name\$(OBJ_EXT) depends on .h and .exh files not .c files - but all written by enc2xs\n"; 135 | $str .= "$name.c : $name.xs "; 136 | foreach my $table (sort keys %tables) 137 | { 138 | $str .= " $table.c"; 139 | } 140 | $str .= "\n\n"; 141 | $str .= "$name\$(OBJ_EXT) : $name.c\n\n"; 142 | 143 | my $enc2xs = $self->catfile($self->updir,'bin', 'enc2xs'); 144 | foreach my $table (sort keys %tables) 145 | { 146 | my $numlines = 1; 147 | my $lengthsofar = length($str); 148 | my $continuator = ''; 149 | $str .= "$table.c : $enc2xs Makefile.PL"; 150 | foreach my $file (@{$tables{$table}}) 151 | { 152 | $str .= $continuator.' '.$self->catfile($dir,$file); 153 | if ( length($str)-$lengthsofar > 128*$numlines ) 154 | { 155 | $continuator .= " \\\n\t"; 156 | $numlines++; 157 | } else { 158 | $continuator = ''; 159 | } 160 | } 161 | my $plib = $self->{PERL_CORE} ? '"-I$(PERL_LIB)"' : ''; 162 | $plib .= " -MCross=$::Cross::platform" if defined $::Cross::platform; 163 | my $ucopts = '-"Q"'; 164 | $str .= 165 | qq{\n\t\$(PERL) $plib $enc2xs $ucopts -o \$\@ -f $table.fnm\n\n}; 166 | open (FILELIST, ">$table.fnm") 167 | || die "Could not open $table.fnm: $!"; 168 | foreach my $file (@{$tables{$table}}) 169 | { 170 | print FILELIST $self->catfile($dir,$file) . "\n"; 171 | } 172 | close(FILELIST); 173 | } 174 | return $str; 175 | } 176 | 177 | -------------------------------------------------------------------------------- /EBCDIC/EBCDIC.pm: -------------------------------------------------------------------------------- 1 | package Encode::EBCDIC; 2 | use strict; 3 | use warnings; 4 | use Encode; 5 | our $VERSION = do { my @r = ( q$Revision: 2.2 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; 6 | 7 | use XSLoader; 8 | XSLoader::load( __PACKAGE__, $VERSION ); 9 | 10 | 1; 11 | __END__ 12 | 13 | =head1 NAME 14 | 15 | Encode::EBCDIC - EBCDIC Encodings 16 | 17 | =head1 SYNOPSIS 18 | 19 | use Encode qw/encode decode/; 20 | $posix_bc = encode("posix-bc", $utf8); # loads Encode::EBCDIC implicitly 21 | $utf8 = decode("", $posix_bc); # ditto 22 | 23 | =head1 ABSTRACT 24 | 25 | This module implements various EBCDIC-Based encodings. Encodings 26 | supported are as follows. 27 | 28 | Canonical Alias Description 29 | -------------------------------------------------------------------- 30 | cp37 31 | cp500 32 | cp875 33 | cp1026 34 | cp1047 35 | posix-bc 36 | 37 | =head1 DESCRIPTION 38 | 39 | To find how to use this module in detail, see L. 40 | 41 | =head1 SEE ALSO 42 | 43 | L, L 44 | 45 | =cut 46 | -------------------------------------------------------------------------------- /EBCDIC/Makefile.PL: -------------------------------------------------------------------------------- 1 | use 5.7.2; 2 | use strict; 3 | use ExtUtils::MakeMaker; 4 | 5 | my $name = 'EBCDIC'; 6 | my %tables = ( 7 | ebcdic_t => 8 | ['posix-bc.ucm', 9 | qw(cp037.ucm cp1026.ucm cp1047.ucm cp500.ucm cp875.ucm), 10 | ], 11 | ); 12 | 13 | WriteMakefile( 14 | INC => "-I../Encode", 15 | NAME => 'Encode::'.$name, 16 | VERSION_FROM => "$name.pm", 17 | OBJECT => '$(O_FILES)', 18 | 'dist' => { 19 | COMPRESS => 'gzip -9f', 20 | SUFFIX => 'gz', 21 | DIST_DEFAULT => 'all tardist', 22 | }, 23 | MAN3PODS => {}, 24 | # OS 390 winges about line numbers > 64K ??? 25 | XSOPT => '-nolinenumbers', 26 | ); 27 | 28 | package MY; 29 | 30 | sub post_initialize 31 | { 32 | my ($self) = @_; 33 | my %o; 34 | my $x = $self->{'OBJ_EXT'}; 35 | # Add the table O_FILES 36 | foreach my $e (keys %tables) 37 | { 38 | $o{$e.$x} = 1; 39 | } 40 | $o{"$name$x"} = 1; 41 | $self->{'O_FILES'} = [sort keys %o]; 42 | my @files = ("$name.xs"); 43 | $self->{'C'} = ["$name.c"]; 44 | $self->{SOURCE} .= " $name.c" 45 | if $^O eq 'MacOS' && $self->{SOURCE} !~ /\b$name\.c\b/; 46 | $self->{'H'} = [$self->catfile($self->updir,'Encode', 'encode.h')]; 47 | my %xs; 48 | foreach my $table (sort keys %tables) { 49 | push (@{$self->{'C'}},"$table.c"); 50 | # Do NOT add $table.h etc. to H_FILES unless we own up as to how they 51 | # get built. 52 | foreach my $ext (qw($(OBJ_EXT) .c .h .exh .fnm)) { 53 | push (@files,$table.$ext); 54 | } 55 | $self->{SOURCE} .= " $table.c" 56 | if $^O eq 'MacOS' && $self->{SOURCE} !~ /\b$table\.c\b/; 57 | } 58 | $self->{'XS'} = { "$name.xs" => "$name.c" }; 59 | $self->{'clean'}{'FILES'} .= join(' ',@files); 60 | open(XS,">$name.xs") || die "Cannot open $name.xs:$!"; 61 | print XS <<'END'; 62 | #define PERL_NO_GET_CONTEXT 63 | #include 64 | #include 65 | #include 66 | #include "encode.h" 67 | END 68 | foreach my $table (sort keys %tables) { 69 | print XS qq[#include "${table}.h"\n]; 70 | } 71 | print XS <<"END"; 72 | 73 | static void 74 | Encode_XSEncoding(pTHX_ encode_t *enc) 75 | { 76 | dSP; 77 | HV *stash = gv_stashpv("Encode::XS", TRUE); 78 | SV *iv = newSViv(PTR2IV(enc)); 79 | SV *sv = sv_bless(newRV_noinc(iv),stash); 80 | int i = 0; 81 | /* with the SvLEN() == 0 hack, PVX won't be freed. We cast away name's 82 | constness, in the hope that perl won't mess with it. */ 83 | assert(SvTYPE(iv) >= SVt_PV); assert(SvLEN(iv) == 0); 84 | SvFLAGS(iv) |= SVp_POK; 85 | SvPVX(iv) = (char*) enc->name[0]; 86 | PUSHMARK(sp); 87 | XPUSHs(sv); 88 | while (enc->name[i]) 89 | { 90 | const char *name = enc->name[i++]; 91 | XPUSHs(sv_2mortal(newSVpvn(name,strlen(name)))); 92 | } 93 | PUTBACK; 94 | call_pv("Encode::define_encoding",G_DISCARD); 95 | SvREFCNT_dec(sv); 96 | } 97 | 98 | MODULE = Encode::$name PACKAGE = Encode::$name 99 | PROTOTYPES: DISABLE 100 | BOOT: 101 | { 102 | END 103 | foreach my $table (sort keys %tables) { 104 | print XS qq[#include "${table}.exh"\n]; 105 | } 106 | print XS "}\n"; 107 | close(XS); 108 | return "# Built $name.xs\n\n"; 109 | } 110 | 111 | sub postamble 112 | { 113 | my $self = shift; 114 | my $dir = $self->catdir($self->updir,'ucm'); 115 | my $str = "# $name\$(OBJ_EXT) depends on .h and .exh files not .c files - but all written by enc2xs\n"; 116 | $str .= "$name.c : $name.xs "; 117 | foreach my $table (sort keys %tables) 118 | { 119 | $str .= " $table.c"; 120 | } 121 | $str .= "\n\n"; 122 | $str .= "$name\$(OBJ_EXT) : $name.c\n\n"; 123 | 124 | my $enc2xs = $self->catfile($self->updir,'bin', 'enc2xs'); 125 | foreach my $table (sort keys %tables) 126 | { 127 | my $numlines = 1; 128 | my $lengthsofar = length($str); 129 | my $continuator = ''; 130 | $str .= "$table.c : $enc2xs Makefile.PL"; 131 | foreach my $file (@{$tables{$table}}) 132 | { 133 | $str .= $continuator.' '.$self->catfile($dir,$file); 134 | if ( length($str)-$lengthsofar > 128*$numlines ) 135 | { 136 | $continuator .= " \\\n\t"; 137 | $numlines++; 138 | } else { 139 | $continuator = ''; 140 | } 141 | } 142 | my $plib = $self->{PERL_CORE} ? '"-I$(PERL_LIB)"' : ''; 143 | $plib .= " -MCross=$::Cross::platform" if defined $::Cross::platform; 144 | my $ucopts = '-"Q" -"O"'; 145 | $str .= 146 | qq{\n\t\$(PERL) $plib $enc2xs $ucopts -o \$\@ -f $table.fnm\n\n}; 147 | open (FILELIST, ">$table.fnm") 148 | || die "Could not open $table.fnm: $!"; 149 | foreach my $file (@{$tables{$table}}) 150 | { 151 | print FILELIST $self->catfile($dir,$file) . "\n"; 152 | } 153 | close(FILELIST); 154 | } 155 | return $str; 156 | } 157 | 158 | -------------------------------------------------------------------------------- /Encode/Changes.e2x: -------------------------------------------------------------------------------- 1 | # 2 | # $Id: Changes.e2x,v 2.0 2004/05/16 20:55:15 dankogai Exp $ 3 | # Revision history for Perl extension Encode::$_Name_. 4 | # 5 | 6 | 0.01 $_Now_ 7 | Autogenerated by enc2xs version $_Version_. 8 | -------------------------------------------------------------------------------- /Encode/ConfigLocal_PM.e2x: -------------------------------------------------------------------------------- 1 | # 2 | # Local demand-load module list 3 | # 4 | # You should not edit this file by hand! use "enc2xs -C" 5 | # 6 | package Encode::ConfigLocal; 7 | our $VERSION = $_LocalVer_; 8 | 9 | use strict; 10 | 11 | $_ModLines_ 12 | 13 | 1; 14 | -------------------------------------------------------------------------------- /Encode/Makefile_PL.e2x: -------------------------------------------------------------------------------- 1 | # 2 | # This file is auto-generated by: 3 | # enc2xs version $_Version_ 4 | # $_Now_ 5 | # 6 | use 5.7.2; 7 | use strict; 8 | use ExtUtils::MakeMaker; 9 | use Config; 10 | 11 | # Please edit the following to the taste! 12 | my $name = '$_Name_'; 13 | my %tables = ( 14 | $_Name__t => [ $_TableFiles_ ], 15 | ); 16 | 17 | #### DO NOT EDIT BEYOND THIS POINT! 18 | require File::Spec; 19 | my ($enc2xs, $encode_h) = (); 20 | my @path_ext = (''); 21 | @path_ext = split(';', $ENV{PATHEXT}) if $^O eq 'MSWin32'; 22 | PATHLOOP: 23 | for my $d (@Config{qw/bin sitebin vendorbin/}, 24 | (split /$Config{path_sep}/o, $ENV{PATH})){ 25 | for my $f (qw/enc2xs enc2xs5.7.3/){ 26 | my $path = File::Spec->catfile($d, $f); 27 | for my $ext (@path_ext) { 28 | my $bin = "$path$ext"; 29 | -r "$bin" and $enc2xs = $bin and last PATHLOOP; 30 | } 31 | } 32 | } 33 | $enc2xs or die "enc2xs not found!"; 34 | print "enc2xs is $enc2xs\n"; 35 | my %encode_h = (); 36 | for my $d (@INC){ 37 | my $dir = File::Spec->catfile($d, "Encode"); 38 | my $file = File::Spec->catfile($dir, "encode.h"); 39 | -f $file and $encode_h{$dir} = -M $file; 40 | } 41 | %encode_h or die "encode.h not found!"; 42 | # find the latest one 43 | ($encode_h) = sort {$encode_h{$b} <=> $encode_h{$a}} keys %encode_h; 44 | print "encode.h is at $encode_h\n"; 45 | 46 | WriteMakefile( 47 | INC => "-I$encode_h", 48 | #### END_OF_HEADER -- DO NOT EDIT THIS LINE BY HAND! #### 49 | NAME => 'Encode::'.$name, 50 | VERSION_FROM => "$name.pm", 51 | OBJECT => '$(O_FILES)', 52 | 'dist' => { 53 | COMPRESS => 'gzip -9f', 54 | SUFFIX => 'gz', 55 | DIST_DEFAULT => 'all tardist', 56 | }, 57 | MAN3PODS => {}, 58 | PREREQ_PM => { 59 | 'Encode' => "1.41", 60 | }, 61 | # OS 390 winges about line numbers > 64K ??? 62 | XSOPT => '-nolinenumbers', 63 | ); 64 | 65 | package MY; 66 | 67 | sub post_initialize 68 | { 69 | my ($self) = @_; 70 | my %o; 71 | my $x = $self->{'OBJ_EXT'}; 72 | # Add the table O_FILES 73 | foreach my $e (keys %tables) 74 | { 75 | $o{$e.$x} = 1; 76 | } 77 | $o{"$name$x"} = 1; 78 | $self->{'O_FILES'} = [sort keys %o]; 79 | my @files = ("$name.xs"); 80 | $self->{'C'} = ["$name.c"]; 81 | # The next two lines to make MacPerl Happy -- dankogai via pudge 82 | $self->{SOURCE} .= " $name.c" 83 | if $^O eq 'MacOS' && $self->{SOURCE} !~ /\b$name\.c\b/; 84 | # $self->{'H'} = [$self->catfile($self->updir,'encode.h')]; 85 | my %xs; 86 | foreach my $table (sort keys %tables) { 87 | push (@{$self->{'C'}},"$table.c"); 88 | # Do NOT add $table.h etc. to H_FILES unless we own up as to how they 89 | # get built. 90 | foreach my $ext (qw($(OBJ_EXT) .c .h .exh .fnm)) { 91 | push (@files,$table.$ext); 92 | } 93 | } 94 | $self->{'XS'} = { "$name.xs" => "$name.c" }; 95 | $self->{'clean'}{'FILES'} .= join(' ',@files); 96 | open(XS,">$name.xs") || die "Cannot open $name.xs:$!"; 97 | print XS <<'END'; 98 | #include 99 | #include 100 | #include 101 | #include "encode.h" 102 | END 103 | foreach my $table (sort keys %tables) { 104 | print XS qq[#include "${table}.h"\n]; 105 | } 106 | print XS <<"END"; 107 | 108 | static void 109 | Encode_XSEncoding(pTHX_ encode_t *enc) 110 | { 111 | dSP; 112 | HV *stash = gv_stashpv("Encode::XS", TRUE); 113 | SV *iv = newSViv(PTR2IV(enc)); 114 | SV *sv = sv_bless(newRV_noinc(iv),stash); 115 | int i = 0; 116 | /* with the SvLEN() == 0 hack, PVX won't be freed. We cast away name's 117 | constness, in the hope that perl won't mess with it. */ 118 | assert(SvTYPE(iv) >= SVt_PV); assert(SvLEN(iv) == 0); 119 | SvFLAGS(iv) |= SVp_POK; 120 | SvPVX(iv) = (char*) enc->name[0]; 121 | PUSHMARK(sp); 122 | XPUSHs(sv); 123 | while (enc->name[i]) 124 | { 125 | const char *name = enc->name[i++]; 126 | XPUSHs(sv_2mortal(newSVpvn(name,strlen(name)))); 127 | } 128 | PUTBACK; 129 | call_pv("Encode::define_encoding",G_DISCARD); 130 | SvREFCNT_dec(sv); 131 | } 132 | 133 | MODULE = Encode::$name PACKAGE = Encode::$name 134 | PROTOTYPES: DISABLE 135 | BOOT: 136 | { 137 | END 138 | foreach my $table (sort keys %tables) { 139 | print XS qq[#include "${table}.exh"\n]; 140 | } 141 | print XS "}\n"; 142 | close(XS); 143 | return "# Built $name.xs\n\n"; 144 | } 145 | 146 | sub postamble 147 | { 148 | my $self = shift; 149 | my $dir = "."; # $self->catdir('Encode'); 150 | my $str = "# $name\$(OBJ_EXT) depends on .h and .exh files not .c files - but all written by enc2xs\n"; 151 | $str .= "$name.c : $name.xs "; 152 | foreach my $table (sort keys %tables) 153 | { 154 | $str .= " $table.c"; 155 | } 156 | $str .= "\n\n"; 157 | $str .= "$name\$(OBJ_EXT) : $name.c\n\n"; 158 | 159 | foreach my $table (sort keys %tables) 160 | { 161 | my $numlines = 1; 162 | my $lengthsofar = length($str); 163 | my $continuator = ''; 164 | $str .= "$table.c : Makefile.PL"; 165 | foreach my $file (@{$tables{$table}}) 166 | { 167 | $str .= $continuator.' '.$self->catfile($dir,$file); 168 | if ( length($str)-$lengthsofar > 128*$numlines ) 169 | { 170 | $continuator .= " \\\n\t"; 171 | $numlines++; 172 | } else { 173 | $continuator = ''; 174 | } 175 | } 176 | my $plib = $self->{PERL_CORE} ? '"-I$(PERL_LIB)"' : ''; 177 | my $ucopts = '-"Q"'; 178 | $str .= 179 | qq{\n\t\$(PERL) $plib $enc2xs $ucopts -o \$\@ -f $table.fnm\n\n}; 180 | open (FILELIST, ">$table.fnm") 181 | || die "Could not open $table.fnm: $!"; 182 | foreach my $file (@{$tables{$table}}) 183 | { 184 | print FILELIST $self->catfile($dir,$file) . "\n"; 185 | } 186 | close(FILELIST); 187 | } 188 | return $str; 189 | } 190 | 191 | -------------------------------------------------------------------------------- /Encode/README.e2x: -------------------------------------------------------------------------------- 1 | Encode::$_Name_ version 0.1 2 | ======== 3 | 4 | NAME 5 | Encode::$_Name_ - 6 | 7 | SYNOPSIS 8 | use Encode::$_Name_; 9 | # 10 | ABSTRACT 11 | 12 | INSTALLATION 13 | 14 | To install this module type the following: 15 | 16 | perl Makefile.PL 17 | make 18 | make test 19 | make install 20 | 21 | DEPENDENCIES 22 | 23 | This module requires perl version 5.7.3 or later. 24 | 25 | COPYRIGHT AND LICENCE 26 | 27 | Copyright (C) 2002 Your Name 28 | 29 | This library is free software; you can redistribute it and/or modify 30 | it under the same terms as Perl itself. 31 | 32 | -------------------------------------------------------------------------------- /Encode/_PM.e2x: -------------------------------------------------------------------------------- 1 | package Encode::$_Name_; 2 | our $VERSION = "0.01"; 3 | 4 | use Encode; 5 | use XSLoader; 6 | XSLoader::load(__PACKAGE__,$VERSION); 7 | 8 | 1; 9 | __END__ 10 | 11 | =head1 NAME 12 | 13 | Encode::$_Name_ - New Encoding 14 | 15 | =head1 SYNOPSIS 16 | 17 | You got to fill this in! 18 | 19 | =head1 SEE ALSO 20 | 21 | L 22 | 23 | =cut 24 | -------------------------------------------------------------------------------- /Encode/_T.e2x: -------------------------------------------------------------------------------- 1 | use strict; 2 | # Adjust the number here! 3 | use Test::More tests => 2; 4 | 5 | BEGIN { 6 | use_ok('Encode'); 7 | use_ok('Encode::$_Name_'); 8 | } 9 | # Add more test here! 10 | -------------------------------------------------------------------------------- /JP/JP.pm: -------------------------------------------------------------------------------- 1 | package Encode::JP; 2 | BEGIN { 3 | if ( ord("A") == 193 ) { 4 | die "Encode::JP not supported on EBCDIC\n"; 5 | } 6 | } 7 | use strict; 8 | use warnings; 9 | use Encode; 10 | our $VERSION = do { my @r = ( q$Revision: 2.5 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; 11 | 12 | use XSLoader; 13 | XSLoader::load( __PACKAGE__, $VERSION ); 14 | 15 | use Encode::JP::JIS7; 16 | 17 | 1; 18 | __END__ 19 | 20 | =head1 NAME 21 | 22 | Encode::JP - Japanese Encodings 23 | 24 | =head1 SYNOPSIS 25 | 26 | use Encode qw/encode decode/; 27 | $euc_jp = encode("euc-jp", $utf8); # loads Encode::JP implicitly 28 | $utf8 = decode("euc-jp", $euc_jp); # ditto 29 | 30 | =head1 ABSTRACT 31 | 32 | This module implements Japanese charset encodings. Encodings 33 | supported are as follows. 34 | 35 | Canonical Alias Description 36 | -------------------------------------------------------------------- 37 | euc-jp /\beuc.*jp$/i EUC (Extended Unix Character) 38 | /\bjp.*euc/i 39 | /\bujis$/i 40 | shiftjis /\bshift.*jis$/i Shift JIS (aka MS Kanji) 41 | /\bsjis$/i 42 | 7bit-jis /\bjis$/i 7bit JIS 43 | iso-2022-jp ISO-2022-JP [RFC1468] 44 | = 7bit JIS with all Halfwidth Kana 45 | converted to Fullwidth 46 | iso-2022-jp-1 ISO-2022-JP-1 [RFC2237] 47 | = ISO-2022-JP with JIS X 0212-1990 48 | support. See below 49 | MacJapanese Shift JIS + Apple vendor mappings 50 | cp932 /\bwindows-31j$/i Code Page 932 51 | = Shift JIS + MS/IBM vendor mappings 52 | jis0201-raw JIS0201, raw format 53 | jis0208-raw JIS0208, raw format 54 | jis0212-raw JIS0212, raw format 55 | -------------------------------------------------------------------- 56 | 57 | =head1 DESCRIPTION 58 | 59 | To find out how to use this module in detail, see L. 60 | 61 | =head1 Note on ISO-2022-JP(-1)? 62 | 63 | ISO-2022-JP-1 (RFC2237) is a superset of ISO-2022-JP (RFC1468) which 64 | adds support for JIS X 0212-1990. That means you can use the same 65 | code to decode to utf8 but not vice versa. 66 | 67 | $utf8 = decode('iso-2022-jp-1', $stream); 68 | 69 | and 70 | 71 | $utf8 = decode('iso-2022-jp', $stream); 72 | 73 | yield the same result but 74 | 75 | $with_0212 = encode('iso-2022-jp-1', $utf8); 76 | 77 | is now different from 78 | 79 | $without_0212 = encode('iso-2022-jp', $utf8 ); 80 | 81 | In the latter case, characters that map to 0212 are first converted 82 | to U+3013 (0xA2AE in EUC-JP; a white square also known as 'Tofu' or 83 | 'geta mark') then fed to the decoding engine. U+FFFD is not used, 84 | in order to preserve text layout as much as possible. 85 | 86 | =head1 BUGS 87 | 88 | The ASCII region (0x00-0x7f) is preserved for all encodings, even 89 | though this conflicts with mappings by the Unicode Consortium. 90 | 91 | =head1 SEE ALSO 92 | 93 | L 94 | 95 | =cut 96 | -------------------------------------------------------------------------------- /JP/Makefile.PL: -------------------------------------------------------------------------------- 1 | use 5.7.2; 2 | use strict; 3 | use ExtUtils::MakeMaker; 4 | use strict; 5 | 6 | my %tables = ( 7 | euc_jp_t => ['euc-jp.ucm'], 8 | sjis_t => ['shiftjis.ucm', 9 | 'macJapanese.ucm', 10 | 'cp932.ucm'], 11 | raw_t => [ 12 | qw(jis0201.ucm jis0208.ucm jis0212.ucm) 13 | ], 14 | ); 15 | 16 | unless ($ENV{AGGREGATE_TABLES}){ 17 | my @ucm; 18 | for my $k (keys %tables){ 19 | push @ucm, @{$tables{$k}}; 20 | } 21 | %tables = (); 22 | my $seq = 0; 23 | for my $ucm (sort @ucm){ 24 | # 8.3 compliance ! 25 | my $t = sprintf ("%s_%02d_t", substr($ucm, 0, 2), $seq++); 26 | $tables{$t} = [ $ucm ]; 27 | } 28 | } 29 | 30 | my $name = 'JP'; 31 | 32 | WriteMakefile( 33 | INC => "-I../Encode", 34 | NAME => 'Encode::'.$name, 35 | VERSION_FROM => "$name.pm", 36 | OBJECT => '$(O_FILES)', 37 | 'dist' => { 38 | COMPRESS => 'gzip -9f', 39 | SUFFIX => 'gz', 40 | DIST_DEFAULT => 'all tardist', 41 | }, 42 | MAN3PODS => {}, 43 | # OS 390 winges about line numbers > 64K ??? 44 | XSOPT => '-nolinenumbers', 45 | ); 46 | 47 | package MY; 48 | 49 | sub post_initialize 50 | { 51 | my ($self) = @_; 52 | my %o; 53 | my $x = $self->{'OBJ_EXT'}; 54 | # Add the table O_FILES 55 | foreach my $e (keys %tables) 56 | { 57 | $o{$e.$x} = 1; 58 | } 59 | $o{"$name$x"} = 1; 60 | $self->{'O_FILES'} = [sort keys %o]; 61 | my @files = ("$name.xs"); 62 | $self->{'C'} = ["$name.c"]; 63 | $self->{SOURCE} .= " $name.c" 64 | if $^O eq 'MacOS' && $self->{SOURCE} !~ /\b$name\.c\b/; 65 | $self->{'H'} = [$self->catfile($self->updir,'Encode', 'encode.h')]; 66 | my %xs; 67 | foreach my $table (sort keys %tables) { 68 | push (@{$self->{'C'}},"$table.c"); 69 | # Do NOT add $table.h etc. to H_FILES unless we own up as to how they 70 | # get built. 71 | foreach my $ext (qw($(OBJ_EXT) .c .h .exh .fnm)) { 72 | push (@files,$table.$ext); 73 | } 74 | $self->{SOURCE} .= " $table.c" 75 | if $^O eq 'MacOS' && $self->{SOURCE} !~ /\b$table\.c\b/; 76 | } 77 | $self->{'XS'} = { "$name.xs" => "$name.c" }; 78 | $self->{'clean'}{'FILES'} .= join(' ',@files); 79 | open(XS,">$name.xs") || die "Cannot open $name.xs:$!"; 80 | print XS <<'END'; 81 | #define PERL_NO_GET_CONTEXT 82 | #include 83 | #include 84 | #include 85 | #include "encode.h" 86 | END 87 | foreach my $table (sort keys %tables) { 88 | print XS qq[#include "${table}.h"\n]; 89 | } 90 | print XS <<"END"; 91 | 92 | static void 93 | Encode_XSEncoding(pTHX_ encode_t *enc) 94 | { 95 | dSP; 96 | HV *stash = gv_stashpv("Encode::XS", TRUE); 97 | SV *iv = newSViv(PTR2IV(enc)); 98 | SV *sv = sv_bless(newRV_noinc(iv),stash); 99 | int i = 0; 100 | /* with the SvLEN() == 0 hack, PVX won't be freed. We cast away name's 101 | constness, in the hope that perl won't mess with it. */ 102 | assert(SvTYPE(iv) >= SVt_PV); assert(SvLEN(iv) == 0); 103 | SvFLAGS(iv) |= SVp_POK; 104 | SvPVX(iv) = (char*) enc->name[0]; 105 | PUSHMARK(sp); 106 | XPUSHs(sv); 107 | while (enc->name[i]) 108 | { 109 | const char *name = enc->name[i++]; 110 | XPUSHs(sv_2mortal(newSVpvn(name,strlen(name)))); 111 | } 112 | PUTBACK; 113 | call_pv("Encode::define_encoding",G_DISCARD); 114 | SvREFCNT_dec(sv); 115 | } 116 | 117 | MODULE = Encode::$name PACKAGE = Encode::$name 118 | PROTOTYPES: DISABLE 119 | BOOT: 120 | { 121 | END 122 | foreach my $table (sort keys %tables) { 123 | print XS qq[#include "${table}.exh"\n]; 124 | } 125 | print XS "}\n"; 126 | close(XS); 127 | return "# Built $name.xs\n\n"; 128 | } 129 | 130 | sub postamble 131 | { 132 | my $self = shift; 133 | my $dir = $self->catdir($self->updir,'ucm'); 134 | my $str = "# $name\$(OBJ_EXT) depends on .h and .exh files not .c files - but all written by enc2xs\n"; 135 | $str .= "$name.c : $name.xs "; 136 | foreach my $table (sort keys %tables) 137 | { 138 | $str .= " $table.c"; 139 | } 140 | $str .= "\n\n"; 141 | $str .= "$name\$(OBJ_EXT) : $name.c\n\n"; 142 | 143 | my $enc2xs = $self->catfile($self->updir,'bin', 'enc2xs'); 144 | foreach my $table (sort keys %tables) 145 | { 146 | my $numlines = 1; 147 | my $lengthsofar = length($str); 148 | my $continuator = ''; 149 | $str .= "$table.c : $enc2xs Makefile.PL"; 150 | foreach my $file (@{$tables{$table}}) 151 | { 152 | $str .= $continuator.' '.$self->catfile($dir,$file); 153 | if ( length($str)-$lengthsofar > 128*$numlines ) 154 | { 155 | $continuator .= " \\\n\t"; 156 | $numlines++; 157 | } else { 158 | $continuator = ''; 159 | } 160 | } 161 | my $plib = $self->{PERL_CORE} ? '"-I$(PERL_LIB)"' : ''; 162 | $plib .= " -MCross=$::Cross::platform" if defined $::Cross::platform; 163 | my $ucopts = '-"Q"'; 164 | $str .= 165 | qq{\n\t\$(PERL) $plib $enc2xs $ucopts -o \$\@ -f $table.fnm\n\n}; 166 | open (FILELIST, ">$table.fnm") 167 | || die "Could not open $table.fnm: $!"; 168 | foreach my $file (@{$tables{$table}}) 169 | { 170 | print FILELIST $self->catfile($dir,$file) . "\n"; 171 | } 172 | close(FILELIST); 173 | } 174 | return $str; 175 | } 176 | 177 | -------------------------------------------------------------------------------- /KR/KR.pm: -------------------------------------------------------------------------------- 1 | package Encode::KR; 2 | BEGIN { 3 | if ( ord("A") == 193 ) { 4 | die "Encode::KR not supported on EBCDIC\n"; 5 | } 6 | } 7 | use strict; 8 | use warnings; 9 | use Encode; 10 | our $VERSION = do { my @r = ( q$Revision: 2.3 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; 11 | use XSLoader; 12 | XSLoader::load( __PACKAGE__, $VERSION ); 13 | 14 | use Encode::KR::2022_KR; 15 | 16 | 1; 17 | __END__ 18 | 19 | =head1 NAME 20 | 21 | Encode::KR - Korean Encodings 22 | 23 | =head1 SYNOPSIS 24 | 25 | use Encode qw/encode decode/; 26 | $euc_kr = encode("euc-kr", $utf8); # loads Encode::KR implicitly 27 | $utf8 = decode("euc-kr", $euc_kr); # ditto 28 | 29 | =head1 DESCRIPTION 30 | 31 | This module implements Korean charset encodings. Encodings supported 32 | are as follows. 33 | 34 | 35 | Canonical Alias Description 36 | -------------------------------------------------------------------- 37 | euc-kr /\beuc.*kr$/i EUC (Extended Unix Character) 38 | /\bkr.*euc$/i 39 | ksc5601-raw Korean standard code set (as is) 40 | cp949 /(?:x-)?uhc$/i 41 | /(?:x-)?windows-949$/i 42 | /\bks_c_5601-1987$/i 43 | Code Page 949 (EUC-KR + 8,822 44 | (additional Hangul syllables) 45 | MacKorean EUC-KR + Apple Vendor Mappings 46 | johab JOHAB A supplementary encoding defined in 47 | Annex 3 of KS X 1001:1998 48 | iso-2022-kr iso-2022-kr [RFC1557] 49 | -------------------------------------------------------------------- 50 | 51 | To find how to use this module in detail, see L. 52 | 53 | =head1 BUGS 54 | 55 | When you see C on mails and web pages, they really 56 | mean "cp949" encodings. To fix that, the following aliases are set; 57 | 58 | qr/(?:x-)?uhc$/i => '"cp949"' 59 | qr/(?:x-)?windows-949$/i => '"cp949"' 60 | qr/ks_c_5601-1987$/i => '"cp949"' 61 | 62 | The ASCII region (0x00-0x7f) is preserved for all encodings, even 63 | though this conflicts with mappings by the Unicode Consortium. 64 | 65 | =head1 SEE ALSO 66 | 67 | L 68 | 69 | =cut 70 | -------------------------------------------------------------------------------- /KR/Makefile.PL: -------------------------------------------------------------------------------- 1 | use 5.7.2; 2 | use strict; 3 | use ExtUtils::MakeMaker; 4 | use strict; 5 | 6 | my %tables = (euc_kr_t => ['euc-kr.ucm', 7 | 'macKorean.ucm', 8 | 'cp949.ucm', 9 | ], 10 | '5601_t' => ['ksc5601.ucm'], 11 | johab_t => ['johab.ucm'], 12 | ); 13 | 14 | unless ($ENV{AGGREGATE_TABLES}){ 15 | my @ucm; 16 | for my $k (keys %tables){ 17 | push @ucm, @{$tables{$k}}; 18 | } 19 | %tables = (); 20 | my $seq = 0; 21 | for my $ucm (sort @ucm){ 22 | # 8.3 compliance ! 23 | my $t = sprintf ("%s_%02d_t", substr($ucm, 0, 2), $seq++); 24 | $tables{$t} = [ $ucm ]; 25 | } 26 | } 27 | 28 | my $name = 'KR'; 29 | 30 | WriteMakefile( 31 | INC => "-I../Encode", 32 | NAME => 'Encode::'.$name, 33 | VERSION_FROM => "$name.pm", 34 | OBJECT => '$(O_FILES)', 35 | 'dist' => { 36 | COMPRESS => 'gzip -9f', 37 | SUFFIX => 'gz', 38 | DIST_DEFAULT => 'all tardist', 39 | }, 40 | MAN3PODS => {}, 41 | # OS 390 winges about line numbers > 64K ??? 42 | XSOPT => '-nolinenumbers', 43 | ); 44 | 45 | package MY; 46 | 47 | sub post_initialize 48 | { 49 | my ($self) = @_; 50 | my %o; 51 | my $x = $self->{'OBJ_EXT'}; 52 | # Add the table O_FILES 53 | foreach my $e (keys %tables) 54 | { 55 | $o{$e.$x} = 1; 56 | } 57 | $o{"$name$x"} = 1; 58 | $self->{'O_FILES'} = [sort keys %o]; 59 | my @files = ("$name.xs"); 60 | $self->{'C'} = ["$name.c"]; 61 | $self->{SOURCE} .= " $name.c" 62 | if $^O eq 'MacOS' && $self->{SOURCE} !~ /\b$name\.c\b/; 63 | $self->{'H'} = [$self->catfile($self->updir,'Encode', 'encode.h')]; 64 | my %xs; 65 | foreach my $table (sort keys %tables) { 66 | push (@{$self->{'C'}},"$table.c"); 67 | # Do NOT add $table.h etc. to H_FILES unless we own up as to how they 68 | # get built. 69 | foreach my $ext (qw($(OBJ_EXT) .c .h .exh .fnm)) { 70 | push (@files,$table.$ext); 71 | } 72 | $self->{SOURCE} .= " $table.c" 73 | if $^O eq 'MacOS' && $self->{SOURCE} !~ /\b$table\.c\b/; 74 | } 75 | $self->{'XS'} = { "$name.xs" => "$name.c" }; 76 | $self->{'clean'}{'FILES'} .= join(' ',@files); 77 | open(XS,">$name.xs") || die "Cannot open $name.xs:$!"; 78 | print XS <<'END'; 79 | #define PERL_NO_GET_CONTEXT 80 | #include 81 | #include 82 | #include 83 | #include "encode.h" 84 | END 85 | foreach my $table (sort keys %tables) { 86 | print XS qq[#include "${table}.h"\n]; 87 | } 88 | print XS <<"END"; 89 | 90 | static void 91 | Encode_XSEncoding(pTHX_ encode_t *enc) 92 | { 93 | dSP; 94 | HV *stash = gv_stashpv("Encode::XS", TRUE); 95 | SV *iv = newSViv(PTR2IV(enc)); 96 | SV *sv = sv_bless(newRV_noinc(iv),stash); 97 | int i = 0; 98 | /* with the SvLEN() == 0 hack, PVX won't be freed. We cast away name's 99 | constness, in the hope that perl won't mess with it. */ 100 | assert(SvTYPE(iv) >= SVt_PV); assert(SvLEN(iv) == 0); 101 | SvFLAGS(iv) |= SVp_POK; 102 | SvPVX(iv) = (char*) enc->name[0]; 103 | PUSHMARK(sp); 104 | XPUSHs(sv); 105 | while (enc->name[i]) 106 | { 107 | const char *name = enc->name[i++]; 108 | XPUSHs(sv_2mortal(newSVpvn(name,strlen(name)))); 109 | } 110 | PUTBACK; 111 | call_pv("Encode::define_encoding",G_DISCARD); 112 | SvREFCNT_dec(sv); 113 | } 114 | 115 | MODULE = Encode::$name PACKAGE = Encode::$name 116 | PROTOTYPES: DISABLE 117 | BOOT: 118 | { 119 | END 120 | foreach my $table (sort keys %tables) { 121 | print XS qq[#include "${table}.exh"\n]; 122 | } 123 | print XS "}\n"; 124 | close(XS); 125 | return "# Built $name.xs\n\n"; 126 | } 127 | 128 | sub postamble 129 | { 130 | my $self = shift; 131 | my $dir = $self->catdir($self->updir,'ucm'); 132 | my $str = "# $name\$(OBJ_EXT) depends on .h and .exh files not .c files - but all written by enc2xs\n"; 133 | $str .= "$name.c : $name.xs "; 134 | foreach my $table (sort keys %tables) 135 | { 136 | $str .= " $table.c"; 137 | } 138 | $str .= "\n\n"; 139 | $str .= "$name\$(OBJ_EXT) : $name.c\n\n"; 140 | 141 | my $enc2xs = $self->catfile($self->updir,'bin', 'enc2xs'); 142 | foreach my $table (sort keys %tables) 143 | { 144 | my $numlines = 1; 145 | my $lengthsofar = length($str); 146 | my $continuator = ''; 147 | $str .= "$table.c : $enc2xs Makefile.PL"; 148 | foreach my $file (@{$tables{$table}}) 149 | { 150 | $str .= $continuator.' '.$self->catfile($dir,$file); 151 | if ( length($str)-$lengthsofar > 128*$numlines ) 152 | { 153 | $continuator .= " \\\n\t"; 154 | $numlines++; 155 | } else { 156 | $continuator = ''; 157 | } 158 | } 159 | my $plib = $self->{PERL_CORE} ? '"-I$(PERL_LIB)"' : ''; 160 | $plib .= " -MCross=$::Cross::platform" if defined $::Cross::platform; 161 | my $ucopts = '-"Q"'; 162 | $str .= 163 | qq{\n\t\$(PERL) $plib $enc2xs $ucopts -o \$\@ -f $table.fnm\n\n}; 164 | open (FILELIST, ">$table.fnm") 165 | || die "Could not open $table.fnm: $!"; 166 | foreach my $file (@{$tables{$table}}) 167 | { 168 | print FILELIST $self->catfile($dir,$file) . "\n"; 169 | } 170 | close(FILELIST); 171 | } 172 | return $str; 173 | } 174 | 175 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | NAME 2 | Encode - character encodings 3 | 4 | SYNOPSIS 5 | use Encode; 6 | 7 | DESCRIPTION 8 | The "Encode" module provides the interfaces between Perl's 9 | strings and the rest of the system. Perl strings are 10 | sequences of characters. 11 | 12 | See "perldoc Encode" for the rest of the story 13 | 14 | INSTALLATION 15 | 16 | To install this module, type the following: 17 | 18 | perl Makefile.PL 19 | make 20 | make test 21 | make install 22 | 23 | To install scripts under bin/ directories also, 24 | 25 | perl Makefile.PL MORE_SCRIPTS 26 | make && make test && make install 27 | 28 | By default, only enc2xs and piconv are installed. 29 | 30 | To install *.ucm files also, say 31 | 32 | perl Makefile.PL INSTALL_UCM 33 | make && make test && make install 34 | 35 | By default, *.ucm are not installed. 36 | 37 | DEPENDENCIES 38 | 39 | This module requires perl5.7.3 or later. 40 | 41 | MAINTAINER 42 | 43 | This project was originated by Nick Ing-Simmons and later maintained by 44 | Dan Kogai . See AUTHORS for the full list of people 45 | involved. 46 | 47 | QUESTIONS? 48 | 49 | If you have any questions which "perldoc Encode" does not answer, please 50 | feel free to ask at perl-unicode@perl.org. 51 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![CI on Perl 5.{36,34} on {Linux,macOS,Windows}](https://github.com/dankogai/p5-encode/actions/workflows/platforms.yml/badge.svg)](https://github.com/dankogai/p5-encode/actions/workflows/platforms.yml) 2 | 3 | ## NAME 4 | 5 | Encode - character encodings 6 | 7 | ## SYNOPSIS 8 | 9 | use Encode; 10 | 11 | ## COMMAND LINE 12 | 13 | $ encguess README.md 14 | README.md US-ASCII 15 | 16 | See `encguess -h` for more options. 17 | 18 | ## DESCRIPTION 19 | 20 | The "Encode" module provides the interfaces between Perl's 21 | strings and the rest of the system. Perl strings are 22 | sequences of characters. 23 | 24 | See `perldoc Encode` for the rest of the story 25 | 26 | ## INSTALLATION 27 | 28 | To install this module, type the following: 29 | 30 | perl Makefile.PL 31 | make 32 | make test 33 | make install 34 | 35 | To install scripts under bin/ directories also, 36 | 37 | perl Makefile.PL MORE_SCRIPTS 38 | make && make test && make install 39 | 40 | By default, only enc2xs and piconv are installed. 41 | 42 | To install *.ucm files also, say 43 | 44 | perl Makefile.PL INSTALL_UCM 45 | make && make test && make install 46 | 47 | By default, *.ucm are not installed. 48 | 49 | ## DEPENDENCIES 50 | 51 | This module requires perl 5.7.3 or later. 52 | 53 | ## MAINTAINER 54 | 55 | This project was originated by Nick Ing-Simmons and later maintained by 56 | Dan Kogai ``. See AUTHORS for the full list of people 57 | involved. 58 | 59 | ## QUESTIONS? 60 | 61 | If you have any questions which `perldoc Encode` does not answer, please 62 | feel free to ask at `perl-unicode@perl.org`. 63 | -------------------------------------------------------------------------------- /Symbol/Makefile.PL: -------------------------------------------------------------------------------- 1 | use 5.7.2; 2 | use strict; 3 | use ExtUtils::MakeMaker; 4 | 5 | my $name = 'Symbol'; 6 | my %tables = ( 7 | symbol_t => [qw( 8 | symbol.ucm 9 | dingbats.ucm 10 | adobeSymbol.ucm 11 | adobeZdingbat.ucm 12 | macSymbol.ucm 13 | macDingbats.ucm 14 | ) 15 | ], 16 | ); 17 | 18 | WriteMakefile( 19 | INC => "-I../Encode", 20 | NAME => 'Encode::'.$name, 21 | VERSION_FROM => "$name.pm", 22 | OBJECT => '$(O_FILES)', 23 | 'dist' => { 24 | COMPRESS => 'gzip -9f', 25 | SUFFIX => 'gz', 26 | DIST_DEFAULT => 'all tardist', 27 | }, 28 | MAN3PODS => {}, 29 | # OS 390 winges about line numbers > 64K ??? 30 | XSOPT => '-nolinenumbers', 31 | ); 32 | 33 | package MY; 34 | 35 | sub post_initialize 36 | { 37 | my ($self) = @_; 38 | my %o; 39 | my $x = $self->{'OBJ_EXT'}; 40 | # Add the table O_FILES 41 | foreach my $e (keys %tables) 42 | { 43 | $o{$e.$x} = 1; 44 | } 45 | $o{"$name$x"} = 1; 46 | $self->{'O_FILES'} = [sort keys %o]; 47 | my @files = ("$name.xs"); 48 | $self->{'C'} = ["$name.c"]; 49 | $self->{SOURCE} .= " $name.c" 50 | if $^O eq 'MacOS' && $self->{SOURCE} !~ /\b$name\.c\b/; 51 | $self->{'H'} = [$self->catfile($self->updir,'Encode', 'encode.h')]; 52 | my %xs; 53 | foreach my $table (sort keys %tables) { 54 | push (@{$self->{'C'}},"$table.c"); 55 | # Do NOT add $table.h etc. to H_FILES unless we own up as to how they 56 | # get built. 57 | foreach my $ext (qw($(OBJ_EXT) .c .h .exh .fnm)) { 58 | push (@files,$table.$ext); 59 | } 60 | $self->{SOURCE} .= " $table.c" 61 | if $^O eq 'MacOS' && $self->{SOURCE} !~ /\b$table\.c\b/; 62 | } 63 | $self->{'XS'} = { "$name.xs" => "$name.c" }; 64 | $self->{'clean'}{'FILES'} .= join(' ',@files); 65 | open(XS,">$name.xs") || die "Cannot open $name.xs:$!"; 66 | print XS <<'END'; 67 | #define PERL_NO_GET_CONTEXT 68 | #include 69 | #include 70 | #include 71 | #include "encode.h" 72 | END 73 | foreach my $table (sort keys %tables) { 74 | print XS qq[#include "${table}.h"\n]; 75 | } 76 | print XS <<"END"; 77 | 78 | static void 79 | Encode_XSEncoding(pTHX_ encode_t *enc) 80 | { 81 | dSP; 82 | HV *stash = gv_stashpv("Encode::XS", TRUE); 83 | SV *iv = newSViv(PTR2IV(enc)); 84 | SV *sv = sv_bless(newRV_noinc(iv),stash); 85 | int i = 0; 86 | /* with the SvLEN() == 0 hack, PVX won't be freed. We cast away name's 87 | constness, in the hope that perl won't mess with it. */ 88 | assert(SvTYPE(iv) >= SVt_PV); assert(SvLEN(iv) == 0); 89 | SvFLAGS(iv) |= SVp_POK; 90 | SvPVX(iv) = (char*) enc->name[0]; 91 | PUSHMARK(sp); 92 | XPUSHs(sv); 93 | while (enc->name[i]) 94 | { 95 | const char *name = enc->name[i++]; 96 | XPUSHs(sv_2mortal(newSVpvn(name,strlen(name)))); 97 | } 98 | PUTBACK; 99 | call_pv("Encode::define_encoding",G_DISCARD); 100 | SvREFCNT_dec(sv); 101 | } 102 | 103 | MODULE = Encode::$name PACKAGE = Encode::$name 104 | PROTOTYPES: DISABLE 105 | BOOT: 106 | { 107 | END 108 | foreach my $table (sort keys %tables) { 109 | print XS qq[#include "${table}.exh"\n]; 110 | } 111 | print XS "}\n"; 112 | close(XS); 113 | return "# Built $name.xs\n\n"; 114 | } 115 | 116 | sub postamble 117 | { 118 | my $self = shift; 119 | my $dir = $self->catdir($self->updir,'ucm'); 120 | my $str = "# $name\$(OBJ_EXT) depends on .h and .exh files not .c files - but all written by enc2xs\n"; 121 | $str .= "$name.c : $name.xs "; 122 | foreach my $table (sort keys %tables) 123 | { 124 | $str .= " $table.c"; 125 | } 126 | $str .= "\n\n"; 127 | $str .= "$name\$(OBJ_EXT) : $name.c\n\n"; 128 | 129 | my $enc2xs = $self->catfile($self->updir,'bin', 'enc2xs'); 130 | foreach my $table (sort keys %tables) 131 | { 132 | my $numlines = 1; 133 | my $lengthsofar = length($str); 134 | my $continuator = ''; 135 | $str .= "$table.c : $enc2xs Makefile.PL"; 136 | foreach my $file (@{$tables{$table}}) 137 | { 138 | $str .= $continuator.' '.$self->catfile($dir,$file); 139 | if ( length($str)-$lengthsofar > 128*$numlines ) 140 | { 141 | $continuator .= " \\\n\t"; 142 | $numlines++; 143 | } else { 144 | $continuator = ''; 145 | } 146 | } 147 | my $plib = $self->{PERL_CORE} ? '"-I$(PERL_LIB)"' : ''; 148 | $plib .= " -MCross=$::Cross::platform" if defined $::Cross::platform; 149 | my $ucopts = '-"Q" -"O"'; 150 | $str .= 151 | qq{\n\t\$(PERL) $plib $enc2xs $ucopts -o \$\@ -f $table.fnm\n\n}; 152 | open (FILELIST, ">$table.fnm") 153 | || die "Could not open $table.fnm: $!"; 154 | foreach my $file (@{$tables{$table}}) 155 | { 156 | print FILELIST $self->catfile($dir,$file) . "\n"; 157 | } 158 | close(FILELIST); 159 | } 160 | return $str; 161 | } 162 | 163 | -------------------------------------------------------------------------------- /Symbol/Symbol.pm: -------------------------------------------------------------------------------- 1 | package Encode::Symbol; 2 | use strict; 3 | use warnings; 4 | use Encode; 5 | our $VERSION = do { my @r = ( q$Revision: 2.2 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; 6 | 7 | use XSLoader; 8 | XSLoader::load( __PACKAGE__, $VERSION ); 9 | 10 | 1; 11 | __END__ 12 | 13 | =head1 NAME 14 | 15 | Encode::Symbol - Symbol Encodings 16 | 17 | =head1 SYNOPSIS 18 | 19 | use Encode qw/encode decode/; 20 | $symbol = encode("symbol", $utf8); # loads Encode::Symbol implicitly 21 | $utf8 = decode("", $symbol); # ditto 22 | 23 | =head1 ABSTRACT 24 | 25 | This module implements symbol and dingbats encodings. Encodings 26 | supported are as follows. 27 | 28 | Canonical Alias Description 29 | -------------------------------------------------------------------- 30 | symbol 31 | dingbats 32 | AdobeZDingbat 33 | AdobeSymbol 34 | MacDingbats 35 | 36 | =head1 DESCRIPTION 37 | 38 | To find out how to use this module in detail, see L. 39 | 40 | =head1 SEE ALSO 41 | 42 | L 43 | 44 | =cut 45 | -------------------------------------------------------------------------------- /TW/Makefile.PL: -------------------------------------------------------------------------------- 1 | use 5.7.2; 2 | use strict; 3 | use ExtUtils::MakeMaker; 4 | use strict; 5 | 6 | my %tables = (big5_t => ['big5-eten.ucm', 7 | 'big5-hkscs.ucm', 8 | 'macChintrad.ucm', 9 | 'cp950.ucm'], 10 | ); 11 | 12 | unless ($ENV{AGGREGATE_TABLES}){ 13 | my @ucm; 14 | for my $k (keys %tables){ 15 | push @ucm, @{$tables{$k}}; 16 | } 17 | %tables = (); 18 | my $seq = 0; 19 | for my $ucm (sort @ucm){ 20 | # 8.3 compliance ! 21 | my $t = sprintf ("%s_%02d_t", substr($ucm, 0, 2), $seq++); 22 | $tables{$t} = [ $ucm ]; 23 | } 24 | } 25 | 26 | my $name = 'TW'; 27 | 28 | WriteMakefile( 29 | INC => "-I../Encode", 30 | NAME => 'Encode::'.$name, 31 | VERSION_FROM => "$name.pm", 32 | OBJECT => '$(O_FILES)', 33 | 'dist' => { 34 | COMPRESS => 'gzip -9f', 35 | SUFFIX => 'gz', 36 | DIST_DEFAULT => 'all tardist', 37 | }, 38 | MAN3PODS => {}, 39 | # OS 390 winges about line numbers > 64K ??? 40 | XSOPT => '-nolinenumbers', 41 | ); 42 | 43 | package MY; 44 | 45 | sub post_initialize 46 | { 47 | my ($self) = @_; 48 | my %o; 49 | my $x = $self->{'OBJ_EXT'}; 50 | # Add the table O_FILES 51 | foreach my $e (keys %tables) 52 | { 53 | $o{$e.$x} = 1; 54 | } 55 | $o{"$name$x"} = 1; 56 | $self->{'O_FILES'} = [sort keys %o]; 57 | my @files = ("$name.xs"); 58 | $self->{'C'} = ["$name.c"]; 59 | $self->{SOURCE} .= " $name.c" 60 | if $^O eq 'MacOS' && $self->{SOURCE} !~ /\b$name\.c\b/; 61 | $self->{'H'} = [$self->catfile($self->updir,'Encode', 'encode.h')]; 62 | my %xs; 63 | foreach my $table (sort keys %tables) { 64 | push (@{$self->{'C'}},"$table.c"); 65 | # Do NOT add $table.h etc. to H_FILES unless we own up as to how they 66 | # get built. 67 | foreach my $ext (qw($(OBJ_EXT) .c .h .exh .fnm)) { 68 | push (@files,$table.$ext); 69 | } 70 | $self->{SOURCE} .= " $table.c" 71 | if $^O eq 'MacOS' && $self->{SOURCE} !~ /\b$table\.c\b/; 72 | } 73 | $self->{'XS'} = { "$name.xs" => "$name.c" }; 74 | $self->{'clean'}{'FILES'} .= join(' ',@files); 75 | open(XS,">$name.xs") || die "Cannot open $name.xs:$!"; 76 | print XS <<'END'; 77 | #define PERL_NO_GET_CONTEXT 78 | #include 79 | #include 80 | #include 81 | #include "encode.h" 82 | END 83 | foreach my $table (sort keys %tables) { 84 | print XS qq[#include "${table}.h"\n]; 85 | } 86 | print XS <<"END"; 87 | 88 | static void 89 | Encode_XSEncoding(pTHX_ encode_t *enc) 90 | { 91 | dSP; 92 | HV *stash = gv_stashpv("Encode::XS", TRUE); 93 | SV *iv = newSViv(PTR2IV(enc)); 94 | SV *sv = sv_bless(newRV_noinc(iv),stash); 95 | int i = 0; 96 | /* with the SvLEN() == 0 hack, PVX won't be freed. We cast away name's 97 | constness, in the hope that perl won't mess with it. */ 98 | assert(SvTYPE(iv) >= SVt_PV); assert(SvLEN(iv) == 0); 99 | SvFLAGS(iv) |= SVp_POK; 100 | SvPVX(iv) = (char*) enc->name[0]; 101 | PUSHMARK(sp); 102 | XPUSHs(sv); 103 | while (enc->name[i]) 104 | { 105 | const char *name = enc->name[i++]; 106 | XPUSHs(sv_2mortal(newSVpvn(name,strlen(name)))); 107 | } 108 | PUTBACK; 109 | call_pv("Encode::define_encoding",G_DISCARD); 110 | SvREFCNT_dec(sv); 111 | } 112 | 113 | MODULE = Encode::$name PACKAGE = Encode::$name 114 | PROTOTYPES: DISABLE 115 | BOOT: 116 | { 117 | END 118 | foreach my $table (sort keys %tables) { 119 | print XS qq[#include "${table}.exh"\n]; 120 | } 121 | print XS "}\n"; 122 | close(XS); 123 | return "# Built $name.xs\n\n"; 124 | } 125 | 126 | sub postamble 127 | { 128 | my $self = shift; 129 | my $dir = $self->catdir($self->updir,'ucm'); 130 | my $str = "# $name\$(OBJ_EXT) depends on .h and .exh files not .c files - but all written by enc2xs\n"; 131 | $str .= "$name.c : $name.xs "; 132 | foreach my $table (sort keys %tables) 133 | { 134 | $str .= " $table.c"; 135 | } 136 | $str .= "\n\n"; 137 | $str .= "$name\$(OBJ_EXT) : $name.c\n\n"; 138 | 139 | my $enc2xs = $self->catfile($self->updir,'bin', 'enc2xs'); 140 | foreach my $table (sort keys %tables) 141 | { 142 | my $numlines = 1; 143 | my $lengthsofar = length($str); 144 | my $continuator = ''; 145 | $str .= "$table.c : $enc2xs Makefile.PL"; 146 | foreach my $file (@{$tables{$table}}) 147 | { 148 | $str .= $continuator.' '.$self->catfile($dir,$file); 149 | if ( length($str)-$lengthsofar > 128*$numlines ) 150 | { 151 | $continuator .= " \\\n\t"; 152 | $numlines++; 153 | } else { 154 | $continuator = ''; 155 | } 156 | } 157 | my $plib = $self->{PERL_CORE} ? '"-I$(PERL_LIB)"' : ''; 158 | $plib .= " -MCross=$::Cross::platform" if defined $::Cross::platform; 159 | my $ucopts = '-"Q"'; 160 | $str .= 161 | qq{\n\t\$(PERL) $plib $enc2xs $ucopts -o \$\@ -f $table.fnm\n\n}; 162 | open (FILELIST, ">$table.fnm") 163 | || die "Could not open $table.fnm: $!"; 164 | foreach my $file (@{$tables{$table}}) 165 | { 166 | print FILELIST $self->catfile($dir,$file) . "\n"; 167 | } 168 | close(FILELIST); 169 | } 170 | return $str; 171 | } 172 | 173 | -------------------------------------------------------------------------------- /TW/TW.pm: -------------------------------------------------------------------------------- 1 | package Encode::TW; 2 | BEGIN { 3 | if ( ord("A") == 193 ) { 4 | die "Encode::TW not supported on EBCDIC\n"; 5 | } 6 | } 7 | use strict; 8 | use warnings; 9 | use Encode; 10 | our $VERSION = do { my @r = ( q$Revision: 2.3 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; 11 | use XSLoader; 12 | XSLoader::load( __PACKAGE__, $VERSION ); 13 | 14 | 1; 15 | __END__ 16 | 17 | =head1 NAME 18 | 19 | Encode::TW - Taiwan-based Chinese Encodings 20 | 21 | =head1 SYNOPSIS 22 | 23 | use Encode qw/encode decode/; 24 | $big5 = encode("big5", $utf8); # loads Encode::TW implicitly 25 | $utf8 = decode("big5", $big5); # ditto 26 | 27 | =head1 DESCRIPTION 28 | 29 | This module implements tradition Chinese charset encodings as used 30 | in Taiwan and Hong Kong. 31 | Encodings supported are as follows. 32 | 33 | Canonical Alias Description 34 | -------------------------------------------------------------------- 35 | big5-eten /\bbig-?5$/i Big5 encoding (with ETen extensions) 36 | /\bbig5-?et(en)?$/i 37 | /\btca-?big5$/i 38 | big5-hkscs /\bbig5-?hk(scs)?$/i 39 | /\bhk(scs)?-?big5$/i 40 | Big5 + Cantonese characters in Hong Kong 41 | MacChineseTrad Big5 + Apple Vendor Mappings 42 | cp950 Code Page 950 43 | = Big5 + Microsoft vendor mappings 44 | -------------------------------------------------------------------- 45 | 46 | To find out how to use this module in detail, see L. 47 | 48 | =head1 NOTES 49 | 50 | Due to size concerns, C (Extended Unix Character), C 51 | (Chinese Character Code for Information Interchange), C 52 | (CMEX's Big5+) and C (CMEX's Big5e) are distributed separately 53 | on CPAN, under the name L. That module also contains 54 | extra China-based encodings. 55 | 56 | =head1 BUGS 57 | 58 | Since the original C encoding (1984) is not supported anywhere 59 | (glibc and DOS-based systems uses C to mean C; Microsoft 60 | uses C to mean C), a conscious decision was made to alias 61 | C to C, which is the de facto superset of the original 62 | big5. 63 | 64 | The C encoding files are not complete. For common C 65 | manipulation, please use C in L, which contains 66 | planes 1-7. 67 | 68 | The ASCII region (0x00-0x7f) is preserved for all encodings, even 69 | though this conflicts with mappings by the Unicode Consortium. 70 | 71 | =head1 SEE ALSO 72 | 73 | L 74 | 75 | =cut 76 | -------------------------------------------------------------------------------- /Unicode/Makefile.PL: -------------------------------------------------------------------------------- 1 | use 5.7.2; 2 | use strict; 3 | use ExtUtils::MakeMaker; 4 | 5 | WriteMakefile( 6 | INC => "-I../Encode", 7 | NAME => 'Encode::Unicode', 8 | VERSION_FROM => "Unicode.pm", 9 | MAN3PODS => {}, 10 | ); 11 | 12 | -------------------------------------------------------------------------------- /bin/encguess: -------------------------------------------------------------------------------- 1 | #!./perl 2 | use 5.008001; 3 | BEGIN { pop @INC if $INC[-1] eq '.' } 4 | use strict; 5 | use warnings; 6 | use Encode; 7 | use Getopt::Std; 8 | use Carp; 9 | use Encode::Guess; 10 | $Getopt::Std::STANDARD_HELP_VERSION = 1; 11 | 12 | my %opt; 13 | getopts( "huSs:", \%opt ); 14 | my @suspect_list; 15 | list_valid_suspects() and exit if $opt{S}; 16 | @suspect_list = split /:,/, $opt{s} if $opt{s}; 17 | HELP_MESSAGE() if $opt{h}; 18 | HELP_MESSAGE() unless @ARGV; 19 | do_guess($_) for @ARGV; 20 | 21 | sub read_file { 22 | my $filename = shift; 23 | local $/; 24 | open my $fh, '<:raw', $filename or croak "$filename:$!"; 25 | my $content = <$fh>; 26 | close $fh; 27 | return $content; 28 | } 29 | 30 | sub do_guess { 31 | my $filename = shift; 32 | my $data = read_file($filename); 33 | my $enc = guess_encoding( $data, @suspect_list ); 34 | if ( !ref($enc) && $opt{u} ) { 35 | return 1; 36 | } 37 | print "$filename\t"; 38 | if ( ref($enc) ) { 39 | print $enc->mime_name(); 40 | } 41 | else { 42 | print "unknown"; 43 | } 44 | print "\n"; 45 | return 1; 46 | } 47 | 48 | sub list_valid_suspects { 49 | print join( "\n", Encode->encodings(":all") ); 50 | print "\n"; 51 | return 1; 52 | } 53 | 54 | sub HELP_MESSAGE { 55 | exec 'pod2usage', $0 or die "pod2usage: $!" 56 | } 57 | __END__ 58 | =head1 NAME 59 | 60 | encguess - guess character encodings of files 61 | 62 | =head1 VERSION 63 | 64 | $Id: encguess,v 0.4 2023/11/10 01:10:50 dankogai Exp $ 65 | 66 | =head1 SYNOPSIS 67 | 68 | encguess [switches] filename... 69 | 70 | =head2 SWITCHES 71 | 72 | =over 2 73 | 74 | =item -h 75 | 76 | show this message and exit. 77 | 78 | =item -s 79 | 80 | specify a list of "suspect encoding types" to test, 81 | separated by either C<:> or C<,> 82 | 83 | =item -S 84 | 85 | output a list of all acceptable encoding types that can be used with 86 | the -s param 87 | 88 | =item -u 89 | 90 | suppress display of unidentified types 91 | 92 | =back 93 | 94 | =head2 EXAMPLES: 95 | 96 | =over 2 97 | 98 | =item * 99 | 100 | Guess encoding of a file named C, using only the default 101 | suspect types. 102 | 103 | encguess test.txt 104 | 105 | =item * 106 | 107 | Guess the encoding type of a file named C, using the suspect 108 | types C. 109 | 110 | encguess -s euc-jp,shiftjis,7bit-jis test.txt 111 | encguess -s euc-jp:shiftjis:7bit-jis test.txt 112 | 113 | =item * 114 | 115 | Guess the encoding type of several files, do not display results for 116 | unidentified files. 117 | 118 | encguess -us euc-jp,shiftjis,7bit-jis test*.txt 119 | 120 | =back 121 | 122 | =head1 DESCRIPTION 123 | 124 | The encoding identification is done by checking one encoding type at a 125 | time until all but the right type are eliminated. The set of encoding 126 | types to try is defined by the -s parameter and defaults to ascii, 127 | utf8 and UTF-16/32 with BOM. This can be overridden by passing one or 128 | more encoding types via the -s parameter. If you need to pass in 129 | multiple suspect encoding types, use a quoted string with the a space 130 | separating each value. 131 | 132 | =head1 SEE ALSO 133 | 134 | L, L 135 | 136 | =head1 LICENSE AND COPYRIGHT 137 | 138 | Copyright 2015 Michael LaGrasta and Dan Kogai. 139 | 140 | This program is free software; you can redistribute it and/or modify it 141 | under the terms of the Artistic License (2.0). You may obtain a 142 | copy of the full license at: 143 | 144 | L 145 | 146 | =cut 147 | -------------------------------------------------------------------------------- /bin/ucm2table: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | # $Id: ucm2table,v 2.1 2006/05/03 18:24:10 dankogai Exp $ 3 | # 4 | 5 | use 5.006; 6 | use strict; 7 | use Getopt::Std; 8 | my %Opt; 9 | getopts("aeu", \%Opt); 10 | my %Chartab; 11 | 12 | my $Hex = '[0-9A-Fa-f]'; 13 | while(<>){ 14 | chomp; 15 | my ($uni, $enc, $fb) = 16 | /^\s+(\S+)\s+\|(\d)/o or next; 17 | $fb eq '0' or next; 18 | my @byte = (); 19 | my $ord = 0; 20 | while($enc =~ /\G\\x($Hex+)/iog){ 21 | my $byte = hex($1); 22 | push @byte, $byte; 23 | $ord <<= 8; $ord += $byte; 24 | }; 25 | # print join('', @byte), " => $ord \n"; 26 | if ($Opt{u}){ 27 | $Chartab{$ord} = pack("U", hex($uni)); 28 | }else{ 29 | $Chartab{$ord} = pack("C*", @byte); 30 | } 31 | } 32 | 33 | my $start = $Opt{a} ? 0x20 : 0xa0; 34 | 35 | for (my $x = $start; $x <= 0xffff; $x += 32) { 36 | my $line = ''; 37 | for my $i (0..31){ 38 | my $num = $x+$i; $num eq 0x7f and next; # skip delete 39 | my $char = $Chartab{$num}; 40 | $line .= !$char ? " " : 41 | ($num < 0x7f ) ? " $char" : $char ; 42 | } 43 | $line =~ /^\s+$/o and next; 44 | printf "0x%04x: $line\n", $x; 45 | } 46 | -------------------------------------------------------------------------------- /bin/ucmlint: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/perl 2 | # 3 | # $Id: ucmlint,v 2.4 2017/06/10 17:23:50 dankogai Exp $ 4 | # 5 | 6 | BEGIN { pop @INC if $INC[-1] eq '.' } 7 | use strict; 8 | our $VERSION = do { my @r = (q$Revision: 2.4 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; 9 | 10 | use Getopt::Std; 11 | our %Opt; 12 | getopts("Dehfv", \%Opt); 13 | 14 | if ($Opt{e}){ 15 | eval { require Encode } or die "can't load Encode : $@"; 16 | } 17 | 18 | $Opt{h} and help(); 19 | @ARGV or help(); 20 | 21 | sub help{ 22 | print <<""; 23 | $0 -[Dehfv] [ucm files ...] 24 | -D debug mode on 25 | -e test with Encode module also (requires perl 5.7.3 or higher) 26 | -h shows this message 27 | -f forces roundtrip check even for |[123] 28 | -v verbose mode 29 | 30 | } 31 | 32 | $| = 1; 33 | my (%Hdr, %U2E, %E2U, %Fallback); 34 | my $in_charmap = 0; 35 | my $nerror = 0; 36 | my $nwarning = 0; 37 | 38 | sub nit($;$){ 39 | my ($msg, $level) = @_; 40 | my $lstr; 41 | if ($level == 2){ 42 | $lstr = 'notice'; 43 | }elsif ($level == 1){ 44 | $lstr = 'warning'; $nwarning++; 45 | }else{ 46 | $lstr = 'error'; $nerror++; 47 | } 48 | print "$ARGV:$lstr in line $.: $msg\n"; 49 | } 50 | 51 | for $ARGV (@ARGV){ 52 | open UCM, $ARGV or die "$ARGV:$!"; 53 | %Hdr = %U2E = %E2U = %Fallback = (); 54 | $in_charmap = $nerror = $nwarning = 0; 55 | $. = 0; 56 | while(){ 57 | chomp; 58 | s/\s*#.*$//o; /^$/ and next; 59 | if ($_ eq "CHARMAP"){ 60 | $in_charmap = 1; 61 | for my $must (qw/code_set_name mb_cur_min mb_cur_max/){ 62 | exists $Hdr{$must} or nit "<$must> nonexistent"; 63 | } 64 | $Hdr{mb_cur_min} > $Hdr{mb_cur_max} 65 | and nit sprintf("mb_cur_min(%d) > mb_cur_max(%d)", 66 | $Hdr{mb_cur_min},$Hdr{mb_cur_max}); 67 | $in_charmap = 1; 68 | next; 69 | } 70 | unless ($in_charmap){ 71 | my($hkey, $hvalue) = /^<(\S+)>\s+[\"\']?([^\"\']+)/o or next; 72 | $Opt{D} and warn "$hkey => $hvalue"; 73 | if ($hkey eq "code_set_name"){ # name check 74 | exists $Hdr{code_set_name} 75 | and nit "Duplicate : $hkey"; 76 | } 77 | if ($hkey eq "code_set_alias"){ # alias check 78 | $hvalue eq $Hdr{code_set_name} 79 | and nit qq(alias "$hvalue" is already in ); 80 | } 81 | $Hdr{$hkey} = $hvalue; 82 | }else{ 83 | my $name = $Hdr{code_set_name}; 84 | my($unistr, $encstr, $fb) = /^(\S+)\s+(\S+)\s(\S+)/o or next; 85 | $Opt{v} and nit $_, 2; 86 | my $uni = uniparse($unistr); 87 | my $enc = encparse($encstr); 88 | $fb =~ /^\|([0123])$/ or nit "malformed fallback: $fb"; 89 | $fb = $1; 90 | $Opt{f} and $fb = 0; 91 | unless ($fb == 3){ # check uni -> enc 92 | if (exists $U2E{$uni}){ 93 | nit "dupe encode map: U$uni => $U2E{$uni} and $enc", 1; 94 | }else{ 95 | $U2E{$uni} = $enc; 96 | $Fallback{$uni}{$enc} = 1 if $fb == 1; 97 | if ($Opt{e}) { 98 | my $e = hex2enc($enc); 99 | my $u = hex2uni($uni); 100 | my $eu = Encode::encode($name, $u); 101 | $e eq $eu 102 | or nit qq(encode('$name', $uni) != $enc); 103 | } 104 | } 105 | } 106 | unless ($fb == 1){ # check enc -> uni 107 | if (exists $E2U{$enc}){ 108 | nit "dupe decode map: $enc => U$E2U{$enc} and U$uni", 1; 109 | }else{ 110 | $E2U{$enc} = $uni; 111 | $Fallback{$enc}{$uni} = 1 if $fb == 3; 112 | if ($Opt{e}) { 113 | my $e = hex2enc($enc); 114 | my $u = hex2uni($uni); 115 | $Opt{D} and warn "$uni, $enc"; 116 | my $de = Encode::decode($name, $e); 117 | $de eq $u 118 | or nit qq(decode('$name', $enc) != $uni); 119 | } 120 | } 121 | } 122 | # warn "$uni, $enc, $fb"; 123 | } 124 | } 125 | $in_charmap or nit "Where is CHARMAP?"; 126 | checkRT(); 127 | printf ("$ARGV: %s error%s found\n", 128 | ($nerror == 0 ? 'no' : $nerror), 129 | ($nerror > 1 ? 's' : '')); 130 | } 131 | 132 | exit; 133 | 134 | sub hex2enc{ 135 | pack("C*", map {hex($_)} split(",", shift)); 136 | } 137 | sub hex2uni{ 138 | join("", map { chr(hex($_)) } split(",", shift)); 139 | } 140 | 141 | sub checkRT{ 142 | for my $uni (keys %E2U){ 143 | my $enc = $U2E{$uni} or next; # okay 144 | $E2U{$U2E{$uni}} eq $uni or $Fallback{$uni}{$enc} or 145 | nit "RT failure: U$uni => $enc =>U$E2U{$U2E{$uni}}"; 146 | } 147 | for my $enc (keys %E2U){ 148 | my $uni = $E2U{$enc} or next; # okay 149 | $U2E{$E2U{$enc}} eq $enc or $Fallback{$enc}{$uni} or 150 | nit "RT failure: $enc => U$uni => $U2E{$E2U{$enc}}"; 151 | } 152 | } 153 | 154 | 155 | sub uniparse{ 156 | my $str = shift; 157 | my @u; 158 | push @u, $1 while($str =~ /\G/ig); 159 | for my $u (@u){ 160 | $u =~ /^([0-9A-Za-z]+)$/o 161 | or nit "malformed Unicode character: $u"; 162 | } 163 | return join(',', @u); 164 | } 165 | 166 | sub encparse{ 167 | my $str = shift; 168 | my @e; 169 | for my $e (split /\\x/io, $str){ 170 | $e or next; # first \x 171 | $e =~ /^([0-9A-Za-z]{1,2})$/io 172 | or nit "Hex $e in $str is bogus"; 173 | push @e, $1; 174 | } 175 | return join(',', @e); 176 | } 177 | 178 | 179 | 180 | __END__ 181 | 182 | A UCM file looks like this. 183 | 184 | # 185 | # Comments 186 | # 187 | "US-ascii" # Required 188 | "ascii" # Optional 189 | 1 # Required; usually 1 190 | 1 # Max. # of bytes/char 191 | \x3F # Substitution char 192 | # 193 | CHARMAP 194 | \x00 |0 # 195 | \x01 |0 # 196 | \x02 |0 # 197 | .... 198 | \x7C |0 # VERTICAL LINE 199 | \x7D |0 # RIGHT CURLY BRACKET 200 | \x7E |0 # TILDE 201 | \x7F |0 # 202 | END CHARMAP 203 | 204 | -------------------------------------------------------------------------------- /bin/ucmsort: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/perl 2 | # 3 | # $Id: ucmsort,v 2.2 2006/05/03 18:24:10 dankogai Exp $ 4 | # 5 | use strict; 6 | my @lines; 7 | my ($head, $tail); 8 | while (<>){ 9 | unless (m/^.*//o; 21 | push @lines,[ $u, @words ]; 22 | } 23 | 24 | print $head; 25 | for (sort { 26 | hex($a->[0]) <=> hex($b->[0]) # Unicode descending order 27 | or $a->[2] cmp $b->[2] # fallback descending order 28 | or $a->[1] cmp $b->[1] # Encoding descending order 29 | } 30 | @lines) { 31 | my $u = shift @$_; 32 | print join(" " => "", @$_), "\n"; 33 | } 34 | print $tail; 35 | __END__ 36 | -------------------------------------------------------------------------------- /encengine.c: -------------------------------------------------------------------------------- 1 | /* 2 | Data structures for encoding transformations. 3 | 4 | Perl works internally in either a native 'byte' encoding or 5 | in UTF-8 encoded Unicode. We have no immediate need for a "wchar_t" 6 | representation. When we do we can use utf8_to_uv(). 7 | 8 | Most character encodings are either simple byte mappings or 9 | variable length multi-byte encodings. UTF-8 can be viewed as a 10 | rather extreme case of the latter. 11 | 12 | So to solve an important part of perl's encode needs we need to solve the 13 | "multi-byte -> multi-byte" case. The simple byte forms are then just degenerate 14 | case. (Where one of multi-bytes will usually be UTF-8.) 15 | 16 | The other type of encoding is a shift encoding where a prefix sequence 17 | determines what subsequent bytes mean. Such encodings have state. 18 | 19 | We also need to handle case where a character in one encoding has to be 20 | represented as multiple characters in the other. e.g. letter+diacritic. 21 | 22 | The process can be considered as pseudo perl: 23 | 24 | my $dst = ''; 25 | while (length($src)) 26 | { 27 | my $size = src_count($src); 28 | my $in_seq = substr($src,0,$size,''); 29 | my $out_seq = $s2d_hash{$in_seq}; 30 | if (defined $out_seq) 31 | { 32 | $dst .= $out_seq; 33 | } 34 | else 35 | { 36 | # an error condition 37 | } 38 | } 39 | return $dst; 40 | 41 | That has the following components: 42 | &src_count - a "rule" for how many bytes make up the next character in the 43 | source. 44 | %s2d_hash - a mapping from input sequences to output sequences 45 | 46 | The problem with that scheme is that it does not allow the output 47 | character repertoire to affect the characters considered from the 48 | input. 49 | 50 | So we use a "trie" representation which can also be considered 51 | a state machine: 52 | 53 | my $dst = ''; 54 | my $seq = \@s2d_seq; 55 | my $next = \@s2d_next; 56 | while (length($src)) 57 | { 58 | my $byte = $substr($src,0,1,''); 59 | my $out_seq = $seq->[$byte]; 60 | if (defined $out_seq) 61 | { 62 | $dst .= $out_seq; 63 | } 64 | else 65 | { 66 | # an error condition 67 | } 68 | ($next,$seq) = @$next->[$byte] if $next; 69 | } 70 | return $dst; 71 | 72 | There is now a pair of data structures to represent everything. 73 | It is valid for output sequence at a particular point to 74 | be defined but zero length, that just means "don't know yet". 75 | For the single byte case there is no 'next' so new tables will be the same as 76 | the original tables. For a multi-byte case a prefix byte will flip to the tables 77 | for the next page (adding nothing to the output), then the tables for the page 78 | will provide the actual output and set tables back to original base page. 79 | 80 | This scheme can also handle shift encodings. 81 | 82 | A slight enhancement to the scheme also allows for look-ahead - if 83 | we add a flag to re-add the removed byte to the source we could handle 84 | a" -> U+00E4 (LATIN SMALL LETTER A WITH DIAERESIS) 85 | ab -> a (and take b back please) 86 | 87 | */ 88 | 89 | #define PERL_NO_GET_CONTEXT 90 | #include 91 | #include 92 | #include "encode.h" 93 | 94 | int 95 | do_encode(const encpage_t * enc, const U8 * src, STRLEN * slen, U8 * dst, 96 | STRLEN dlen, STRLEN * dout, int approx, const U8 *term, STRLEN tlen) 97 | { 98 | const U8 *s = src; 99 | const U8 *send = s + *slen; 100 | const U8 *last = s; 101 | U8 *d = dst; 102 | U8 *dend = d + dlen, *dlast = d; 103 | int code = 0; 104 | if (!dst) 105 | return ENCODE_NOSPACE; 106 | while (s < send) { 107 | const encpage_t *e = enc; 108 | U8 byte = *s; 109 | while (byte > e->max) 110 | e++; 111 | if (byte >= e->min && e->slen && (approx || !(e->slen & 0x80))) { 112 | const U8 *cend = s + (e->slen & 0x7f); 113 | if (cend <= send) { 114 | STRLEN n; 115 | if ((n = e->dlen)) { 116 | const U8 *out = e->seq + n * (byte - e->min); 117 | U8 *oend = d + n; 118 | if (dst) { 119 | if (oend <= dend) { 120 | while (d < oend) 121 | *d++ = *out++; 122 | } 123 | else { 124 | /* Out of space */ 125 | code = ENCODE_NOSPACE; 126 | break; 127 | } 128 | } 129 | else 130 | d = oend; 131 | } 132 | enc = e->next; 133 | s++; 134 | if (s == cend) { 135 | if (approx && (e->slen & 0x80)) 136 | code = ENCODE_FALLBACK; 137 | last = s; 138 | if (term && (STRLEN)(d-dlast) == tlen && memEQ(dlast, term, tlen)) { 139 | code = ENCODE_FOUND_TERM; 140 | break; 141 | } 142 | dlast = d; 143 | } 144 | } 145 | else { 146 | /* partial source character */ 147 | code = ENCODE_PARTIAL; 148 | break; 149 | } 150 | } 151 | else { 152 | /* Cannot represent */ 153 | code = ENCODE_NOREP; 154 | break; 155 | } 156 | } 157 | *slen = last - src; 158 | *dout = d - dst; 159 | return code; 160 | } 161 | -------------------------------------------------------------------------------- /lib/Encode/CJKConstants.pm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dankogai/p5-encode/51e8cc56415253dfe27d69204b925b4df74b8a59/lib/Encode/CJKConstants.pm -------------------------------------------------------------------------------- /lib/Encode/Config.pm: -------------------------------------------------------------------------------- 1 | # 2 | # Demand-load module list 3 | # 4 | package Encode::Config; 5 | our $VERSION = do { my @r = ( q$Revision: 2.5 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; 6 | 7 | use strict; 8 | use warnings; 9 | 10 | our %ExtModule = ( 11 | 12 | # Encode::Byte 13 | #iso-8859-1 is in Encode.pm itself 14 | 'iso-8859-2' => 'Encode::Byte', 15 | 'iso-8859-3' => 'Encode::Byte', 16 | 'iso-8859-4' => 'Encode::Byte', 17 | 'iso-8859-5' => 'Encode::Byte', 18 | 'iso-8859-6' => 'Encode::Byte', 19 | 'iso-8859-7' => 'Encode::Byte', 20 | 'iso-8859-8' => 'Encode::Byte', 21 | 'iso-8859-9' => 'Encode::Byte', 22 | 'iso-8859-10' => 'Encode::Byte', 23 | 'iso-8859-11' => 'Encode::Byte', 24 | 'iso-8859-13' => 'Encode::Byte', 25 | 'iso-8859-14' => 'Encode::Byte', 26 | 'iso-8859-15' => 'Encode::Byte', 27 | 'iso-8859-16' => 'Encode::Byte', 28 | 'koi8-f' => 'Encode::Byte', 29 | 'koi8-r' => 'Encode::Byte', 30 | 'koi8-u' => 'Encode::Byte', 31 | 'viscii' => 'Encode::Byte', 32 | 'cp424' => 'Encode::Byte', 33 | 'cp437' => 'Encode::Byte', 34 | 'cp737' => 'Encode::Byte', 35 | 'cp775' => 'Encode::Byte', 36 | 'cp850' => 'Encode::Byte', 37 | 'cp852' => 'Encode::Byte', 38 | 'cp855' => 'Encode::Byte', 39 | 'cp856' => 'Encode::Byte', 40 | 'cp857' => 'Encode::Byte', 41 | 'cp858' => 'Encode::Byte', 42 | 'cp860' => 'Encode::Byte', 43 | 'cp861' => 'Encode::Byte', 44 | 'cp862' => 'Encode::Byte', 45 | 'cp863' => 'Encode::Byte', 46 | 'cp864' => 'Encode::Byte', 47 | 'cp865' => 'Encode::Byte', 48 | 'cp866' => 'Encode::Byte', 49 | 'cp869' => 'Encode::Byte', 50 | 'cp874' => 'Encode::Byte', 51 | 'cp1006' => 'Encode::Byte', 52 | 'cp1250' => 'Encode::Byte', 53 | 'cp1251' => 'Encode::Byte', 54 | 'cp1252' => 'Encode::Byte', 55 | 'cp1253' => 'Encode::Byte', 56 | 'cp1254' => 'Encode::Byte', 57 | 'cp1255' => 'Encode::Byte', 58 | 'cp1256' => 'Encode::Byte', 59 | 'cp1257' => 'Encode::Byte', 60 | 'cp1258' => 'Encode::Byte', 61 | 'AdobeStandardEncoding' => 'Encode::Byte', 62 | 'MacArabic' => 'Encode::Byte', 63 | 'MacCentralEurRoman' => 'Encode::Byte', 64 | 'MacCroatian' => 'Encode::Byte', 65 | 'MacCyrillic' => 'Encode::Byte', 66 | 'MacFarsi' => 'Encode::Byte', 67 | 'MacGreek' => 'Encode::Byte', 68 | 'MacHebrew' => 'Encode::Byte', 69 | 'MacIcelandic' => 'Encode::Byte', 70 | 'MacRoman' => 'Encode::Byte', 71 | 'MacRomanian' => 'Encode::Byte', 72 | 'MacRumanian' => 'Encode::Byte', 73 | 'MacSami' => 'Encode::Byte', 74 | 'MacThai' => 'Encode::Byte', 75 | 'MacTurkish' => 'Encode::Byte', 76 | 'MacUkrainian' => 'Encode::Byte', 77 | 'nextstep' => 'Encode::Byte', 78 | 'hp-roman8' => 'Encode::Byte', 79 | #'gsm0338' => 'Encode::Byte', 80 | 'gsm0338' => 'Encode::GSM0338', 81 | 82 | # Encode::EBCDIC 83 | 'cp37' => 'Encode::EBCDIC', 84 | 'cp500' => 'Encode::EBCDIC', 85 | 'cp875' => 'Encode::EBCDIC', 86 | 'cp1026' => 'Encode::EBCDIC', 87 | 'cp1047' => 'Encode::EBCDIC', 88 | 'posix-bc' => 'Encode::EBCDIC', 89 | 90 | # Encode::Symbol 91 | 'dingbats' => 'Encode::Symbol', 92 | 'symbol' => 'Encode::Symbol', 93 | 'AdobeSymbol' => 'Encode::Symbol', 94 | 'AdobeZdingbat' => 'Encode::Symbol', 95 | 'MacDingbats' => 'Encode::Symbol', 96 | 'MacSymbol' => 'Encode::Symbol', 97 | 98 | # Encode::Unicode 99 | 'UCS-2BE' => 'Encode::Unicode', 100 | 'UCS-2LE' => 'Encode::Unicode', 101 | 'UTF-16' => 'Encode::Unicode', 102 | 'UTF-16BE' => 'Encode::Unicode', 103 | 'UTF-16LE' => 'Encode::Unicode', 104 | 'UTF-32' => 'Encode::Unicode', 105 | 'UTF-32BE' => 'Encode::Unicode', 106 | 'UTF-32LE' => 'Encode::Unicode', 107 | 'UTF-7' => 'Encode::Unicode::UTF7', 108 | ); 109 | 110 | unless ( ord("A") == 193 ) { 111 | %ExtModule = ( 112 | %ExtModule, 113 | 'euc-cn' => 'Encode::CN', 114 | 'gb12345-raw' => 'Encode::CN', 115 | 'gb2312-raw' => 'Encode::CN', 116 | 'hz' => 'Encode::CN', 117 | 'iso-ir-165' => 'Encode::CN', 118 | 'cp936' => 'Encode::CN', 119 | 'MacChineseSimp' => 'Encode::CN', 120 | 121 | '7bit-jis' => 'Encode::JP', 122 | 'euc-jp' => 'Encode::JP', 123 | 'iso-2022-jp' => 'Encode::JP', 124 | 'iso-2022-jp-1' => 'Encode::JP', 125 | 'jis0201-raw' => 'Encode::JP', 126 | 'jis0208-raw' => 'Encode::JP', 127 | 'jis0212-raw' => 'Encode::JP', 128 | 'cp932' => 'Encode::JP', 129 | 'MacJapanese' => 'Encode::JP', 130 | 'shiftjis' => 'Encode::JP', 131 | 132 | 'euc-kr' => 'Encode::KR', 133 | 'iso-2022-kr' => 'Encode::KR', 134 | 'johab' => 'Encode::KR', 135 | 'ksc5601-raw' => 'Encode::KR', 136 | 'cp949' => 'Encode::KR', 137 | 'MacKorean' => 'Encode::KR', 138 | 139 | 'big5-eten' => 'Encode::TW', 140 | 'big5-hkscs' => 'Encode::TW', 141 | 'cp950' => 'Encode::TW', 142 | 'MacChineseTrad' => 'Encode::TW', 143 | 144 | #'big5plus' => 'Encode::HanExtra', 145 | #'euc-tw' => 'Encode::HanExtra', 146 | #'gb18030' => 'Encode::HanExtra', 147 | 148 | 'MIME-Header' => 'Encode::MIME::Header', 149 | 'MIME-B' => 'Encode::MIME::Header', 150 | 'MIME-Q' => 'Encode::MIME::Header', 151 | 152 | 'MIME-Header-ISO_2022_JP' => 'Encode::MIME::Header::ISO_2022_JP', 153 | ); 154 | } 155 | 156 | # 157 | # Why not export ? to keep ConfigLocal Happy! 158 | # 159 | while ( my ( $enc, $mod ) = each %ExtModule ) { 160 | $Encode::ExtModule{$enc} = $mod; 161 | } 162 | 163 | 1; 164 | __END__ 165 | 166 | =head1 NAME 167 | 168 | Encode::Config -- internally used by Encode 169 | 170 | =cut 171 | -------------------------------------------------------------------------------- /lib/Encode/JP/H2Z.pm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dankogai/p5-encode/51e8cc56415253dfe27d69204b925b4df74b8a59/lib/Encode/JP/H2Z.pm -------------------------------------------------------------------------------- /lib/Encode/JP/JIS7.pm: -------------------------------------------------------------------------------- 1 | package Encode::JP::JIS7; 2 | use strict; 3 | use warnings; 4 | our $VERSION = do { my @r = ( q$Revision: 2.8 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; 5 | 6 | use Encode qw(:fallbacks); 7 | 8 | for my $name ( '7bit-jis', 'iso-2022-jp', 'iso-2022-jp-1' ) { 9 | my $h2z = ( $name eq '7bit-jis' ) ? 0 : 1; 10 | my $jis0212 = ( $name eq 'iso-2022-jp' ) ? 0 : 1; 11 | 12 | my $obj = bless { 13 | Name => $name, 14 | h2z => $h2z, 15 | jis0212 => $jis0212, 16 | } => __PACKAGE__; 17 | Encode::define_encoding($obj, $name); 18 | } 19 | 20 | use parent qw(Encode::Encoding); 21 | 22 | # we override this to 1 so PerlIO works 23 | sub needs_lines { 1 } 24 | 25 | use Encode::CJKConstants qw(:all); 26 | 27 | # 28 | # decode is identical for all 2022 variants 29 | # 30 | 31 | sub decode($$;$) { 32 | my ( $obj, $str, $chk ) = @_; 33 | return undef unless defined $str; 34 | my $residue = ''; 35 | if ($chk) { 36 | $str =~ s/([^\x00-\x7f].*)$//so and $residue = $1; 37 | } 38 | $residue .= jis_euc( \$str ); 39 | $_[1] = $residue if $chk; 40 | return Encode::decode( 'euc-jp', $str, FB_PERLQQ ); 41 | } 42 | 43 | # 44 | # encode is different 45 | # 46 | 47 | sub encode($$;$) { 48 | require Encode::JP::H2Z; 49 | my ( $obj, $utf8, $chk ) = @_; 50 | return undef unless defined $utf8; 51 | 52 | # empty the input string in the stack so perlio is ok 53 | $_[1] = '' if $chk; 54 | my ( $h2z, $jis0212 ) = @$obj{qw(h2z jis0212)}; 55 | my $octet = Encode::encode( 'euc-jp', $utf8, $chk || 0 ); 56 | $h2z and &Encode::JP::H2Z::h2z( \$octet ); 57 | euc_jis( \$octet, $jis0212 ); 58 | return $octet; 59 | } 60 | 61 | # 62 | # cat_decode 63 | # 64 | my $re_scan_jis_g = qr{ 65 | \G ( ($RE{JIS_0212}) | $RE{JIS_0208} | 66 | ($RE{ISO_ASC}) | ($RE{JIS_KANA}) | ) 67 | ([^\e]*) 68 | }x; 69 | 70 | sub cat_decode { # ($obj, $dst, $src, $pos, $trm, $chk) 71 | my ( $obj, undef, undef, $pos, $trm ) = @_; # currently ignores $chk 72 | my ( $rdst, $rsrc, $rpos ) = \@_[ 1, 2, 3 ]; 73 | local ${^ENCODING}; 74 | use bytes; 75 | my $opos = pos($$rsrc); 76 | pos($$rsrc) = $pos; 77 | while ( $$rsrc =~ /$re_scan_jis_g/gc ) { 78 | my ( $esc, $esc_0212, $esc_asc, $esc_kana, $chunk ) = 79 | ( $1, $2, $3, $4, $5 ); 80 | 81 | unless ($chunk) { $esc or last; next; } 82 | 83 | if ( $esc && !$esc_asc ) { 84 | $chunk =~ tr/\x21-\x7e/\xa1-\xfe/; 85 | if ($esc_kana) { 86 | $chunk =~ s/([\xa1-\xdf])/\x8e$1/og; 87 | } 88 | elsif ($esc_0212) { 89 | $chunk =~ s/([\xa1-\xfe][\xa1-\xfe])/\x8f$1/og; 90 | } 91 | $chunk = Encode::decode( 'euc-jp', $chunk, 0 ); 92 | } 93 | elsif ( ( my $npos = index( $chunk, $trm ) ) >= 0 ) { 94 | $$rdst .= substr( $chunk, 0, $npos + length($trm) ); 95 | $$rpos += length($esc) + $npos + length($trm); 96 | pos($$rsrc) = $opos; 97 | return 1; 98 | } 99 | $$rdst .= $chunk; 100 | $$rpos = pos($$rsrc); 101 | } 102 | $$rpos = pos($$rsrc); 103 | pos($$rsrc) = $opos; 104 | return ''; 105 | } 106 | 107 | # JIS<->EUC 108 | my $re_scan_jis = qr{ 109 | (?:($RE{JIS_0212})|$RE{JIS_0208}|($RE{ISO_ASC})|($RE{JIS_KANA}))([^\e]*) 110 | }x; 111 | 112 | sub jis_euc { 113 | local ${^ENCODING}; 114 | my $r_str = shift; 115 | $$r_str =~ s($re_scan_jis) 116 | { 117 | my ($esc_0212, $esc_asc, $esc_kana, $chunk) = 118 | ($1, $2, $3, $4); 119 | if (!$esc_asc) { 120 | $chunk =~ tr/\x21-\x7e/\xa1-\xfe/; 121 | if ($esc_kana) { 122 | $chunk =~ s/([\xa1-\xdf])/\x8e$1/og; 123 | } 124 | elsif ($esc_0212) { 125 | $chunk =~ s/([\xa1-\xfe][\xa1-\xfe])/\x8f$1/og; 126 | } 127 | } 128 | $chunk; 129 | }geox; 130 | my ($residue) = ( $$r_str =~ s/(\e.*)$//so ); 131 | return $residue; 132 | } 133 | 134 | sub euc_jis { 135 | no warnings qw(uninitialized); 136 | local ${^ENCODING}; 137 | my $r_str = shift; 138 | my $jis0212 = shift; 139 | $$r_str =~ s{ 140 | ((?:$RE{EUC_C})+|(?:$RE{EUC_KANA})+|(?:$RE{EUC_0212})+) 141 | }{ 142 | my $chunk = $1; 143 | my $esc = 144 | ( $chunk =~ tr/\x8E//d ) ? $ESC{KANA} : 145 | ( $chunk =~ tr/\x8F//d ) ? $ESC{JIS_0212} : 146 | $ESC{JIS_0208}; 147 | if ($esc eq $ESC{JIS_0212} && !$jis0212){ 148 | # fallback to '?' 149 | $chunk =~ tr/\xA1-\xFE/\x3F/; 150 | }else{ 151 | $chunk =~ tr/\xA1-\xFE/\x21-\x7E/; 152 | } 153 | $esc . $chunk . $ESC{ASC}; 154 | }geox; 155 | $$r_str =~ s/\Q$ESC{ASC}\E 156 | (\Q$ESC{KANA}\E|\Q$ESC{JIS_0212}\E|\Q$ESC{JIS_0208}\E)/$1/gox; 157 | $$r_str; 158 | } 159 | 160 | 1; 161 | __END__ 162 | 163 | 164 | =head1 NAME 165 | 166 | Encode::JP::JIS7 -- internally used by Encode::JP 167 | 168 | =cut 169 | -------------------------------------------------------------------------------- /lib/Encode/KR/2022_KR.pm: -------------------------------------------------------------------------------- 1 | package Encode::KR::2022_KR; 2 | use strict; 3 | use warnings; 4 | our $VERSION = do { my @r = ( q$Revision: 2.4 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; 5 | 6 | use Encode qw(:fallbacks); 7 | 8 | use parent qw(Encode::Encoding); 9 | __PACKAGE__->Define('iso-2022-kr'); 10 | 11 | sub needs_lines { 1 } 12 | 13 | sub perlio_ok { 14 | return 0; # for the time being 15 | } 16 | 17 | sub decode { 18 | my ( $obj, $str, $chk ) = @_; 19 | return undef unless defined $str; 20 | my $res = $str; 21 | my $residue = iso_euc( \$res ); 22 | 23 | # This is for PerlIO 24 | $_[1] = $residue if $chk; 25 | return Encode::decode( 'euc-kr', $res, FB_PERLQQ ); 26 | } 27 | 28 | sub encode { 29 | my ( $obj, $utf8, $chk ) = @_; 30 | return undef unless defined $utf8; 31 | 32 | # empty the input string in the stack so perlio is ok 33 | $_[1] = '' if $chk; 34 | my $octet = Encode::encode( 'euc-kr', $utf8, FB_PERLQQ ); 35 | euc_iso( \$octet ); 36 | return $octet; 37 | } 38 | 39 | use Encode::CJKConstants qw(:all); 40 | 41 | # ISO<->EUC 42 | 43 | sub iso_euc { 44 | my $r_str = shift; 45 | $$r_str =~ s/$RE{'2022_KR'}//gox; # remove the designator 46 | $$r_str =~ s{ # replace characters in GL 47 | \x0e # between SO(\x0e) and SI(\x0f) 48 | ([^\x0f]*) # with characters in GR 49 | \x0f 50 | } 51 | { 52 | my $out= $1; 53 | $out =~ tr/\x21-\x7e/\xa1-\xfe/; 54 | $out; 55 | }geox; 56 | my ($residue) = ( $$r_str =~ s/(\e.*)$//so ); 57 | return $residue; 58 | } 59 | 60 | sub euc_iso { 61 | no warnings qw(uninitialized); 62 | my $r_str = shift; 63 | substr( $$r_str, 0, 0 ) = 64 | $ESC{'2022_KR'}; # put the designator at the beg. 65 | $$r_str =~ 66 | s{ # move KS X 1001 characters in GR to GL 67 | ($RE{EUC_C}+) # and enclose them with SO and SI 68 | }{ 69 | my $str = $1; 70 | $str =~ tr/\xA1-\xFE/\x21-\x7E/; 71 | "\x0e" . $str . "\x0f"; 72 | }geox; 73 | $$r_str; 74 | } 75 | 76 | 1; 77 | __END__ 78 | 79 | =head1 NAME 80 | 81 | Encode::KR::2022_KR -- internally used by Encode::KR 82 | 83 | =cut 84 | -------------------------------------------------------------------------------- /lib/Encode/MIME/Header/ISO_2022_JP.pm: -------------------------------------------------------------------------------- 1 | package Encode::MIME::Header::ISO_2022_JP; 2 | 3 | use strict; 4 | use warnings; 5 | 6 | use parent qw(Encode::MIME::Header); 7 | 8 | my $obj = 9 | bless { decode_b => '1', decode_q => '1', encode => 'B', bpl => 76, Name => 'MIME-Header-ISO_2022_JP' } => 10 | __PACKAGE__; 11 | Encode::define_encoding($obj, 'MIME-Header-ISO_2022_JP'); 12 | 13 | use constant HEAD => '=?ISO-2022-JP?B?'; 14 | use constant TAIL => '?='; 15 | 16 | use Encode::CJKConstants qw(%RE); 17 | 18 | our $VERSION = do { my @r = ( q$Revision: 1.9 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; 19 | 20 | # I owe the below codes totally to 21 | # Jcode by Dan Kogai & http://www.din.or.jp/~ohzaki/perl.htm#JP_Base64 22 | 23 | sub encode { 24 | my $self = shift; 25 | my $str = shift; 26 | return undef unless defined $str; 27 | 28 | utf8::encode($str) if ( Encode::is_utf8($str) ); 29 | Encode::from_to( $str, 'utf8', 'euc-jp' ); 30 | 31 | my ($trailing_crlf) = ( $str =~ /(\n|\r|\x0d\x0a)$/o ); 32 | 33 | $str = _mime_unstructured_header( $str, $self->{bpl} ); 34 | 35 | not $trailing_crlf and $str =~ s/(\n|\r|\x0d\x0a)$//o; 36 | 37 | return $str; 38 | } 39 | 40 | sub _mime_unstructured_header { 41 | my ( $oldheader, $bpl ) = @_; 42 | my $crlf = $oldheader =~ /\n$/; 43 | my ( $header, @words, @wordstmp, $i ) = (''); 44 | 45 | $oldheader =~ s/\s+$//; 46 | 47 | @wordstmp = split /\s+/, $oldheader; 48 | 49 | for ( $i = 0 ; $i < $#wordstmp ; $i++ ) { 50 | if ( $wordstmp[$i] !~ /^[\x21-\x7E]+$/ 51 | and $wordstmp[ $i + 1 ] !~ /^[\x21-\x7E]+$/ ) 52 | { 53 | $wordstmp[ $i + 1 ] = "$wordstmp[$i] $wordstmp[$i + 1]"; 54 | } 55 | else { 56 | push( @words, $wordstmp[$i] ); 57 | } 58 | } 59 | 60 | push( @words, $wordstmp[-1] ); 61 | 62 | for my $word (@words) { 63 | if ( $word =~ /^[\x21-\x7E]+$/ ) { 64 | $header =~ /(?:.*\n)*(.*)/; 65 | if ( length($1) + length($word) > $bpl ) { 66 | $header .= "\n $word"; 67 | } 68 | else { 69 | $header .= $word; 70 | } 71 | } 72 | else { 73 | $header = _add_encoded_word( $word, $header, $bpl ); 74 | } 75 | 76 | $header =~ /(?:.*\n)*(.*)/; 77 | 78 | if ( length($1) == $bpl ) { 79 | $header .= "\n "; 80 | } 81 | else { 82 | $header .= ' '; 83 | } 84 | } 85 | 86 | $header =~ s/\n? $//mg; 87 | 88 | $crlf ? "$header\n" : $header; 89 | } 90 | 91 | sub _add_encoded_word { 92 | my ( $str, $line, $bpl ) = @_; 93 | my $result = ''; 94 | 95 | while ( length($str) ) { 96 | my $target = $str; 97 | $str = ''; 98 | 99 | if ( 100 | length($line) + 22 + 101 | ( $target =~ /^(?:$RE{EUC_0212}|$RE{EUC_C})/o ) * 8 > $bpl ) 102 | { 103 | $line =~ s/[ \t\n\r]*$/\n/; 104 | $result .= $line; 105 | $line = ' '; 106 | } 107 | 108 | while (1) { 109 | my $iso_2022_jp = $target; 110 | Encode::from_to( $iso_2022_jp, 'euc-jp', 'iso-2022-jp' ); 111 | 112 | my $encoded = 113 | HEAD . MIME::Base64::encode_base64( $iso_2022_jp, '' ) . TAIL; 114 | 115 | if ( length($encoded) + length($line) > $bpl ) { 116 | $target =~ 117 | s/($RE{EUC_0212}|$RE{EUC_KANA}|$RE{EUC_C}|$RE{ASCII})$//o; 118 | $str = $1 . $str; 119 | } 120 | else { 121 | $line .= $encoded; 122 | last; 123 | } 124 | } 125 | 126 | } 127 | 128 | $result . $line; 129 | } 130 | 131 | 1; 132 | __END__ 133 | 134 | -------------------------------------------------------------------------------- /lib/Encode/MIME/Name.pm: -------------------------------------------------------------------------------- 1 | package Encode::MIME::Name; 2 | use strict; 3 | use warnings; 4 | our $VERSION = do { my @r = ( q$Revision: 1.3 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; 5 | 6 | # NOTE: This table must be 1:1 mapping 7 | our %MIME_NAME_OF = ( 8 | 'AdobeStandardEncoding' => 'Adobe-Standard-Encoding', 9 | 'AdobeSymbol' => 'Adobe-Symbol-Encoding', 10 | 'ascii' => 'US-ASCII', 11 | 'big5-hkscs' => 'Big5-HKSCS', 12 | 'cp1026' => 'IBM1026', 13 | 'cp1047' => 'IBM1047', 14 | 'cp1250' => 'windows-1250', 15 | 'cp1251' => 'windows-1251', 16 | 'cp1252' => 'windows-1252', 17 | 'cp1253' => 'windows-1253', 18 | 'cp1254' => 'windows-1254', 19 | 'cp1255' => 'windows-1255', 20 | 'cp1256' => 'windows-1256', 21 | 'cp1257' => 'windows-1257', 22 | 'cp1258' => 'windows-1258', 23 | 'cp37' => 'IBM037', 24 | 'cp424' => 'IBM424', 25 | 'cp437' => 'IBM437', 26 | 'cp500' => 'IBM500', 27 | 'cp775' => 'IBM775', 28 | 'cp850' => 'IBM850', 29 | 'cp852' => 'IBM852', 30 | 'cp855' => 'IBM855', 31 | 'cp857' => 'IBM857', 32 | 'cp860' => 'IBM860', 33 | 'cp861' => 'IBM861', 34 | 'cp862' => 'IBM862', 35 | 'cp863' => 'IBM863', 36 | 'cp864' => 'IBM864', 37 | 'cp865' => 'IBM865', 38 | 'cp866' => 'IBM866', 39 | 'cp869' => 'IBM869', 40 | 'cp936' => 'GBK', 41 | 'euc-cn' => 'EUC-CN', 42 | 'euc-jp' => 'EUC-JP', 43 | 'euc-kr' => 'EUC-KR', 44 | #'gb2312-raw' => 'GB2312', # no, you're wrong, I18N::Charset 45 | 'hp-roman8' => 'hp-roman8', 46 | 'hz' => 'HZ-GB-2312', 47 | 'iso-2022-jp' => 'ISO-2022-JP', 48 | 'iso-2022-jp-1' => 'ISO-2022-JP-1', 49 | 'iso-2022-kr' => 'ISO-2022-KR', 50 | 'iso-8859-1' => 'ISO-8859-1', 51 | 'iso-8859-10' => 'ISO-8859-10', 52 | 'iso-8859-13' => 'ISO-8859-13', 53 | 'iso-8859-14' => 'ISO-8859-14', 54 | 'iso-8859-15' => 'ISO-8859-15', 55 | 'iso-8859-16' => 'ISO-8859-16', 56 | 'iso-8859-2' => 'ISO-8859-2', 57 | 'iso-8859-3' => 'ISO-8859-3', 58 | 'iso-8859-4' => 'ISO-8859-4', 59 | 'iso-8859-5' => 'ISO-8859-5', 60 | 'iso-8859-6' => 'ISO-8859-6', 61 | 'iso-8859-7' => 'ISO-8859-7', 62 | 'iso-8859-8' => 'ISO-8859-8', 63 | 'iso-8859-9' => 'ISO-8859-9', 64 | #'jis0201-raw' => 'JIS_X0201', 65 | #'jis0208-raw' => 'JIS_C6226-1983', 66 | #'jis0212-raw' => 'JIS_X0212-1990', 67 | 'koi8-r' => 'KOI8-R', 68 | 'koi8-u' => 'KOI8-U', 69 | #'ksc5601-raw' => 'KS_C_5601-1987', 70 | 'shiftjis' => 'Shift_JIS', 71 | 'UTF-16' => 'UTF-16', 72 | 'UTF-16BE' => 'UTF-16BE', 73 | 'UTF-16LE' => 'UTF-16LE', 74 | 'UTF-32' => 'UTF-32', 75 | 'UTF-32BE' => 'UTF-32BE', 76 | 'UTF-32LE' => 'UTF-32LE', 77 | 'UTF-7' => 'UTF-7', 78 | 'utf-8-strict' => 'UTF-8', 79 | 'viscii' => 'VISCII', 80 | ); 81 | 82 | # NOTE: %MIME_NAME_OF is still 1:1 mapping 83 | our %ENCODE_NAME_OF = map { uc $MIME_NAME_OF{$_} => $_ } keys %MIME_NAME_OF; 84 | 85 | # Add additional 1:N mapping 86 | $MIME_NAME_OF{'utf8'} = 'UTF-8'; 87 | 88 | sub get_mime_name($) { $MIME_NAME_OF{$_[0]} }; 89 | 90 | sub get_encode_name($) { $ENCODE_NAME_OF{uc $_[0]} }; 91 | 92 | 1; 93 | __END__ 94 | 95 | =head1 NAME 96 | 97 | Encode::MIME::NAME -- internally used by Encode 98 | 99 | =head1 SEE ALSO 100 | 101 | L 102 | 103 | =cut 104 | -------------------------------------------------------------------------------- /lib/Encode/PerlIO.pod: -------------------------------------------------------------------------------- 1 | =head1 NAME 2 | 3 | Encode::PerlIO -- a detailed document on Encode and PerlIO 4 | 5 | =head1 Overview 6 | 7 | It is very common to want to do encoding transformations when 8 | reading or writing files, network connections, pipes etc. 9 | If Perl is configured to use the new 'perlio' IO system then 10 | C provides a "layer" (see L) which can transform 11 | data as it is read or written. 12 | 13 | Here is how the blind poet would modernise the encoding: 14 | 15 | use Encode; 16 | open(my $iliad,'<:encoding(iso-8859-7)','iliad.greek'); 17 | open(my $utf8,'>:utf8','iliad.utf8'); 18 | my @epic = <$iliad>; 19 | print $utf8 @epic; 20 | close($utf8); 21 | close($illiad); 22 | 23 | In addition, the new IO system can also be configured to read/write 24 | UTF-8 encoded characters (as noted above, this is efficient): 25 | 26 | open(my $fh,'>:utf8','anything'); 27 | print $fh "Any \x{0021} string \N{SMILEY FACE}\n"; 28 | 29 | Either of the above forms of "layer" specifications can be made the default 30 | for a lexical scope with the C pragma. See L. 31 | 32 | Once a handle is open, its layers can be altered using C. 33 | 34 | Without any such configuration, or if Perl itself is built using the 35 | system's own IO, then write operations assume that the file handle 36 | accepts only I and will C if a character larger than 255 is 37 | written to the handle. When reading, each octet from the handle becomes 38 | a byte-in-a-character. Note that this default is the same behaviour 39 | as bytes-only languages (including Perl before v5.6) would have, 40 | and is sufficient to handle native 8-bit encodings e.g. iso-8859-1, 41 | EBCDIC etc. and any legacy mechanisms for handling other encodings 42 | and binary data. 43 | 44 | In other cases, it is the program's responsibility to transform 45 | characters into bytes using the API above before doing writes, and to 46 | transform the bytes read from a handle into characters before doing 47 | "character operations" (e.g. C, C, ...). 48 | 49 | You can also use PerlIO to convert larger amounts of data you don't 50 | want to bring into memory. For example, to convert between ISO-8859-1 51 | (Latin 1) and UTF-8 (or UTF-EBCDIC in EBCDIC machines): 52 | 53 | open(F, "<:encoding(iso-8859-1)", "data.txt") or die $!; 54 | open(G, ">:utf8", "data.utf") or die $!; 55 | while () { print G } 56 | 57 | # Could also do "print G " but that would pull 58 | # the whole file into memory just to write it out again. 59 | 60 | More examples: 61 | 62 | open(my $f, "<:encoding(cp1252)") 63 | open(my $g, ">:encoding(iso-8859-2)") 64 | open(my $h, ">:encoding(latin9)") # iso-8859-15 65 | 66 | See also L for how to change the default encoding of the 67 | data in your script. 68 | 69 | =head1 How does it work? 70 | 71 | Here is a crude diagram of how filehandle, PerlIO, and Encode 72 | interact. 73 | 74 | filehandle <-> PerlIO PerlIO <-> scalar (read/printed) 75 | \ / 76 | Encode 77 | 78 | When PerlIO receives data from either direction, it fills a buffer 79 | (currently with 1024 bytes) and passes the buffer to Encode. 80 | Encode tries to convert the valid part and passes it back to PerlIO, 81 | leaving invalid parts (usually a partial character) in the buffer. 82 | PerlIO then appends more data to the buffer, calls Encode again, 83 | and so on until the data stream ends. 84 | 85 | To do so, PerlIO always calls (de|en)code methods with CHECK set to 1. 86 | This ensures that the method stops at the right place when it 87 | encounters partial character. The following is what happens when 88 | PerlIO and Encode tries to encode (from utf8) more than 1024 bytes 89 | and the buffer boundary happens to be in the middle of a character. 90 | 91 | A B C .... ~ \x{3000} .... 92 | 41 42 43 .... 7E e3 80 80 .... 93 | <- buffer ---------------> 94 | << encoded >>>>>>>>>> 95 | <- next buffer ------ 96 | 97 | Encode converts from the beginning to \x7E, leaving \xe3 in the buffer 98 | because it is invalid (partial character). 99 | 100 | Unfortunately, this scheme does not work well with escape-based 101 | encodings such as ISO-2022-JP. 102 | 103 | =head1 Line Buffering 104 | 105 | Now let's see what happens when you try to decode from ISO-2022-JP and 106 | the buffer ends in the middle of a character. 107 | 108 | JIS208-ESC \x{5f3e} 109 | A B C .... ~ \e $ B |DAN | .... 110 | 41 42 43 .... 7E 1b 24 41 43 46 .... 111 | <- buffer ---------------------------> 112 | << encoded >>>>>>>>>>>>>>>>>>>>>>> 113 | 114 | As you see, the next buffer begins with \x43. But \x43 is 'C' in 115 | ASCII, which is wrong in this case because we are now in JISX 0208 116 | area so it has to convert \x43\x46, not \x43. Unlike utf8 and EUC, 117 | in escape-based encodings you can't tell if a given octet is a whole 118 | character or just part of it. 119 | 120 | Fortunately PerlIO also supports line buffer if you tell PerlIO to use 121 | one instead of fixed buffer. Since ISO-2022-JP is guaranteed to revert to ASCII at the end of the line, partial 122 | character will never happen when line buffer is used. 123 | 124 | To tell PerlIO to use line buffer, implement -Eneeds_lines method 125 | for your encoding object. See L for details. 126 | 127 | Thanks to these efforts most encodings that come with Encode support 128 | PerlIO but that still leaves following encodings. 129 | 130 | iso-2022-kr 131 | MIME-B 132 | MIME-Header 133 | MIME-Q 134 | 135 | Fortunately iso-2022-kr is hardly used (according to Jungshik) and 136 | MIME-* are very unlikely to be fed to PerlIO because they are for mail 137 | headers. See L for details. 138 | 139 | =head2 How can I tell whether my encoding fully supports PerlIO ? 140 | 141 | As of this writing, any encoding whose class belongs to Encode::XS and 142 | Encode::Unicode works. The Encode module has a C method 143 | which you can use before applying PerlIO encoding to the filehandle. 144 | Here is an example: 145 | 146 | my $use_perlio = perlio_ok($enc); 147 | my $layer = $use_perlio ? "<:raw" : "<:encoding($enc)"; 148 | open my $fh, $layer, $file or die "$file : $!"; 149 | while(<$fh>){ 150 | $_ = decode($enc, $_) unless $use_perlio; 151 | # .... 152 | } 153 | 154 | =head1 SEE ALSO 155 | 156 | L, 157 | L, 158 | L, 159 | L, 160 | L, 161 | L, 162 | L, 163 | L, 164 | the Perl Unicode Mailing List Eperl-unicode@perl.orgE 165 | 166 | =cut 167 | 168 | -------------------------------------------------------------------------------- /lib/Encode/Unicode/UTF7.pm: -------------------------------------------------------------------------------- 1 | # 2 | # $Id: UTF7.pm,v 2.10 2017/06/10 17:23:50 dankogai Exp $ 3 | # 4 | package Encode::Unicode::UTF7; 5 | use strict; 6 | use warnings; 7 | use parent qw(Encode::Encoding); 8 | __PACKAGE__->Define('UTF-7'); 9 | our $VERSION = do { my @r = ( q$Revision: 2.10 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; 10 | use MIME::Base64; 11 | use Encode qw(find_encoding); 12 | 13 | # 14 | # Algorithms taken from Unicode::String by Gisle Aas 15 | # 16 | 17 | our $OPTIONAL_DIRECT_CHARS = 1; 18 | my $specials = quotemeta "\'(),-./:?"; 19 | $OPTIONAL_DIRECT_CHARS 20 | and $specials .= quotemeta "!\"#$%&*;<=>@[]^_`{|}"; 21 | 22 | # \s will not work because it matches U+3000 DEOGRAPHIC SPACE 23 | # We use qr/[\n\r\t\ ] instead 24 | my $re_asis = qr/(?:[\n\r\t\ A-Za-z0-9$specials])/; 25 | my $re_encoded = qr/(?:[^\n\r\t\ A-Za-z0-9$specials])/; 26 | my $e_utf16 = find_encoding("UTF-16BE"); 27 | 28 | sub needs_lines { 1 } 29 | 30 | sub encode($$;$) { 31 | my ( $obj, $str, $chk ) = @_; 32 | return undef unless defined $str; 33 | my $len = length($str); 34 | pos($str) = 0; 35 | my $bytes = substr($str, 0, 0); # to propagate taintedness 36 | while ( pos($str) < $len ) { 37 | if ( $str =~ /\G($re_asis+)/ogc ) { 38 | my $octets = $1; 39 | utf8::downgrade($octets); 40 | $bytes .= $octets; 41 | } 42 | elsif ( $str =~ /\G($re_encoded+)/ogsc ) { 43 | if ( $1 eq "+" ) { 44 | $bytes .= "+-"; 45 | } 46 | else { 47 | my $s = $1; 48 | my $base64 = encode_base64( $e_utf16->encode($s), '' ); 49 | $base64 =~ s/=+$//; 50 | $bytes .= "+$base64-"; 51 | } 52 | } 53 | else { 54 | die "This should not happen! (pos=" . pos($str) . ")"; 55 | } 56 | } 57 | $_[1] = '' if $chk; 58 | return $bytes; 59 | } 60 | 61 | sub decode($$;$) { 62 | use re 'taint'; 63 | my ( $obj, $bytes, $chk ) = @_; 64 | return undef unless defined $bytes; 65 | my $len = length($bytes); 66 | my $str = substr($bytes, 0, 0); # to propagate taintedness; 67 | pos($bytes) = 0; 68 | no warnings 'uninitialized'; 69 | while ( pos($bytes) < $len ) { 70 | if ( $bytes =~ /\G([^+]+)/ogc ) { 71 | $str .= $1; 72 | } 73 | elsif ( $bytes =~ /\G\+-/ogc ) { 74 | $str .= "+"; 75 | } 76 | elsif ( $bytes =~ /\G\+([A-Za-z0-9+\/]+)-?/ogsc ) { 77 | my $base64 = $1; 78 | my $pad = length($base64) % 4; 79 | $base64 .= "=" x ( 4 - $pad ) if $pad; 80 | $str .= $e_utf16->decode( decode_base64($base64) ); 81 | } 82 | elsif ( $bytes =~ /\G\+/ogc ) { 83 | $^W and warn "Bad UTF7 data escape"; 84 | $str .= "+"; 85 | } 86 | else { 87 | die "This should not happen " . pos($bytes); 88 | } 89 | } 90 | $_[1] = '' if $chk; 91 | return $str; 92 | } 93 | 1; 94 | __END__ 95 | 96 | =head1 NAME 97 | 98 | Encode::Unicode::UTF7 -- UTF-7 encoding 99 | 100 | =head1 SYNOPSIS 101 | 102 | use Encode qw/encode decode/; 103 | $utf7 = encode("UTF-7", $utf8); 104 | $utf8 = decode("UTF-7", $ucs2); 105 | 106 | =head1 ABSTRACT 107 | 108 | This module implements UTF-7 encoding documented in RFC 2152. UTF-7, 109 | as its name suggests, is a 7-bit re-encoded version of UTF-16BE. It 110 | is designed to be MTA-safe and expected to be a standard way to 111 | exchange Unicoded mails via mails. But with the advent of UTF-8 and 112 | 8-bit compliant MTAs, UTF-7 is hardly ever used. 113 | 114 | UTF-7 was not supported by Encode until version 1.95 because of that. 115 | But Unicode::String, a module by Gisle Aas which adds Unicode supports 116 | to non-utf8-savvy perl did support UTF-7, the UTF-7 support was added 117 | so Encode can supersede Unicode::String 100%. 118 | 119 | =head1 In Practice 120 | 121 | When you want to encode Unicode for mails and web pages, however, do 122 | not use UTF-7 unless you are sure your recipients and readers can 123 | handle it. Very few MUAs and WWW Browsers support these days (only 124 | Mozilla seems to support one). For general cases, use UTF-8 for 125 | message body and MIME-Header for header instead. 126 | 127 | =head1 SEE ALSO 128 | 129 | L, L, L 130 | 131 | RFC 2781 L 132 | 133 | =cut 134 | -------------------------------------------------------------------------------- /t/Aliases.t: -------------------------------------------------------------------------------- 1 | #!../perl 2 | 3 | BEGIN { 4 | if ($ENV{'PERL_CORE'}){ 5 | chdir 't'; 6 | unshift @INC, '../lib'; 7 | } 8 | require Config; Config->import(); 9 | if ($Config{'extensions'} !~ /\bEncode\b/) { 10 | print "1..0 # Skip: Encode was not built\n"; 11 | exit 0; 12 | } 13 | } 14 | 15 | use strict; 16 | use Encode; 17 | use Encode::Alias; 18 | my %a2c; 19 | my @override_tests; 20 | my $ON_EBCDIC; 21 | 22 | sub init_a2c{ 23 | %a2c = ( 24 | 'US-ascii' => 'ascii', 25 | 'ISO-646-US' => 'ascii', 26 | 'UTF-8' => 'utf-8-strict', 27 | 'en_US.UTF-8' => 'utf-8-strict', 28 | 'UCS-2' => 'UCS-2BE', 29 | 'UCS2' => 'UCS-2BE', 30 | 'iso-10646-1' => 'UCS-2BE', 31 | 'ucs2-le' => 'UCS-2LE', 32 | 'ucs2-be' => 'UCS-2BE', 33 | 'utf16' => 'UTF-16', 34 | 'utf32' => 'UTF-32', 35 | 'utf16-be' => 'UTF-16BE', 36 | 'utf32-be' => 'UTF-32BE', 37 | 'utf16-le' => 'UTF-16LE', 38 | 'utf32-le' => 'UTF-32LE', 39 | 'UCS4-BE' => 'UTF-32BE', 40 | 'UCS-4-LE' => 'UTF-32LE', 41 | 'cyrillic' => 'iso-8859-5', 42 | 'arabic' => 'iso-8859-6', 43 | 'greek' => 'iso-8859-7', 44 | 'hebrew' => 'iso-8859-8', 45 | 'iso-8859-8-I' => 'iso-8859-8', 46 | 'thai' => 'iso-8859-11', 47 | 'tis620' => 'iso-8859-11', 48 | 'tis-620' => 'iso-8859-11', 49 | 'WinLatin1' => 'cp1252', 50 | 'WinLatin2' => 'cp1250', 51 | 'WinCyrillic' => 'cp1251', 52 | 'WinGreek' => 'cp1253', 53 | 'WinTurkish' => 'cp1254', 54 | 'WinHebrew' => 'cp1255', 55 | 'WinArabic' => 'cp1256', 56 | 'WinBaltic' => 'cp1257', 57 | 'WinVietnamese' => 'cp1258', 58 | 'Macintosh' => 'MacRoman', 59 | 'koi8r' => 'koi8-r', 60 | 'koi8u' => 'koi8-u', 61 | 'ja_JP.euc' => $ON_EBCDIC ? '' : 'euc-jp', 62 | 'x-euc-jp' => $ON_EBCDIC ? '' : 'euc-jp', 63 | 'zh_CN.euc' => $ON_EBCDIC ? '' : 'euc-cn', 64 | 'x-euc-cn' => $ON_EBCDIC ? '' : 'euc-cn', 65 | 'ko_KR.euc' => $ON_EBCDIC ? '' : 'euc-kr', 66 | 'x-euc-kr' => $ON_EBCDIC ? '' : 'euc-kr', 67 | 'ujis' => $ON_EBCDIC ? '' : 'euc-jp', 68 | 'Shift_JIS' => $ON_EBCDIC ? '' : 'shiftjis', 69 | 'x-sjis' => $ON_EBCDIC ? '' : 'shiftjis', 70 | 'jis' => $ON_EBCDIC ? '' : '7bit-jis', 71 | 'big-5' => $ON_EBCDIC ? '' : 'big5-eten', 72 | 'zh_TW.Big5' => $ON_EBCDIC ? '' : 'big5-eten', 73 | 'tca-big5' => $ON_EBCDIC ? '' : 'big5-eten', 74 | 'big5-hk' => $ON_EBCDIC ? '' : 'big5-hkscs', 75 | 'hkscs-big5' => $ON_EBCDIC ? '' : 'big5-hkscs', 76 | 'GB_2312-80' => $ON_EBCDIC ? '' : 'euc-cn', 77 | 'KS_C_5601-1987' => $ON_EBCDIC ? '' : 'cp949', 78 | # 79 | 'gb12345-raw' => $ON_EBCDIC ? '' : 'gb12345-raw', 80 | 'gb2312-raw' => $ON_EBCDIC ? '' : 'gb2312-raw', 81 | 'jis0201-raw' => $ON_EBCDIC ? '' : 'jis0201-raw', 82 | 'jis0208-raw' => $ON_EBCDIC ? '' : 'jis0208-raw', 83 | 'jis0212-raw' => $ON_EBCDIC ? '' : 'jis0212-raw', 84 | 'ksc5601-raw' => $ON_EBCDIC ? '' : 'ksc5601-raw', 85 | 'cp65000' => 'UTF-7', 86 | 'cp65001' => 'utf-8-strict', 87 | ); 88 | 89 | for my $i (1..11,13..16){ 90 | $a2c{"ISO 8859 $i"} = "iso-8859-$i"; 91 | } 92 | for my $i (1..10){ 93 | $a2c{"ISO Latin $i"} = "iso-8859-$Encode::Alias::Latin2iso[$i]"; 94 | } 95 | for my $k (keys %Encode::Alias::Winlatin2cp){ 96 | my $v = $Encode::Alias::Winlatin2cp{$k}; 97 | $a2c{"Win" . ucfirst($k)} = "cp" . $v; 98 | $a2c{"IBM-$v"} = $a2c{"MS-$v"} = "cp" . $v; 99 | $a2c{"cp-" . $v} = "cp" . $v; 100 | } 101 | my @a2c = keys %a2c; 102 | for my $k (@a2c){ 103 | $a2c{uc($k)} = $a2c{$k}; 104 | $a2c{lc($k)} = $a2c{$k}; 105 | $a2c{lcfirst($k)} = $a2c{$k}; 106 | $a2c{ucfirst($k)} = $a2c{$k}; 107 | } 108 | } 109 | 110 | BEGIN{ 111 | $ON_EBCDIC = ord("A") == 193; 112 | @ARGV and $ON_EBCDIC = $ARGV[0] eq 'EBCDIC'; 113 | $Encode::ON_EBCDIC = $ON_EBCDIC; 114 | init_a2c(); 115 | @override_tests = qw( 116 | myascii:cp1252 117 | mygreek:cp1253 118 | myhebrew:iso-8859-2 119 | myarabic:cp1256 120 | ueightsomething:utf-8-strict 121 | unknown: 122 | ); 123 | } 124 | 125 | if ($ON_EBCDIC){ 126 | delete @Encode::ExtModule{ 127 | qw(euc-cn gb2312 gb12345 gbk cp936 iso-ir-165 MacChineseSimp 128 | euc-jp iso-2022-jp 7bit-jis shiftjis MacJapanese cp932 129 | euc-kr ksc5601 cp949 MacKorean 130 | big5 big5-hkscs cp950 MacChineseTrad 131 | gb18030 big5plus euc-tw) 132 | }; 133 | } 134 | 135 | use Test::More tests => (scalar keys %a2c) * 3 + @override_tests; 136 | 137 | print "# alias test; \$ON_EBCDIC == $ON_EBCDIC\n"; 138 | 139 | foreach my $a (keys %a2c){ 140 | print "# $a => $a2c{$a}\n"; 141 | my $e = Encode::find_encoding($a); 142 | is((defined($e) and $e->name), $a2c{$a},$a) 143 | or warn "alias was $a";; 144 | } 145 | 146 | # now we override some of the aliases and see if it works fine 147 | 148 | define_alias( 149 | qr/ascii/i => '"WinLatin1"', 150 | qr/cyrillic/i => '"WinCyrillic"', 151 | qr/arabic/i => '"WinArabic"', 152 | qr/greek/i => '"WinGreek"', 153 | qr/hebrew/i => '"WinHebrew"' 154 | ); 155 | 156 | Encode::find_encoding("myhebrew"); # polute alias cache 157 | 158 | define_alias( sub { 159 | my $enc = shift; 160 | return "iso-8859-2" if $enc =~ /hebrew/i; 161 | return "does-not-exist" if $enc =~ /arabic/i; # should then use other override alias 162 | return "utf-8" if $enc =~ /eight/i; 163 | return "unknown"; 164 | }); 165 | 166 | print "# alias test with alias overrides\n"; 167 | 168 | for my $test (@override_tests) { 169 | my($a, $c) = split /:/, $test; 170 | my $e = Encode::find_encoding($a); 171 | is((defined($e) and $e->name), $c, $a); 172 | } 173 | 174 | print "# alias undef test\n"; 175 | 176 | Encode::Alias->undef_aliases; 177 | foreach my $a (keys %a2c){ 178 | my $e = Encode::find_encoding($a); 179 | ok(!defined($e) || $e->name =~ /-raw$/o,"Undef $a") 180 | or warn "alias was $a"; 181 | } 182 | 183 | print "# alias reinit test\n"; 184 | 185 | Encode::Alias->init_aliases; 186 | init_a2c(); 187 | foreach my $a (keys %a2c){ 188 | my $e = Encode::find_encoding($a); 189 | is((defined($e) and $e->name), $a2c{$a}, "Reinit $a") 190 | or warn "alias was $a"; 191 | } 192 | __END__ 193 | for my $k (keys %a2c){ 194 | $k =~ /[A-Z]/ and next; 195 | print "$k => $a2c{$k}\n"; 196 | } 197 | 198 | 199 | 200 | -------------------------------------------------------------------------------- /t/CJKT.t: -------------------------------------------------------------------------------- 1 | BEGIN { 2 | require Config; Config->import(); 3 | if ($Config{'extensions'} !~ /\bEncode\b/) { 4 | print "1..0 # Skip: Encode was not built\n"; 5 | exit 0; 6 | } 7 | if (ord("A") == 193) { 8 | print "1..0 # Skip: EBCDIC\n"; 9 | exit 0; 10 | } 11 | # should work w/o PerlIO now! 12 | # unless (PerlIO::Layer->find('perlio')){ 13 | # print "1..0 # Skip: PerlIO required\n"; 14 | # exit 0; 15 | # } 16 | $| = 1; 17 | } 18 | use strict; 19 | use Test::More tests => 60; 20 | use Encode; 21 | use File::Basename; 22 | use File::Spec; 23 | use File::Compare qw(compare_text); 24 | our $DEBUG = shift || 0; 25 | 26 | my %Charset = 27 | ( 28 | 'big5-eten' => [qw(big5-eten)], 29 | 'big5-hkscs' => [qw(big5-hkscs)], 30 | gb2312 => [qw(euc-cn hz)], 31 | jisx0201 => [qw(euc-jp shiftjis 7bit-jis)], 32 | jisx0208 => [qw(euc-jp shiftjis 7bit-jis iso-2022-jp iso-2022-jp-1)], 33 | jisx0212 => [qw(euc-jp 7bit-jis iso-2022-jp-1)], 34 | ksc5601 => [qw(euc-kr iso-2022-kr johab)], 35 | ); 36 | 37 | 38 | my $dir = dirname(__FILE__); 39 | my $seq = 1; 40 | 41 | for my $charset (sort keys %Charset){ 42 | my ($src, $uni, $dst, $txt); 43 | 44 | my $transcoder = find_encoding($Charset{$charset}[0]) or die; 45 | 46 | my $src_enc = File::Spec->catfile($dir,"$charset.enc"); 47 | my $src_utf = File::Spec->catfile($dir,"$charset.utf"); 48 | my $dst_enc = File::Spec->catfile($dir,"$$.enc"); 49 | my $dst_utf = File::Spec->catfile($dir,"$$.utf8"); 50 | 51 | open $src, "<$src_enc" or die "$src_enc : $!"; 52 | 53 | if (PerlIO::Layer->find('perlio')){ 54 | binmode($src, ":bytes"); # needed when :utf8 in default open layer 55 | } 56 | 57 | $txt = join('',<$src>); 58 | close($src); 59 | 60 | eval { $uni = $transcoder->decode($txt, 1) } or print $@; 61 | ok(defined($uni), "decode $charset"); $seq++; 62 | is(length($txt),0, "decode $charset completely"); $seq++; 63 | 64 | open $dst, ">$dst_utf" or die "$dst_utf : $!"; 65 | if (PerlIO::Layer->find('perlio')){ 66 | binmode($dst, ":utf8"); 67 | print $dst $uni; 68 | }else{ # ugh! 69 | binmode($dst); 70 | my $raw = $uni; Encode::_utf8_off($raw); 71 | print $dst $raw; 72 | } 73 | 74 | close($dst); 75 | is(compare_text($dst_utf, $src_utf), 0, "$dst_utf eq $src_utf") 76 | or ($DEBUG and rename $dst_utf, "$dst_utf.$seq"); 77 | $seq++; 78 | 79 | open $src, "<$src_utf" or die "$src_utf : $!"; 80 | if (PerlIO::Layer->find('perlio')){ 81 | binmode($src, ":utf8"); 82 | $uni = join('', <$src>); 83 | }else{ # ugh! 84 | binmode($src); 85 | $uni = join('', <$src>); 86 | Encode::_utf8_on($uni); 87 | } 88 | close $src; 89 | 90 | my $unisave = $uni; 91 | eval { $txt = $transcoder->encode($uni,1) } or print $@; 92 | ok(defined($txt), "encode $charset"); $seq++; 93 | is(length($uni), 0, "encode $charset completely"); $seq++; 94 | $uni = $unisave; 95 | 96 | open $dst,">$dst_enc" or die "$dst_utf : $!"; 97 | binmode($dst); 98 | print $dst $txt; 99 | close($dst); 100 | is(compare_text($src_enc, $dst_enc), 0 => "$dst_enc eq $src_enc") 101 | or ($DEBUG and rename $dst_enc, "$dst_enc.$seq"); 102 | $seq++; 103 | 104 | unlink($dst_utf, $dst_enc); 105 | 106 | for my $encoding (@{$Charset{$charset}}){ 107 | my $rt = decode($encoding, encode($encoding, $uni)); 108 | is ($rt, $uni, "RT $encoding"); 109 | } 110 | } 111 | -------------------------------------------------------------------------------- /t/Encoder.t: -------------------------------------------------------------------------------- 1 | # 2 | # $Id: Encoder.t,v 2.2 2023/11/10 01:10:50 dankogai Exp $ 3 | # 4 | 5 | BEGIN { 6 | require Config; Config->import(); 7 | if ($Config{'extensions'} !~ /\bEncode\b/) { 8 | print "1..0 # Skip: Encode was not built\n"; 9 | exit 0; 10 | } 11 | $| = 1; 12 | } 13 | 14 | use strict; 15 | #use Test::More 'no_plan'; 16 | use Test::More tests => 516; 17 | use Encode::Encoder qw(encoder); 18 | use MIME::Base64; 19 | package Encode::Base64; 20 | use parent 'Encode::Encoding'; 21 | __PACKAGE__->Define('base64'); 22 | use MIME::Base64; 23 | sub encode{ 24 | my ($obj, $data) = @_; 25 | return encode_base64($data); 26 | } 27 | sub decode{ 28 | my ($obj, $data) = @_; 29 | return decode_base64($data); 30 | } 31 | 32 | package main; 33 | 34 | my $e = encoder("foo", "ascii"); 35 | ok ($e->data("bar")); 36 | is ($e->data, "bar"); 37 | ok ($e->encoding("latin1")); 38 | is ($e->encoding, "iso-8859-1"); 39 | 40 | my $data = ''; 41 | for my $i (0..255){ 42 | no warnings; 43 | $data .= chr($i); 44 | my $base64 = encode_base64($data); 45 | is(encoder($data)->base64, $base64, "encode"); 46 | is(encoder($base64)->bytes('base64'), $data, "decode"); 47 | } 48 | 49 | 1; 50 | __END__ 51 | -------------------------------------------------------------------------------- /t/Mod_EUCJP.pm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dankogai/p5-encode/51e8cc56415253dfe27d69204b925b4df74b8a59/t/Mod_EUCJP.pm -------------------------------------------------------------------------------- /t/Unicode.t: -------------------------------------------------------------------------------- 1 | # 2 | # $Id: Unicode.t,v 2.5 2023/11/10 01:10:50 dankogai Exp $ 3 | # 4 | # This script is written entirely in ASCII, even though quoted literals 5 | # do include non-BMP unicode characters -- Are you happy, jhi? 6 | # 7 | 8 | BEGIN { 9 | require Config; Config->import(); 10 | if ($Config{'extensions'} !~ /\bEncode\b/) { 11 | print "1..0 # Skip: Encode was not built\n"; 12 | exit 0; 13 | } 14 | if (ord("A") == 193) { 15 | print "1..0 # Skip: EBCDIC\n"; 16 | exit 0; 17 | } 18 | $| = 1; 19 | } 20 | 21 | use strict; 22 | #use Test::More 'no_plan'; 23 | use Test::More tests => 56; 24 | use Encode qw(encode decode find_encoding); 25 | 26 | # 27 | # see 28 | # http://www.unicode.org/reports/tr19/ 29 | # 30 | 31 | my $dankogai = "\x{5c0f}\x{98fc}\x{3000}\x{5f3e}"; 32 | my $nasty = "$dankogai\x{1abcd}"; 33 | my $fallback = "$dankogai\x{fffd}\x{fffd}"; 34 | 35 | #hi: (0x1abcd - 0x10000) / 0x400 + 0xD800 = 0xd82a 36 | #lo: (0x1abcd - 0x10000) % 0x400 + 0xDC00 = 0xdfcd 37 | 38 | my $n_16be = 39 | pack("C*", map {hex($_)} qw<5c 0f 98 fc 30 00 5f 3e d8 2a df cd>); 40 | my $n_16le = 41 | pack("C*", map {hex($_)} qw<0f 5c fc 98 00 30 3e 5f 2a d8 cd df>); 42 | my $f_16be = 43 | pack("C*", map {hex($_)} qw<5c 0f 98 fc 30 00 5f 3e ff fd>); 44 | my $f_16le = 45 | pack("C*", map {hex($_)} qw<0f 5c fc 98 00 30 3e 5f fd ff>); 46 | my $n_32be = 47 | pack("C*", map {hex($_)} 48 | qw<00 00 5c 0f 00 00 98 fc 00 00 30 00 00 00 5f 3e 00 01 ab cd>); 49 | my $n_32le = 50 | pack("C*", map {hex($_)} 51 | qw<0f 5c 00 00 fc 98 00 00 00 30 00 00 3e 5f 00 00 cd ab 01 00>); 52 | 53 | my $n_16bb = pack('n', 0xFeFF) . $n_16be; 54 | my $n_16lb = pack('v', 0xFeFF) . $n_16le; 55 | my $n_32bb = pack('N', 0xFeFF) . $n_32be; 56 | my $n_32lb = pack('V', 0xFeFF) . $n_32le; 57 | 58 | is($n_16be, encode('UTF-16BE', $nasty), qq{encode UTF-16BE}); 59 | is($n_16le, encode('UTF-16LE', $nasty), qq{encode UTF-16LE}); 60 | is($n_32be, encode('UTF-32BE', $nasty), qq{encode UTF-32BE}); 61 | is($n_32le, encode('UTF-32LE', $nasty), qq{encode UTF-16LE}); 62 | 63 | is($nasty, decode('UTF-16BE', $n_16be), qq{decode UTF-16BE}); 64 | is($nasty, decode('UTF-16LE', $n_16le), qq{decode UTF-16LE}); 65 | is($nasty, decode('UTF-32BE', $n_32be), qq{decode UTF-32BE}); 66 | is($nasty, decode('UTF-32LE', $n_32le), qq{decode UTF-32LE}); 67 | 68 | is($n_16bb, encode('UTF-16', $nasty), qq{encode UTF-16}); 69 | is($n_32bb, encode('UTF-32', $nasty), qq{encode UTF-32}); 70 | is($nasty, decode('UTF-16', $n_16bb), qq{decode UTF-16, bom=be}); 71 | is($nasty, decode('UTF-16', $n_16lb), qq{decode UTF-16, bom=le}); 72 | is($nasty, decode('UTF-32', $n_32bb), qq{decode UTF-32, bom=be}); 73 | is($nasty, decode('UTF-32', $n_32lb), qq{decode UTF-32, bom=le}); 74 | 75 | is(decode('UCS-2BE', $n_16be), $fallback, "decode UCS-2BE: fallback"); 76 | is(decode('UCS-2LE', $n_16le), $fallback, "decode UCS-2LE: fallback"); 77 | eval { decode('UCS-2BE', $n_16be, 1) }; 78 | is (index($@,'UCS-2BE:'), 0, "decode UCS-2BE: exception"); 79 | eval { decode('UCS-2LE', $n_16le, 1) }; 80 | is (index($@,'UCS-2LE:'), 0, "decode UCS-2LE: exception"); 81 | is(encode('UCS-2BE', $nasty), $f_16be, "encode UCS-2BE: fallback"); 82 | is(encode('UCS-2LE', $nasty), $f_16le, "encode UCS-2LE: fallback"); 83 | eval { encode('UCS-2BE', $nasty, 1) }; 84 | is(index($@, 'UCS-2BE'), 0, "encode UCS-2BE: exception"); 85 | eval { encode('UCS-2LE', $nasty, 1) }; 86 | is(index($@, 'UCS-2LE'), 0, "encode UCS-2LE: exception"); 87 | 88 | { 89 | my %tests = ( 90 | 'UCS-2BE' => 'n*', 91 | 'UCS-2LE' => 'v*', 92 | 'UTF-16BE' => 'n*', 93 | 'UTF-16LE' => 'v*', 94 | 'UTF-32BE' => 'N*', 95 | 'UTF-32LE' => 'V*', 96 | ); 97 | 98 | while (my ($enc, $pack) = each(%tests)) { 99 | is(decode($enc, pack($pack, 0xD800, 0x263A)), "\x{FFFD}\x{263A}", 100 | "decode $enc (HI surrogate followed by WHITE SMILING FACE)"); 101 | is(decode($enc, pack($pack, 0xDC00, 0x263A)), "\x{FFFD}\x{263A}", 102 | "decode $enc (LO surrogate followed by WHITE SMILING FACE)"); 103 | } 104 | } 105 | 106 | { 107 | my %tests = ( 108 | 'UTF-16BE' => 'n*', 109 | 'UTF-16LE' => 'v*', 110 | ); 111 | 112 | while (my ($enc, $pack) = each(%tests)) { 113 | is(decode($enc, pack($pack, 0xD800)), "\x{FFFD}", 114 | "decode $enc (HI surrogate)"); 115 | is(decode($enc, pack($pack, 0x263A, 0xD800)), "\x{263A}\x{FFFD}", 116 | "decode $enc (WHITE SMILING FACE followed by HI surrogate)"); 117 | } 118 | } 119 | 120 | { 121 | my %tests = ( 122 | 'UTF-16BE' => 'n*', 123 | 'UTF-16LE' => 'v*', 124 | ); 125 | 126 | while (my ($enc, $pack) = each(%tests)) { 127 | is(encode($enc, "\x{110000}"), pack($pack, 0xFFFD), 128 | "ordinals greater than U+10FFFF is replaced with U+FFFD"); 129 | } 130 | } 131 | 132 | # 133 | # SvGROW test for (en|de)code_xs 134 | # 135 | SKIP: { 136 | my $utf8 = ''; 137 | for my $j (0,0x10){ 138 | for my $i (0..0xffff){ 139 | $j == 0 and (0xD800 <= $i && $i <= 0xDFFF) and next; 140 | $utf8 .= ord($j+$i); 141 | } 142 | for my $major ('UTF-16', 'UTF-32'){ 143 | for my $minor ('BE', 'LE'){ 144 | my $enc = $major.$minor; 145 | is(decode($enc, encode($enc, $utf8)), $utf8, "$enc RT"); 146 | } 147 | } 148 | } 149 | }; 150 | 151 | # 152 | # CJKT vs. UTF-7 153 | # 154 | 155 | use File::Spec; 156 | use File::Basename; 157 | 158 | my $dir = dirname(__FILE__); 159 | opendir my $dh, $dir or die "$dir:$!"; 160 | my @file = sort grep {/\.utf$/o} readdir $dh; 161 | closedir $dh; 162 | for my $file (@file){ 163 | my $path = File::Spec->catfile($dir, $file); 164 | open my $fh, '<', $path or die "$path:$!"; 165 | my $content; 166 | if (PerlIO::Layer->find('perlio')){ 167 | binmode $fh => ':utf8'; 168 | $content = join('' => <$fh>); 169 | }else{ # ugh! 170 | binmode $fh; 171 | $content = join('' => <$fh>); 172 | Encode::_utf8_on($content) 173 | } 174 | close $fh; 175 | is(decode("UTF-7", encode("UTF-7", $content)), $content, 176 | "UTF-7 RT:$file"); 177 | } 178 | 179 | # Magic 180 | { 181 | # see http://rt.perl.org/rt3//Ticket/Display.html?id=60472 182 | my $work = chr(0x100); 183 | my $encoding = find_encoding("UTF16-BE"); 184 | my $tied; 185 | tie $tied, SomeScalar => \$work; 186 | my $result = $encoding->encode($tied, 1); 187 | is($work, "", "check set magic was applied"); 188 | } 189 | 190 | package SomeScalar; 191 | use Tie::Scalar; 192 | use vars qw(@ISA); 193 | BEGIN { @ISA = 'Tie::Scalar' } 194 | 195 | sub TIESCALAR { 196 | my ($class, $ref) = @_; 197 | return bless $ref, $class; 198 | } 199 | 200 | sub FETCH { 201 | ${$_[0]} 202 | } 203 | 204 | sub STORE { 205 | ${$_[0]} = $_[1]; 206 | } 207 | 208 | 1; 209 | __END__ 210 | -------------------------------------------------------------------------------- /t/Unicode_trailing_nul.t: -------------------------------------------------------------------------------- 1 | BEGIN { 2 | if ( $] < 5.009 ) { 3 | print "1..0 # Skip: Perl <= 5.9 or later required\n"; 4 | exit 0; 5 | } 6 | } 7 | use strict; 8 | use Test::More; 9 | 10 | use Encode; 11 | use File::Temp; 12 | use File::Spec; 13 | 14 | # This test relies on https://github.com/Perl/perl5/issues/10623; 15 | # if that bug is ever fixed then this test may never fail again. 16 | 17 | my $foo = Encode::decode("UTF-16LE", "/\0v\0a\0r\0/\0f\0f\0f\0f\0f\0f\0/\0u\0s\0e\0r\0s\0/\0s\0u\0p\0e\0r\0m\0a\0n\0"); 18 | 19 | my ($fh, $path) = File::Temp::tempfile( UNLINK => 1 ); 20 | 21 | note "temp file: $path"; 22 | 23 | # Perl gives the internal PV to exec .. which is buggy/wrong but 24 | # useful here: 25 | system( $^X, '-e', "open my \$fh, '>>', '$path' or die \$!; print {\$fh} \$ARGV[0]", $foo ); 26 | die if $?; 27 | 28 | my $output = do { local $/; <$fh> }; 29 | 30 | is( $output, "/var/ffffff/users/superman", 'UTF-16 decodes with trailing NUL' ); 31 | 32 | done_testing(); 33 | -------------------------------------------------------------------------------- /t/at-cn.t: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dankogai/p5-encode/51e8cc56415253dfe27d69204b925b4df74b8a59/t/at-cn.t -------------------------------------------------------------------------------- /t/at-tw.t: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dankogai/p5-encode/51e8cc56415253dfe27d69204b925b4df74b8a59/t/at-tw.t -------------------------------------------------------------------------------- /t/big5-eten.enc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dankogai/p5-encode/51e8cc56415253dfe27d69204b925b4df74b8a59/t/big5-eten.enc -------------------------------------------------------------------------------- /t/big5-hkscs.enc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dankogai/p5-encode/51e8cc56415253dfe27d69204b925b4df74b8a59/t/big5-hkscs.enc -------------------------------------------------------------------------------- /t/cow.t: -------------------------------------------------------------------------------- 1 | # 2 | # $Id: cow.t,v 1.2 2016/08/04 03:15:58 dankogai Exp $ 3 | # 4 | use strict; 5 | use Encode (); 6 | use Test::More tests => 4; 7 | 8 | 9 | my %a = ( "L\x{c3}\x{a9}on" => "acme" ); 10 | my ($k) = ( keys %a ); 11 | Encode::_utf8_on($k); 12 | my %h = ( $k => "acme" ); 13 | is $h{"L\x{e9}on"} => 'acme'; 14 | ($k) = ( keys %h ); 15 | Encode::_utf8_off($k); 16 | %a = ( $k => "acme" ); 17 | is $h{"L\x{e9}on"} => 'acme'; 18 | # use Devel::Peek; 19 | # Dump(\%h); 20 | 21 | { # invalid input to encode/decode/from_to should not affect COW-shared scalars 22 | my $x = Encode::decode('UTF-8', "\303\244" x 4); 23 | my $orig = "$x"; # non-COW copy 24 | is($x, $orig, "copy of original string matches"); 25 | { my $y = $x; Encode::from_to($y, "UTF-8", "iso-8859-1"); } 26 | is($x, $orig, "original scalar unmodified after from_to() call"); 27 | } 28 | -------------------------------------------------------------------------------- /t/decode.t: -------------------------------------------------------------------------------- 1 | # 2 | # $Id: decode.t,v 1.5 2019/01/31 04:26:40 dankogai Exp $ 3 | # 4 | use strict; 5 | use Encode qw(decode_utf8 FB_CROAK find_encoding decode); 6 | use Test::More tests => 17; 7 | use Test::Builder; 8 | 9 | sub croak_ok(&) { 10 | local $Test::Builder::Level = $Test::Builder::Level + 1; 11 | my $code = shift; 12 | eval { $code->() }; 13 | like $@, qr/does not map/; 14 | } 15 | 16 | my $bytes = "L\x{e9}on"; 17 | my $pad = "\x{30C9}"; 18 | 19 | my $orig = $bytes; 20 | croak_ok { Encode::decode_utf8($orig, FB_CROAK) }; 21 | 22 | my $orig2 = $bytes; 23 | croak_ok { Encode::decode('utf-8', $orig2, FB_CROAK) }; 24 | 25 | chop(my $new = $bytes . $pad); 26 | croak_ok { Encode::decode_utf8($new, FB_CROAK) }; 27 | 28 | my $latin1 = find_encoding('latin1'); 29 | $orig = "\N{U+0080}"; 30 | $orig =~ /(.)/; 31 | is($latin1->decode($1), $orig, '[cpan #115168] passing magic regex globals to decode'); 32 | SKIP: { 33 | skip "Perl Version ($]) is older than v5.16", 1 if $] < 5.016; 34 | *a = $orig; 35 | is($latin1->decode(*a), '*main::'.$orig, '[cpan #115168] passing typeglobs to decode'); 36 | } 37 | 38 | $orig = "\x80"; 39 | $orig =~ /(.)/; 40 | is($latin1->decode($1), "\N{U+0080}", 'passing magic regex to latin1 decode'); 41 | 42 | $orig = "\x80"; 43 | *a = $orig; 44 | is($latin1->decode(*a), "*main::\N{U+0080}", 'passing typeglob to latin1 decode'); 45 | 46 | $orig = "\N{U+0080}"; 47 | $orig =~ /(.)/; 48 | is($latin1->encode($1), "\x80", 'passing magic regex to latin1 encode'); 49 | 50 | $orig = "\xC3\x80"; 51 | $orig =~ /(..)/; 52 | is(Encode::decode_utf8($1), "\N{U+C0}", 'passing magic regex to Encode::decode_utf8'); 53 | 54 | SKIP: { 55 | skip "Perl Version ($]) is older than v5.27.1", 1 if $] < 5.027001; 56 | $orig = "\xC3\x80"; 57 | *a = $orig; 58 | is(Encode::decode_utf8(*a), "*main::\N{U+C0}", 'passing typeglob to Encode::decode_utf8'); 59 | } 60 | 61 | $orig = "\N{U+C0}"; 62 | $orig =~ /(.)/; 63 | is(Encode::encode_utf8($1), "\xC3\x80", 'passing magic regex to Encode::encode_utf8'); 64 | 65 | $orig = "\xC3\x80"; 66 | $orig =~ /(..)/; 67 | is(Encode::decode('utf-8', $1), "\N{U+C0}", 'passing magic regex to UTF-8 decode'); 68 | 69 | $orig = "\xC3\x80"; 70 | *a = $orig; 71 | is(Encode::decode('utf-8', *a), "*main::\N{U+C0}", 'passing typeglob to UTF-8 decode'); 72 | 73 | $orig = "\N{U+C0}"; 74 | $orig =~ /(.)/; 75 | is(Encode::encode('utf-8', $1), "\xC3\x80", 'passing magic regex to UTF-8 encode'); 76 | 77 | SKIP: { 78 | skip "Perl Version ($]) is older than v5.16", 3 if $] < 5.016; 79 | 80 | $orig = "\N{U+0080}"; 81 | *a = $orig; 82 | is($latin1->encode(*a), "*main::\x80", 'passing typeglob to latin1 encode'); 83 | 84 | $orig = "\N{U+C0}"; 85 | *a = $orig; 86 | is(Encode::encode_utf8(*a), "*main::\xC3\x80", 'passing typeglob to Encode::encode_utf8'); 87 | 88 | $orig = "\N{U+C0}"; 89 | *a = $orig; 90 | is(Encode::encode('utf-8', *a), "*main::\xC3\x80", 'passing typeglob to UTF-8 encode'); 91 | } 92 | -------------------------------------------------------------------------------- /t/enc_data.t: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dankogai/p5-encode/51e8cc56415253dfe27d69204b925b4df74b8a59/t/enc_data.t -------------------------------------------------------------------------------- /t/enc_eucjp.t: -------------------------------------------------------------------------------- 1 | # $Id: enc_eucjp.t,v 2.7 2023/11/10 01:10:50 dankogai Exp $ 2 | # This is the twin of enc_utf8.t . 3 | 4 | BEGIN { 5 | require Config; Config->import(); 6 | if ($Config{'extensions'} !~ /\bEncode\b/) { 7 | print "1..0 # Skip: Encode was not built\n"; 8 | exit 0; 9 | } 10 | unless (find PerlIO::Layer 'perlio') { 11 | print "1..0 # Skip: PerlIO was not built\n"; 12 | exit 0; 13 | } 14 | if (ord("A") == 193) { 15 | print "1..0 # encoding pragma does not support EBCDIC platforms\n"; 16 | exit(0); 17 | } 18 | if ($] <= 5.008 and !$Config{perl_patchlevel}){ 19 | print "1..0 # Skip: Perl 5.8.1 or later required\n"; 20 | exit 0; 21 | } 22 | if ($] >= 5.025003 and !$Config{usecperl}){ 23 | print "1..0 # Skip: Perl <=5.25.2 or cperl required\n"; 24 | exit 0; 25 | } 26 | } 27 | 28 | use Encode qw(); 29 | $PerlIO::encoding::fallback |= Encode::ONLY_PRAGMA_WARNINGS; 30 | use warnings "utf8"; 31 | 32 | no warnings "deprecated"; 33 | use encoding 'euc-jp'; 34 | 35 | my @c = (127, 128, 255, 256); 36 | 37 | print "1.." . (scalar @c + 2) . "\n"; 38 | 39 | my @f; 40 | 41 | for my $i (0..$#c) { 42 | no warnings 'pack'; 43 | my $file = filename("f$i"); 44 | push @f, $file; 45 | open(F, ">$file") or die "$0: failed to open '$file' for writing: $!"; 46 | binmode(F, ":utf8"); 47 | print F chr($c[$i]); 48 | print F pack("C" => $c[$i]); 49 | close F; 50 | } 51 | 52 | my $t = 1; 53 | 54 | for my $i (0..$#c) { 55 | my $file = filename("f$i"); 56 | open(F, "<$file") or die "$0: failed to open '$file' for reading: $!"; 57 | binmode(F, ":utf8"); 58 | my $c = ; 59 | my $o = ord($c); 60 | print $o == $c[$i] ? "ok $t - utf8 I/O $c[$i]\n" : "not ok $t - utf8 I/O $c[$i]: $o != $c[$i]\n"; 61 | $t++; 62 | } 63 | 64 | my $f = filename("f" . @f); 65 | 66 | push @f, $f; 67 | open(F, ">$f") or die "$0: failed to open '$f' for writing: $!"; 68 | binmode(F, ":raw"); # Output raw bytes. 69 | print F chr(128); # Output illegal UTF-8. 70 | close F; 71 | open(F, $f) or die "$0: failed to open '$f' for reading: $!"; 72 | binmode(F, ":encoding(UTF-8)"); 73 | { 74 | local $^W = 1; 75 | local $SIG{__WARN__} = sub { $a = shift }; 76 | eval { }; # This should get caught. 77 | } 78 | close F; 79 | print $a =~ qr{^UTF-8 "\\x80" does not map to Unicode} ? 80 | "ok $t - illegal UTF-8 input\n" : "not ok $t - illegal UTF-8 input: a = " . unpack("H*", $a) . "\n"; 81 | $t++; 82 | 83 | open(F, $f) or die "$0: failed to open '$f' for reading: $!"; 84 | binmode(F, ":encoding(utf8)"); 85 | { 86 | local $^W = 1; 87 | local $SIG{__WARN__} = sub { $a = shift }; 88 | eval { }; # This should get caught. 89 | } 90 | close F; 91 | print $a =~ qr{^utf8 "\\x80" does not map to Unicode} ? 92 | "ok $t - illegal utf8 input\n" : "not ok $t - illegal utf8 input: a = " . unpack("H*", $a) . "\n"; 93 | $t++; 94 | 95 | # On VMS temporary file names like "f0." may be more readable than "f0" since 96 | # "f0" could be a logical name pointing elsewhere. 97 | sub filename { 98 | my $name = shift; 99 | $name .= '.' if $^O eq 'VMS'; 100 | return $name; 101 | } 102 | 103 | END { 104 | 1 while unlink @f; 105 | } 106 | -------------------------------------------------------------------------------- /t/enc_module.enc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dankogai/p5-encode/51e8cc56415253dfe27d69204b925b4df74b8a59/t/enc_module.enc -------------------------------------------------------------------------------- /t/enc_module.t: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dankogai/p5-encode/51e8cc56415253dfe27d69204b925b4df74b8a59/t/enc_module.t -------------------------------------------------------------------------------- /t/enc_utf8.t: -------------------------------------------------------------------------------- 1 | # $Id: enc_utf8.t,v 2.7 2023/11/10 01:10:50 dankogai Exp $ 2 | # This is the twin of enc_eucjp.t . 3 | 4 | BEGIN { 5 | require Config; Config->import(); 6 | if ($Config{'extensions'} !~ /\bEncode\b/) { 7 | print "1..0 # Skip: Encode was not built\n"; 8 | exit 0; 9 | } 10 | unless (find PerlIO::Layer 'perlio') { 11 | print "1..0 # Skip: PerlIO was not built\n"; 12 | exit 0; 13 | } 14 | if (ord("A") == 193) { 15 | print "1..0 # encoding pragma does not support EBCDIC platforms\n"; 16 | exit(0); 17 | } 18 | if ($] >= 5.025003 and !$Config{usecperl}){ 19 | print "1..0 # Skip: Perl <=5.25.2 or cperl required\n"; 20 | exit 0; 21 | } 22 | } 23 | 24 | no warnings "deprecated"; 25 | use encoding 'utf8'; 26 | use warnings; 27 | 28 | my @c = (127, 128, 255, 256); 29 | 30 | print "1.." . (scalar @c + 2) . "\n"; 31 | 32 | my @f; 33 | 34 | for my $i (0..$#c) { 35 | my $file = filename("f$i"); 36 | push @f, $file; 37 | open(F, ">$file") or die "$0: failed to open '$file' for writing: $!"; 38 | binmode(F, ":utf8"); 39 | print F chr($c[$i]); 40 | close F; 41 | } 42 | 43 | my $t = 1; 44 | 45 | for my $i (0..$#c) { 46 | my $file = filename("f$i"); 47 | open(F, "<$file") or die "$0: failed to open '$file' for reading: $!"; 48 | binmode(F, ":utf8"); 49 | my $c = ; 50 | my $o = ord($c); 51 | print $o == $c[$i] ? "ok $t - utf8 I/O $c[$i]\n" : "not ok $t - utf8 I/O $c[$i]: $o != $c[$i]\n"; 52 | $t++; 53 | } 54 | 55 | my $f = filename("f" . @f); 56 | 57 | push @f, $f; 58 | open(F, ">$f") or die "$0: failed to open '$f' for writing: $!"; 59 | binmode(F, ":raw"); # Output raw bytes. 60 | print F chr(128); # Output illegal UTF-8. 61 | close F; 62 | open(F, $f) or die "$0: failed to open '$f' for reading: $!"; 63 | binmode(F, ":encoding(UTF-8)"); 64 | { 65 | local $^W = 1; 66 | local $SIG{__WARN__} = sub { $a = shift }; 67 | eval { }; # This should get caught. 68 | } 69 | close F; 70 | print $a =~ qr{^UTF-8 "\\x80" does not map to Unicode} ? 71 | "ok $t - illegal UTF-8 input\n" : "not ok $t - illegal UTF-8 input: a = " . unpack("H*", $a) . "\n"; 72 | $t++; 73 | 74 | open(F, $f) or die "$0: failed to open '$f' for reading: $!"; 75 | binmode(F, ":encoding(utf8)"); 76 | { 77 | local $^W = 1; 78 | local $SIG{__WARN__} = sub { $a = shift }; 79 | eval { }; # This should get caught. 80 | } 81 | close F; 82 | print $a =~ qr{^utf8 "\\x80" does not map to Unicode} ? 83 | "ok $t - illegal utf8 input\n" : "not ok $t - illegal utf8 input: a = " . unpack("H*", $a) . "\n"; 84 | $t++; 85 | 86 | # On VMS temporary file names like "f0." may be more readable than "f0" since 87 | # "f0" could be a logical name pointing elsewhere. 88 | sub filename { 89 | my $name = shift; 90 | $name .= '.' if $^O eq 'VMS'; 91 | return $name; 92 | } 93 | 94 | END { 95 | 1 while unlink @f; 96 | } 97 | -------------------------------------------------------------------------------- /t/encoding-locale.t: -------------------------------------------------------------------------------- 1 | # 2 | # This test aims to detect (using CPAN Testers) platforms where the locale 3 | # encoding detection doesn't work. 4 | # 5 | 6 | use strict; 7 | use warnings; 8 | 9 | use Test::More tests => 3; 10 | 11 | use encoding (); 12 | use Encode qw; 13 | 14 | my $locale_encoding = encoding::_get_locale_encoding; 15 | 16 | SKIP: { 17 | defined $locale_encoding or skip 'no locale encoding found', 3; 18 | 19 | is(ref $locale_encoding, '', '_get_locale_encoding returns a scalar value'); 20 | 21 | my $enc = find_encoding($locale_encoding); 22 | ok(defined $enc, 'encoding returned is supported') 23 | or diag("Encoding: ", explain($locale_encoding)); 24 | isa_ok($enc, 'Encode::Encoding'); 25 | eval { note($locale_encoding, ' => ', $enc->name); }; 26 | } 27 | -------------------------------------------------------------------------------- /t/encoding.t: -------------------------------------------------------------------------------- 1 | BEGIN { 2 | require Config; Config->import(); 3 | if ($Config{'extensions'} !~ /\bEncode\b/) { 4 | print "1..0 # Skip: Encode was not built\n"; 5 | exit 0; 6 | } 7 | unless (find PerlIO::Layer 'perlio') { 8 | print "1..0 # Skip: PerlIO was not built\n"; 9 | exit 0; 10 | } 11 | if (ord("A") == 193) { 12 | print "1..0 # Skip: encoding pragma does not support EBCDIC platforms\n"; 13 | exit(0); 14 | } 15 | if ($] >= 5.025 and !$Config{usecperl}) { 16 | print "1..0 # Skip: encoding pragma not supported in Perl 5.25 or later\n"; 17 | exit(0); 18 | } 19 | } 20 | 21 | print "1..33\n"; 22 | 23 | 24 | no warnings "deprecated"; 25 | use encoding "latin1"; # ignored (overwritten by the next line) 26 | use encoding "greek"; # iso 8859-7 (no "latin" alias, surprise...) 27 | 28 | # "greek" is "ISO 8859-7", and \xDF in ISO 8859-7 is 29 | # \x{3AF} in Unicode (GREEK SMALL LETTER IOTA WITH TONOS), 30 | # instead of \xDF in Unicode (LATIN SMALL LETTER SHARP S) 31 | 32 | $a = "\xDF"; 33 | $b = "\x{100}"; 34 | 35 | print "not " unless ord($a) == 0x3af; 36 | print "ok 1\n"; 37 | 38 | print "not " unless ord($b) == 0x100; 39 | print "ok 2\n"; 40 | 41 | my $c; 42 | 43 | $c = $a . $b; 44 | 45 | print "not " unless ord($c) == 0x3af; 46 | print "ok 3\n"; 47 | 48 | print "not " unless length($c) == 2; 49 | print "ok 4\n"; 50 | 51 | print "not " unless ord(substr($c, 1, 1)) == 0x100; 52 | print "ok 5\n"; 53 | 54 | print "not " unless ord(chr(0xdf)) == 0x3af; # spooky 55 | print "ok 6\n"; 56 | 57 | print "not " unless ord(pack("C", 0xdf)) == 0x3af; 58 | print "ok 7\n"; 59 | 60 | # we didn't break pack/unpack, I hope 61 | 62 | print "not " unless unpack("C", pack("C", 0xdf)) == 0xdf; 63 | print "ok 8\n"; 64 | 65 | # the first octet of UTF-8 encoded 0x3af 66 | print "not " unless unpack("U0 C", chr(0xdf)) == 0xce; 67 | print "ok 9\n"; 68 | 69 | print "not " unless unpack("U", pack("U", 0xdf)) == 0xdf; 70 | print "ok 10\n"; 71 | 72 | print "not " unless unpack("U", chr(0xdf)) == 0x3af; 73 | print "ok 11\n"; 74 | 75 | # charnames must still work 76 | use charnames ':full'; 77 | print "not " unless ord("\N{LATIN SMALL LETTER SHARP S}") == 0xdf; 78 | print "ok 12\n"; 79 | 80 | # combine 81 | 82 | $c = "\xDF\N{LATIN SMALL LETTER SHARP S}" . chr(0xdf); 83 | 84 | print "not " unless ord($c) == 0x3af; 85 | print "ok 13\n"; 86 | 87 | print "not " unless ord(substr($c, 1, 1)) == 0xdf; 88 | print "ok 14\n"; 89 | 90 | print "not " unless ord(substr($c, 2, 1)) == 0x3af; 91 | print "ok 15\n"; 92 | 93 | # regex literals 94 | 95 | print "not " unless "\xDF" =~ /\x{3AF}/; 96 | print "ok 16\n"; 97 | 98 | print "not " unless "\x{3AF}" =~ /\xDF/; 99 | print "ok 17\n"; 100 | 101 | print "not " unless "\xDF" =~ /\xDF/; 102 | print "ok 18\n"; 103 | 104 | print "not " unless "\x{3AF}" =~ /\x{3AF}/; 105 | print "ok 19\n"; 106 | 107 | # eq, cmp 108 | 109 | my ($byte,$bytes,$U,$Ub,$g1,$g2,$l) = ( 110 | pack("C*", 0xDF ), # byte 111 | pack("C*", 0xDF, 0x20), # ($bytes2 cmp $U) > 0 112 | pack("U*", 0x3AF), # $U eq $byte 113 | pack("U*", 0xDF ), # $Ub would eq $bytev w/o use encoding 114 | pack("U*", 0x3B1), # ($g1 cmp $byte) > 0; === chr(0xe1) 115 | pack("U*", 0x3AF, 0x20), # ($g2 cmp $byte) > 0; 116 | pack("U*", 0x3AB), # ($l cmp $byte) < 0; === chr(0xdb) 117 | ); 118 | 119 | # all the tests in this section that compare a byte encoded string 120 | # ato UTF-8 encoded are run in all possible vairants 121 | # all of the eq, ne, cmp operations tested, 122 | # $v z $u tested as well as $u z $v 123 | 124 | sub alleq($$){ 125 | my ($a,$b) = (shift, shift); 126 | $a eq $b && $b eq $a && 127 | !( $a ne $b ) && !( $b ne $a ) && 128 | ( $a cmp $b ) == 0 && ( $b cmp $a ) == 0; 129 | } 130 | 131 | sub anyeq($$){ 132 | my ($a,$b) = (shift, shift); 133 | $a eq $b || $b eq $a || 134 | !( $a ne $b ) || !( $b ne $a ) || 135 | ( $a cmp $b ) == 0 || ( $b cmp $a ) == 0; 136 | } 137 | 138 | sub allgt($$){ 139 | my ($a,$b) = (shift, shift); 140 | ( $a cmp $b ) == 1 && ( $b cmp $a ) == -1; 141 | } 142 | #match the correct UTF-8 string 143 | print "not " unless alleq($byte, $U); 144 | print "ok 20\n"; 145 | 146 | #do not match a wrong UTF-8 string 147 | print "not " if anyeq($byte, $Ub); 148 | print "ok 21\n"; 149 | 150 | #string ordering 151 | print "not " unless allgt ( $g1, $byte ) && 152 | allgt ( $g2, $byte ) && 153 | allgt ( $byte, $l ) && 154 | allgt ( $bytes, $U ); 155 | print "ok 22\n"; 156 | 157 | # upgrade, downgrade 158 | 159 | my ($u,$v,$v2); 160 | $u = $v = $v2 = pack("C*", 0xDF); 161 | utf8::upgrade($v); #explicit upgrade 162 | $v2 = substr( $v2."\x{410}", 0, -1); #implicit upgrade 163 | 164 | # implicit upgrade === explicit upgrade 165 | print "not " if do{{use bytes; $v ne $v2}} || $v ne $v2; 166 | print "ok 23\n"; 167 | 168 | # utf8::upgrade is transparent and does not break equality 169 | print "not " unless alleq( $u, $v ); 170 | print "ok 24\n"; 171 | 172 | $u = $v = pack("C*", 0xDF); 173 | utf8::upgrade($v); 174 | #test for a roundtrip, we should get back from where we left 175 | eval {utf8::downgrade( $v )}; 176 | print "not " if $@ !~ /^Wide / || do{{use bytes; $u eq $v}} || $u ne $v; 177 | print "ok 25\n"; 178 | 179 | # some more eq, cmp 180 | 181 | $byte=pack("C*", 0xDF); 182 | 183 | print "not " unless pack("U*", 0x3AF) eq $byte; 184 | print "ok 26\n"; 185 | 186 | print "not " if chr(0xDF) cmp $byte; 187 | print "ok 27\n"; 188 | 189 | print "not " unless ((pack("U*", 0x3B0) cmp $byte) == 1) && 190 | ((pack("U*", 0x3AE) cmp $byte) == -1) && 191 | ((pack("U*", 0x3AF, 0x20) cmp $byte) == 1) && 192 | ((pack("U*", 0x3AF) cmp pack("C*",0xDF,0x20))==-1); 193 | print "ok 28\n"; 194 | 195 | 196 | { 197 | # Used to core dump in 5.7.3 198 | no warnings; # so test goes noiselessly 199 | print ord(undef) == 0 ? "ok 29\n" : "not ok 29\n"; 200 | } 201 | 202 | { 203 | my %h1; 204 | my %h2; 205 | $h1{"\xdf"} = 41; 206 | $h2{"\x{3af}"} = 42; 207 | print $h1{"\x{3af}"} == 41 ? "ok 30\n" : "not ok 30\n"; 208 | print $h2{"\xdf"} == 42 ? "ok 31\n" : "not ok 31\n"; 209 | } 210 | 211 | # Order of finding the above-Latin1 code point should not matter: both should 212 | # assume Latin1/Unicode encoding 213 | { 214 | use bytes; 215 | print "not " if "\xDF\x{100}" =~ /\x{3af}\x{100}/; 216 | print "ok 32\n"; 217 | print "not " if "\x{100}\xDF" =~ /\x{100}\x{3af}/; 218 | print "ok 33\n"; 219 | } 220 | -------------------------------------------------------------------------------- /t/from_to.t: -------------------------------------------------------------------------------- 1 | # $Id: from_to.t,v 1.1 2006/01/15 15:06:36 dankogai Exp $ 2 | use strict; 3 | use Test::More tests => 3; 4 | use Encode qw(encode from_to); 5 | 6 | my $foo = encode("utf-8", "\x{5abe}"); 7 | from_to($foo, "utf-8" => "latin1", Encode::FB_HTMLCREF); 8 | ok !Encode::is_utf8($foo); 9 | is $foo, '媾'; 10 | 11 | my $bar = encode("latin-1", "\x{5abe}", Encode::FB_HTMLCREF); 12 | is $bar, '媾'; 13 | -------------------------------------------------------------------------------- /t/gb2312.enc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dankogai/p5-encode/51e8cc56415253dfe27d69204b925b4df74b8a59/t/gb2312.enc -------------------------------------------------------------------------------- /t/grow.t: -------------------------------------------------------------------------------- 1 | #!../perl 2 | our $POWER; 3 | BEGIN { 4 | if ($ENV{'PERL_CORE'}){ 5 | chdir 't'; 6 | unshift @INC, '../lib'; 7 | } 8 | require Config; Config->import(); 9 | if ($Config{'extensions'} !~ /\bEncode\b/) { 10 | print "1..0 # Skip: Encode was not built\n"; 11 | exit 0; 12 | } 13 | $POWER = 12; # up to 1 MB. You may adjust the figure here 14 | } 15 | 16 | use strict; 17 | use Encode; 18 | 19 | my $seed = ""; 20 | for my $i (0x00..0xff){ 21 | my $c = chr($i); 22 | $seed .= ($c =~ /^\p{IsPrint}/o) ? $c : " "; 23 | } 24 | 25 | use Test::More tests => $POWER*2; 26 | my $octs = $seed; 27 | use bytes (); 28 | for my $i (1..$POWER){ 29 | $octs .= $octs; 30 | my $len = bytes::length($octs); 31 | my $utf8 = Encode::decode('latin1', $octs); 32 | ok(1, "decode $len bytes"); 33 | is($octs, 34 | Encode::encode('latin1', $utf8), 35 | "encode $len bytes"); 36 | } 37 | __END__ 38 | 39 | 40 | -------------------------------------------------------------------------------- /t/gsm0338.t: -------------------------------------------------------------------------------- 1 | BEGIN { 2 | if ($ENV{'PERL_CORE'}){ 3 | chdir 't'; 4 | unshift @INC, '../lib'; 5 | } 6 | require Config; Config->import(); 7 | if ($Config{'extensions'} !~ /\bEncode\b/) { 8 | print "1..0 # Skip: Encode was not built\n"; 9 | exit 0; 10 | } 11 | $| = 1; 12 | } 13 | 14 | use strict; 15 | use utf8; 16 | use Test::More tests => 777; 17 | use Encode; 18 | use Encode::GSM0338; 19 | use PerlIO::encoding; 20 | 21 | # perl < 5.8.8 didn't enable STOP_AT_PARTIAL by default 22 | $PerlIO::encoding::fallback |= Encode::STOP_AT_PARTIAL; 23 | 24 | my $chk = Encode::LEAVE_SRC(); 25 | 26 | # escapes 27 | # see https://www.3gpp.org/dynareport/23038.htm 28 | # see https://www.etsi.org/deliver/etsi_ts/123000_123099/123038/15.00.00_60/ts_123038v150000p.pdf (page 22) 29 | my %esc_seq = ( 30 | "\x{20ac}" => "\x1b\x65", 31 | "\x0c" => "\x1b\x0A", 32 | "[" => "\x1b\x3C", 33 | "\\" => "\x1b\x2F", 34 | "]" => "\x1b\x3E", 35 | "^" => "\x1b\x14", 36 | "{" => "\x1b\x28", 37 | "|" => "\x1b\x40", 38 | "}" => "\x1b\x29", 39 | "~" => "\x1b\x3D", 40 | ); 41 | 42 | my %unesc_seq = reverse %esc_seq; 43 | 44 | 45 | sub eu{ 46 | $_[0] =~ /[\x00-\x1f]/ ? 47 | sprintf("\\x{%04X}", ord($_[0])) : encode_utf8($_[0]); 48 | 49 | } 50 | 51 | for my $c ( map { chr } 0 .. 127 ) { 52 | next if $c eq "\x1B"; # escape character, start of multibyte sequence 53 | my $u = $Encode::GSM0338::GSM2UNI{$c}; 54 | 55 | # default character set 56 | is decode( "gsm0338", $c, $chk ), $u, 57 | sprintf( "decode \\x%02X", ord($c) ); 58 | eval { decode( "gsm0338", $c . "\xff", $chk | Encode::FB_CROAK ) }; 59 | ok( $@, $@ ); 60 | is encode( "gsm0338", $u, $chk ), $c, sprintf( "encode %s", eu($u) ); 61 | eval { encode( "gsm0338", $u . "\x{3000}", $chk | Encode::FB_CROAK ) }; 62 | ok( $@, $@ ); 63 | 64 | is decode( "gsm0338", "\x00" . $c ), '@' . decode( "gsm0338", $c ), 65 | sprintf( '@: decode \x00+\x%02X', ord($c) ); 66 | 67 | # escape seq. 68 | my $ecs = "\x1b" . $c; 69 | if ( $unesc_seq{$ecs} ) { 70 | is decode( "gsm0338", $ecs, $chk ), $unesc_seq{$ecs}, 71 | sprintf( "ESC: decode ESC+\\x%02X", ord($c) ); 72 | is encode( "gsm0338", $unesc_seq{$ecs}, $chk ), $ecs, 73 | sprintf( "ESC: encode %s ", eu( $unesc_seq{$ecs} ) ); 74 | } 75 | else { 76 | is decode( "gsm0338", $ecs, $chk ), 77 | "\x{FFFD}", 78 | sprintf( "decode ESC+\\x%02X", ord($c) ); 79 | } 80 | } 81 | 82 | # https://rt.cpan.org/Ticket/Display.html?id=75670 83 | is decode("gsm0338", "\x09") => chr(0xC7), 'RT75670: decode'; 84 | is encode("gsm0338", chr(0xC7)) => "\x09", 'RT75670: encode'; 85 | 86 | # https://rt.cpan.org/Public/Bug/Display.html?id=124571 87 | is decode("gsm0338", encode('gsm0338', '..@@..')), '..@@..'; 88 | is decode("gsm0338", encode('gsm0338', '..@€..')), '..@€..'; 89 | 90 | # special GSM sequence, € is at 1024 byte buffer boundary 91 | my $gsm = "\x41" . "\x1B\x65" x 1024; 92 | open my $fh, '<:encoding(gsm0338)', \$gsm or die; 93 | my $uni = <$fh>; 94 | close $fh; 95 | is $uni, "A" . "€" x 1024, 'PerlIO encoding(gsm0338) read works'; 96 | -------------------------------------------------------------------------------- /t/guess.t: -------------------------------------------------------------------------------- 1 | BEGIN { 2 | require Config; Config->import(); 3 | if ($Config{'extensions'} !~ /\bEncode\b/) { 4 | print "1..0 # Skip: Encode was not built\n"; 5 | exit 0; 6 | } 7 | if (ord("A") == 193) { 8 | print "1..0 # Skip: EBCDIC\n"; 9 | exit 0; 10 | } 11 | $| = 1; 12 | } 13 | 14 | use strict; 15 | use File::Basename; 16 | use File::Spec; 17 | use Encode qw(decode encode find_encoding _utf8_off); 18 | 19 | #use Test::More qw(no_plan); 20 | use Test::More tests => 32; 21 | BEGIN { use_ok("Encode::Guess") } 22 | 23 | my $ascii = join('' => map {chr($_)}(0x21..0x7e)); 24 | my $latin1 = join('' => map {chr($_)}(0xa1..0xfe)); 25 | my $utf8on = join('' => map {chr($_)}(0x3000..0x30fe)); 26 | my $utf8off = $utf8on; _utf8_off($utf8off); 27 | my $utf16 = encode('UTF-16', $utf8on); 28 | my $utf32 = encode('UTF-32', $utf8on); 29 | 30 | like(guess_encoding(''), qr/empty string/io, 'empty string'); 31 | is(guess_encoding($ascii)->name, 'ascii', 'ascii'); 32 | like(guess_encoding($latin1), qr/No appropriate encoding/io, 'no ascii'); 33 | is(guess_encoding($latin1, 'latin1')->name, 'iso-8859-1', 'iso-8859-1'); 34 | is(guess_encoding($utf8on)->name, 'utf8', 'utf8 w/ flag'); 35 | is(guess_encoding($utf8off)->name, 'utf8', 'utf8 w/o flag'); 36 | is(guess_encoding($utf16)->name, 'UTF-16', 'UTF-16'); 37 | is(guess_encoding($utf32)->name, 'UTF-32', 'UTF-32'); 38 | 39 | my $jisx0201 = File::Spec->catfile(dirname(__FILE__), 'jisx0201.utf'); 40 | my $jisx0208 = File::Spec->catfile(dirname(__FILE__), 'jisx0208.utf'); 41 | my $jisx0212 = File::Spec->catfile(dirname(__FILE__), 'jisx0212.utf'); 42 | 43 | open my $fh, $jisx0208 or die "$jisx0208: $!"; 44 | binmode($fh); 45 | $utf8off = join('' => <$fh>); 46 | close $fh; 47 | $utf8on = decode('utf8', $utf8off); 48 | 49 | my @jp = qw(7bit-jis shiftjis euc-jp); 50 | 51 | Encode::Guess->set_suspects(@jp); 52 | 53 | for my $jp (@jp){ 54 | my $test = encode($jp, $utf8on); 55 | is(guess_encoding($test)->name, $jp, "JP:$jp"); 56 | } 57 | 58 | is (decode('Guess', encode('euc-jp', $utf8on)), $utf8on, "decode('Guess')"); 59 | eval{ encode('Guess', $utf8on) }; 60 | like($@, qr/not defined/io, "no encode()"); 61 | 62 | { 63 | my $warning; 64 | local $SIG{__WARN__} = sub { $warning = shift }; 65 | my $euc_jp = my $euc_jp_clone = encode('euc-jp', $utf8on); 66 | Encode::from_to($euc_jp, 'Guess', 'euc-jp'); 67 | is $euc_jp_clone, $euc_jp, "from_to(..., 'Guess')"; 68 | ok !$warning, "no warning"; 69 | diag $warning if $warning; 70 | } 71 | 72 | my %CJKT = 73 | ( 74 | 'euc-cn' => File::Spec->catfile(dirname(__FILE__), 'gb2312.utf'), 75 | 'euc-jp' => File::Spec->catfile(dirname(__FILE__), 'jisx0208.utf'), 76 | 'euc-kr' => File::Spec->catfile(dirname(__FILE__), 'ksc5601.utf'), 77 | 'big5-eten' => File::Spec->catfile(dirname(__FILE__), 'big5-eten.utf'), 78 | ); 79 | 80 | Encode::Guess->set_suspects(keys %CJKT); 81 | 82 | for my $name (keys %CJKT){ 83 | open my $fh, $CJKT{$name} or die "$CJKT{$name}: $!"; 84 | binmode($fh); 85 | $utf8off = join('' => <$fh>); 86 | close $fh; 87 | 88 | my $test = encode($name, decode('utf8', $utf8off)); 89 | is(guess_encoding($test)->name, $name, "CJKT:$name"); 90 | } 91 | 92 | my $ambiguous = "\x{5c0f}\x{98fc}\x{5f3e}"; 93 | my $english = "The quick brown fox jumps over the black lazy dog."; 94 | for my $utf (qw/UTF-16 UTF-32/){ 95 | for my $bl (qw/BE LE/){ 96 | my $test = encode("$utf$bl" => $english); 97 | is(guess_encoding($test)->name, "$utf$bl", "$utf$bl"); 98 | } 99 | } 100 | for my $bl (qw/BE LE/){ 101 | my $test = encode("UTF-16$bl" => $ambiguous); 102 | my $result = guess_encoding($test); 103 | ok(! ref($result), "UTF-16$bl:$result"); 104 | } 105 | 106 | 107 | 108 | Encode::Guess->set_suspects(); 109 | for my $jp (@jp){ 110 | # intentionally set $1 a priori -- see Changes 111 | my $test = "English"; 112 | '$1' =~ m/^(.*)/o; 113 | is(guess_encoding($test, ($jp))->name, 'ascii', 114 | "ascii vs $jp (\$1 messed)"); 115 | $test = encode($jp, $test . "\n\x{65e5}\x{672c}\x{8a9e}"); 116 | is(guess_encoding($test, ($jp))->name, 117 | $jp, "$jp vs ascii (\$1 messed)"); 118 | } 119 | 120 | __END__; 121 | -------------------------------------------------------------------------------- /t/isa.t: -------------------------------------------------------------------------------- 1 | # 2 | # $Id: isa.t,v 1.1 2015/04/02 12:08:24 dankogai Exp $ 3 | # 4 | use strict; 5 | use Encode qw/find_encoding/; 6 | use Test::More; 7 | my @enc = Encode->encodings(":all"); 8 | plan tests => 0+@enc; 9 | isa_ok find_encoding($_), "Encode::Encoding" for @enc; 10 | 11 | -------------------------------------------------------------------------------- /t/jis7-fallback.t: -------------------------------------------------------------------------------- 1 | use strict; 2 | use Test::More 'no_plan'; 3 | use Encode ':fallbacks'; 4 | 5 | my $str = "\x{0647}"; 6 | my @data = grep length, map { chomp; $_ } ; 7 | 8 | while (my($in, $out) = splice(@data, 0, 2)) { 9 | my $copy = $str; 10 | is Encode::encode("iso-2022-jp", $copy, eval $in), $out; 11 | } 12 | 13 | __DATA__ 14 | FB_PERLQQ 15 | \x{0647} 16 | 17 | FB_HTMLCREF 18 | ه 19 | 20 | FB_XMLCREF 21 | ه 22 | -------------------------------------------------------------------------------- /t/jisx0201.enc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dankogai/p5-encode/51e8cc56415253dfe27d69204b925b4df74b8a59/t/jisx0201.enc -------------------------------------------------------------------------------- /t/jisx0201.utf: -------------------------------------------------------------------------------- 1 | 0x00a0: 。「」、・ヲァィゥェォャュョッーアイウエオカキクケコサシスセソ 2 | 0x00c0: タチツテトナニヌネノハヒフヘホマミムメモヤユヨラリルレロワン゙゚ 3 | -------------------------------------------------------------------------------- /t/jisx0208.enc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dankogai/p5-encode/51e8cc56415253dfe27d69204b925b4df74b8a59/t/jisx0208.enc -------------------------------------------------------------------------------- /t/jisx0212.enc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dankogai/p5-encode/51e8cc56415253dfe27d69204b925b4df74b8a59/t/jisx0212.enc -------------------------------------------------------------------------------- /t/jperl.t: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dankogai/p5-encode/51e8cc56415253dfe27d69204b925b4df74b8a59/t/jperl.t -------------------------------------------------------------------------------- /t/ksc5601.enc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dankogai/p5-encode/51e8cc56415253dfe27d69204b925b4df74b8a59/t/ksc5601.enc -------------------------------------------------------------------------------- /t/mime-name.t: -------------------------------------------------------------------------------- 1 | # 2 | # $Id: mime-name.t,v 1.4 2023/11/10 01:10:50 dankogai Exp $ 3 | # This script is written in utf8 4 | # 5 | BEGIN { 6 | if ($ENV{'PERL_CORE'}){ 7 | chdir 't'; 8 | unshift @INC, '../lib'; 9 | } 10 | require Config; Config->import(); 11 | if ($Config{'extensions'} !~ /\bEncode\b/) { 12 | print "1..0 # Skip: Encode was not built\n"; 13 | exit 0; 14 | } 15 | if (ord("A") == 193) { 16 | print "1..0 # Skip: EBCDIC\n"; 17 | exit 0; 18 | } 19 | $| = 1; 20 | } 21 | 22 | use strict; 23 | use warnings; 24 | use Encode; 25 | #use Test::More qw(no_plan); 26 | use Test::More tests => 281; 27 | 28 | BEGIN { 29 | use_ok("Encode::MIME::Name"); 30 | } 31 | 32 | for my $canon ( sort keys %Encode::MIME::Name::MIME_NAME_OF ) { 33 | my $enc = find_encoding($canon); 34 | my $mime_name = $Encode::MIME::Name::MIME_NAME_OF{$canon}; 35 | is $enc->mime_name, $mime_name, 36 | qq(find_encoding($canon)->mime_name eq $mime_name); 37 | is $enc->name, $canon, 38 | qq(find_encoding($canon)->name eq $canon); 39 | } 40 | for my $mime_name ( sort keys %Encode::MIME::Name::ENCODE_NAME_OF ) { 41 | my $enc = find_mime_encoding($mime_name); 42 | my $canon = $Encode::MIME::Name::ENCODE_NAME_OF{$mime_name}; 43 | my $mime_name = $Encode::MIME::Name::MIME_NAME_OF{$canon}; 44 | is $enc->mime_name, $mime_name, 45 | qq(find_mime_encoding($mime_name)->mime_name eq $mime_name); 46 | is $enc->name, $canon, 47 | qq(find_mime_encoding($mime_name)->name eq $canon); 48 | } 49 | 50 | ok find_encoding("utf8"); 51 | ok find_encoding("UTF8"); 52 | ok find_encoding("utf-8-strict"); 53 | ok find_encoding("utf-8"); 54 | ok find_encoding("UTF-8"); 55 | 56 | ok not find_mime_encoding("utf8"); 57 | ok not find_mime_encoding("UTF8"); 58 | ok not find_mime_encoding("utf-8-strict"); 59 | ok find_mime_encoding("utf-8"); 60 | ok find_mime_encoding("UTF-8"); 61 | 62 | __END__; 63 | -------------------------------------------------------------------------------- /t/mime_header_iso2022jp.t: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dankogai/p5-encode/51e8cc56415253dfe27d69204b925b4df74b8a59/t/mime_header_iso2022jp.t -------------------------------------------------------------------------------- /t/perlio.t: -------------------------------------------------------------------------------- 1 | BEGIN { 2 | require Config; Config->import(); 3 | if ($Config{'extensions'} !~ /\bEncode\b/) { 4 | print "1..0 # Skip: Encode was not built\n"; 5 | exit 0; 6 | } 7 | if (ord("A") == 193) { 8 | print "1..0 # Skip: EBCDIC\n"; 9 | exit 0; 10 | } 11 | unless (PerlIO::Layer->find('perlio')){ 12 | print "1..0 # Skip: PerlIO required\n"; 13 | exit 0; 14 | } 15 | $| = 1; 16 | } 17 | 18 | use strict; 19 | use File::Basename; 20 | use File::Spec; 21 | use File::Compare qw(compare_text); 22 | use File::Copy; 23 | use FileHandle; 24 | 25 | #use Test::More qw(no_plan); 26 | use Test::More tests => 38; 27 | 28 | our $DEBUG = 0; 29 | 30 | use Encode (":all"); 31 | { 32 | no warnings; 33 | @ARGV and $DEBUG = shift; 34 | #require Encode::JP::JIS7; 35 | #require Encode::KR::2022_KR; 36 | #$Encode::JP::JIS7::DEBUG = $DEBUG; 37 | } 38 | 39 | my $seq = 0; 40 | my $dir = dirname(__FILE__); 41 | 42 | my %e = 43 | ( 44 | jisx0208 => [ qw/euc-jp shiftjis 7bit-jis iso-2022-jp iso-2022-jp-1/], 45 | ksc5601 => [ qw/euc-kr/], 46 | gb2312 => [ qw/euc-cn hz/], 47 | ); 48 | 49 | $/ = "\x0a"; # may fix VMS problem for test #28 and #29 50 | 51 | for my $src (sort keys %e) { 52 | my $ufile = File::Spec->catfile($dir,"$src.utf"); 53 | open my $fh, "<:utf8", $ufile or die "$ufile : $!"; 54 | my @uline = <$fh>; 55 | my $utext = join('' => @uline); 56 | close $fh; 57 | 58 | for my $e (@{$e{$src}}){ 59 | my $sfile = File::Spec->catfile($dir,"$$.sio"); 60 | my $pfile = File::Spec->catfile($dir,"$$.pio"); 61 | 62 | # first create a file without perlio 63 | dump2file($sfile, &encode($e, $utext, 0)); 64 | 65 | # then create a file via perlio without autoflush 66 | 67 | SKIP:{ 68 | skip "$e: !perlio_ok", 4 unless (perlio_ok($e) or $DEBUG); 69 | no warnings 'uninitialized'; 70 | open $fh, ">:encoding($e)", $pfile or die "$sfile : $!"; 71 | $fh->autoflush(0); 72 | print $fh $utext; 73 | close $fh; 74 | $seq++; 75 | is(compare_text($sfile, $pfile), 0 => ">:encoding($e)"); 76 | if ($DEBUG){ 77 | copy $sfile, "$sfile.$seq"; 78 | copy $pfile, "$pfile.$seq"; 79 | } 80 | 81 | # this time print line by line. 82 | # works even for ISO-2022 but not ISO-2022-KR 83 | open $fh, ">:encoding($e)", $pfile or die "$sfile : $!"; 84 | $fh->autoflush(1); 85 | for my $l (@uline) { 86 | print $fh $l; 87 | } 88 | close $fh; 89 | $seq++; 90 | is(compare_text($sfile, $pfile), 0 => ">:encoding($e) by lines"); 91 | if ($DEBUG){ 92 | copy $sfile, "$sfile.$seq"; 93 | copy $pfile, "$pfile.$seq"; 94 | } 95 | my $dtext; 96 | open $fh, "<:encoding($e)", $pfile or die "$pfile : $!"; 97 | $fh->autoflush(0); 98 | $dtext = join('' => <$fh>); 99 | close $fh; 100 | $seq++; 101 | ok($utext eq $dtext, "<:encoding($e)"); 102 | if ($DEBUG){ 103 | dump2file("$sfile.$seq", $utext); 104 | dump2file("$pfile.$seq", $dtext); 105 | } 106 | if (perlio_ok($e) or $DEBUG){ 107 | $dtext = ''; 108 | open $fh, "<:encoding($e)", $pfile or die "$pfile : $!"; 109 | while(defined(my $l = <$fh>)) { 110 | $dtext .= $l; 111 | } 112 | close $fh; 113 | } 114 | $seq++; 115 | ok($utext eq $dtext, "<:encoding($e) by lines"); 116 | if ($DEBUG){ 117 | dump2file("$sfile.$seq", $utext); 118 | dump2file("$pfile.$seq", $dtext); 119 | } 120 | } 121 | if ( ! $DEBUG ) { 122 | 1 while unlink ($sfile); 123 | 1 while unlink ($pfile); 124 | } 125 | } 126 | } 127 | 128 | # BOM Test 129 | 130 | SKIP:{ 131 | my $pev = PerlIO::encoding->VERSION; 132 | skip "PerlIO::encoding->VERSION = $pev <= 0.07 ", 6 133 | unless ($pev >= 0.07 or $DEBUG); 134 | 135 | my $file = File::Spec->catfile($dir,"jisx0208.utf"); 136 | open my $fh, "<:utf8", $file or die "$file : $!"; 137 | my $str = join('' => <$fh>); 138 | close $fh; 139 | my %bom = ( 140 | 'UTF-16BE' => pack('n', 0xFeFF), 141 | 'UTF-16LE' => pack('v', 0xFeFF), 142 | 'UTF-32BE' => pack('N', 0xFeFF), 143 | 'UTF-32LE' => pack('V', 0xFeFF), 144 | ); 145 | # reading 146 | for my $utf (sort keys %bom){ 147 | my $bomed = $bom{$utf} . encode($utf, $str); 148 | my $sfile = File::Spec->catfile($dir,".${utf}_${seq}_$$"); 149 | dump2file($sfile, $bomed); 150 | my $utf_nobom = $utf; $utf_nobom =~ s/(LE|BE)$//o; 151 | # reading 152 | open $fh, "<:encoding($utf_nobom)", $sfile or die "$sfile : $!"; 153 | my $cmp = join '' => <$fh>; 154 | close $fh; 155 | is($str, $cmp, "<:encoding($utf_nobom) eq $utf"); 156 | unlink $sfile; $seq++; 157 | } 158 | # writing 159 | for my $utf_nobom (qw/UTF-16 UTF-32/){ 160 | my $utf = $utf_nobom . 'BE'; 161 | my $sfile = File::Spec->catfile($dir,".${utf_nobom}_${seq}_$$"); 162 | my $bomed = $bom{$utf} . encode($utf, $str); 163 | open $fh, ">:encoding($utf_nobom)", $sfile or die "$sfile : $!"; 164 | print $fh $str; 165 | close $fh; 166 | open my $fh, "<:bytes", $sfile or die "$sfile : $!"; 167 | read $fh, my $cmp, -s $sfile; 168 | close $fh; 169 | use bytes (); 170 | ok($bomed eq $cmp, ">:encoding($utf_nobom) eq $utf"); 171 | unlink $sfile; $seq++; 172 | } 173 | } 174 | sub dump2file{ 175 | no warnings; 176 | open my $fh, ">", $_[0] or die "$_[0]: $!"; 177 | binmode $fh; 178 | print $fh $_[1]; 179 | close $fh; 180 | } 181 | -------------------------------------------------------------------------------- /t/piconv.t: -------------------------------------------------------------------------------- 1 | # 2 | # $Id: piconv.t,v 0.4 2013/02/18 02:23:56 dankogai Exp $ 3 | # 4 | 5 | BEGIN { 6 | if ( $ENV{'PERL_CORE'} && $] >= 5.011) { 7 | print "1..0 # Skip: Don't know how to test this within perl's core\n"; 8 | exit 0; 9 | } 10 | } 11 | 12 | use strict; 13 | use FindBin; 14 | use File::Spec; 15 | use IPC::Open3 qw(open3); 16 | use IO::Select; 17 | use Test::More; 18 | 19 | my $WIN = $^O eq 'MSWin32'; 20 | 21 | if ($WIN) { 22 | eval { require IPC::Run; IPC::Run->VERSION(0.83); 1; } or 23 | plan skip_all => 'Win32 environments require IPC::Run 0.83 to complete this test'; 24 | } 25 | 26 | sub run_cmd (;$$); 27 | 28 | my $blib = 29 | File::Spec->rel2abs( 30 | File::Spec->catdir( $FindBin::RealBin, File::Spec->updir ) ); 31 | my $script = File::Spec->catdir($blib, 'bin', 'piconv'); 32 | my @base_cmd = ( $^X, "-Mblib=$blib", $script ); 33 | 34 | plan tests => 5; 35 | 36 | { 37 | my ( $st, $out, $err ) = run_cmd; 38 | is( $st, 0, 'status for usage call' ); 39 | is( $out, $WIN ? undef : '' ); 40 | like( $err, qr{^piconv}, 'usage' ); 41 | } 42 | 43 | { 44 | my($st, $out, $err) = run_cmd [qw(-S foobar -f utf-8 -t ascii), $script]; 45 | like($err, qr{unknown scheme.*fallback}i, 'warning for unknown scheme'); 46 | } 47 | 48 | { 49 | my ( $st, $out, $err ) = run_cmd [qw(-f utf-8 -t ascii ./non-existing/file)]; 50 | like( $err, qr{can't open}i ); 51 | } 52 | 53 | sub run_cmd (;$$) { 54 | my ( $args, $in ) = @_; 55 | 56 | my $out = "x" x 10_000; 57 | $out = ""; 58 | my $err = "x" x 10_000; 59 | $err = ""; 60 | 61 | if ($WIN) { 62 | IPC::Run->import(qw(run timeout)); 63 | my @cmd; 64 | if (defined $args) { 65 | @cmd = (@base_cmd, @$args); 66 | } else { 67 | @cmd = @base_cmd; 68 | } 69 | run(\@cmd, \$in, \$out, \$err, timeout(10)); 70 | my $st = $?; 71 | $out = undef if ($out eq ''); 72 | ( $st, $out, $err ); 73 | } else { 74 | $in ||= ''; 75 | my ( $in_fh, $out_fh, $err_fh ); 76 | use Symbol 'gensym'; 77 | $err_fh = 78 | gensym; # sigh... otherwise stderr gets just to $out_fh, not to $err_fh 79 | my $pid = open3( $in_fh, $out_fh, $err_fh, @base_cmd, @$args ) 80 | or die "Can't run @base_cmd @$args: $!"; 81 | print $in_fh $in; 82 | my $sel = IO::Select->new( $out_fh, $err_fh ); 83 | 84 | while ( my @ready = $sel->can_read ) { 85 | for my $fh (@ready) { 86 | if ( eof($fh) ) { 87 | $sel->remove($fh); 88 | last if !$sel->handles; 89 | } 90 | elsif ( $out_fh == $fh ) { 91 | my $line = <$fh>; 92 | $out .= $line; 93 | } 94 | elsif ( $err_fh == $fh ) { 95 | my $line = <$fh>; 96 | $err .= $line; 97 | } 98 | } 99 | } 100 | my $st = $?; 101 | ( $st, $out, $err ); 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /t/rt.pl: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/perl 2 | # 3 | # $Id: rt.pl,v 2.2 2023/11/10 01:10:50 dankogai Exp $ 4 | # 5 | 6 | BEGIN { 7 | my $ucmdir = "ucm"; 8 | if ($ENV{'PERL_CORE'}){ 9 | chdir 't'; 10 | unshift @INC, '../lib'; 11 | $ucmdir = "../ext/Encode/ucm"; 12 | } 13 | require Config; Config->import(); 14 | if ($Config{'extensions'} !~ /\bEncode\b/) { 15 | print "1..0 # Skip: Encode was not built\n"; 16 | exit 0; 17 | } 18 | if (ord("A") == 193) { 19 | print "1..0 # Skip: EBCDIC\n"; 20 | exit 0; 21 | } 22 | use strict; 23 | require Test::More; 24 | our $DEBUG; 25 | our @ucm; 26 | unless(@ARGV){ 27 | use File::Spec; 28 | Test::More->import(tests => 103); 29 | opendir my $dh, $ucmdir or die "$ucmdir:$!"; 30 | @ucm = 31 | map {File::Spec->catfile($ucmdir, $_) } 32 | sort grep {/\.ucm$/o} readdir($dh); 33 | closedir $dh; 34 | }else{ 35 | Test::More->import("no_plan"); 36 | $DEBUG = 1; 37 | @ucm = @ARGV; 38 | } 39 | } 40 | 41 | use strict; 42 | use Encode qw/encode decode/; 43 | our $DEBUG; 44 | our @ucm; 45 | 46 | for my $ucm (@ucm){ 47 | my ($name, $nchar, $nrt, $nok) = rttest($ucm); 48 | $nok += 0; 49 | ok($nok == 0, "$ucm => $name ($nchar, $nrt, $nok)"); 50 | } 51 | 52 | sub rttest{ 53 | my $ucm = shift; 54 | my ($name, $nchar, $nrt, $nok); 55 | open my $rfh, "<$ucm" or die "$ucm:$!"; 56 | # \x00 |0 # 57 | while(<$rfh>){ 58 | s/#.*//o; /^$/ and next; 59 | unless ($name){ 60 | /^\s+"([^\"]+)"/io or next; 61 | $name = $1 and next; 62 | }else{ 63 | /^\s+(\S+)\s+\|(\d)/io or next; 64 | $nchar++; 65 | $3 == 0 or next; 66 | $nrt++; 67 | my $uni = chr(hex($1)); 68 | my $enc = eval qq{ "$2" }; 69 | decode($name, $enc) eq $uni or $nok++; 70 | encode($name, $uni) eq $enc or $nok++; 71 | } 72 | } 73 | return($name, $nchar, $nrt, $nok); 74 | } 75 | __END__ 76 | -------------------------------------------------------------------------------- /t/rt113164.t: -------------------------------------------------------------------------------- 1 | BEGIN { 2 | if ($ENV{'PERL_CORE'}) { 3 | chdir 't'; 4 | unshift @INC, '../lib'; 5 | } 6 | require Config; Config->import(); 7 | if ($Config{'extensions'} !~ /\bEncode\b/) { 8 | print "1..0 # Skip: Encode was not built\n"; 9 | exit 0; 10 | } 11 | if (ord("A") == 193) { 12 | print "1..0 # Skip: EBCDIC\n"; 13 | exit 0; 14 | } 15 | $| = 1; 16 | } 17 | 18 | use strict; 19 | use warnings; 20 | 21 | use Test::More tests => 2; 22 | 23 | use Encode; 24 | 25 | my $str = "You" . chr(8217) . "re doomed!"; 26 | 27 | my $data; 28 | 29 | my $cb = sub { 30 | $data = [ ('?') x 12_500 ]; 31 | return ";"; 32 | }; 33 | 34 | my $octets = encode('iso-8859-1', $str, $cb); 35 | is $octets, "You;re doomed!", "stack was not overwritten"; 36 | 37 | $octets = encode('iso-8859-1', $str, $cb); 38 | is $octets, "You;re doomed!", "stack was not overwritten"; 39 | -------------------------------------------------------------------------------- /t/rt65541.t: -------------------------------------------------------------------------------- 1 | BEGIN { 2 | if ($ENV{'PERL_CORE'}) { 3 | chdir 't'; 4 | unshift @INC, '../lib'; 5 | } 6 | require Config; Config->import(); 7 | if ($Config{'extensions'} !~ /\bEncode\b/) { 8 | print "1..0 # Skip: Encode was not built\n"; 9 | exit 0; 10 | } 11 | if (ord("A") == 193) { 12 | print "1..0 # Skip: EBCDIC\n"; 13 | exit 0; 14 | } 15 | $| = 1; 16 | } 17 | 18 | use strict; 19 | use warnings; 20 | 21 | use Encode; 22 | use PerlIO::encoding; 23 | $PerlIO::encoding::fallback &= ~Encode::WARN_ON_ERR; 24 | 25 | use Test::More tests => 3; 26 | 27 | ok open my $fh, ">:encoding(cp1250)", do{\(my $str)}; 28 | ok print $fh ("a" x 1023) . "\x{0378}"; 29 | ok close $fh; 30 | -------------------------------------------------------------------------------- /t/rt76824.t: -------------------------------------------------------------------------------- 1 | BEGIN { 2 | if ($ENV{'PERL_CORE'}) { 3 | chdir 't'; 4 | unshift @INC, '../lib'; 5 | } 6 | require Config; Config->import(); 7 | if ($Config{'extensions'} !~ /\bEncode\b/) { 8 | print "1..0 # Skip: Encode was not built\n"; 9 | exit 0; 10 | } 11 | if (ord("A") == 193) { 12 | print "1..0 # Skip: EBCDIC\n"; 13 | exit 0; 14 | } 15 | $| = 1; 16 | } 17 | 18 | use strict; 19 | use warnings; 20 | 21 | use Encode; 22 | use PerlIO::encoding; 23 | $PerlIO::encoding::fallback &= ~Encode::WARN_ON_ERR; 24 | 25 | use Test::More tests => 2; 26 | 27 | my $out; 28 | my @arr = ( 29 | "\x{feff}\x{39f}\x{3af} \x{3a3}\x{3c5}\x{3bd}\x{3ad}\x{3bd}\x{3bf}\x{3c7}\x{3bf}\x{3b9}\n", 30 | "\x{39f}\x{3b9} \x{393}\x{3b5}\x{3bd}\x{3bd}\x{3b1}\x{3af}\x{3bf}\x{3b9} \x{3c4}\x{3b7}\x{3c2} \x{3a3}\x{3b1}\x{3bc}\x{3bf}\x{3b8}\x{3c1}\x{3ac}\x{3ba}\x{3b7}\x{3c2}\n", 31 | "\x{39f}\x{3b9} \x{393}\x{3b5}\x{3c1}\x{3bc}\x{3b1}\x{3bd}\x{3bf}\x{3af} \x{3be}\x{3b1}\x{3bd}\x{3ac}\x{3c1}\x{3c7}\x{3bf}\x{3bd}\x{3c4}\x{3b1}\x{3b9}...\n", 32 | "\x{39f}\x{3b9} \x{395}\x{3c1}\x{3b1}\x{3c3}\x{3c4}\x{3ad}\x{3c2} \x{3a4}\x{3bf}\x{3c5} \x{391}\x{3b9}\x{3b3}\x{3b1}\x{3af}\x{3bf}\x{3c5}\n", 33 | "\x{39f}\x{3b9} \x{39a}\x{3c5}\x{3bd}\x{3b7}\x{3b3}\x{3bf}\x{3af}\n", 34 | "\x{39f}\x{3b9} \x{3a0}\x{3b1}\x{3bd}\x{3ba}\x{3c2} \x{3a4}\x{3b1} \x{39a}\x{3ac}\x{3bd}\x{3bf}\x{3c5}\x{3bd} \x{38c}\x{3bb}\x{3b1}\n", 35 | "\x{39f}\x{3b9} \x{3a6}\x{3b1}\x{3bd}\x{3c4}\x{3b1}\x{3c1}\x{3af}\x{3bd}\x{3b5}\x{3c2}\n", 36 | "\x{39f}\x{3b9}\x{3ba}\x{3bf}\x{3b3}\x{3ad}\x{3bd}\x{3b5}\x{3b9}\x{3b1} \x{3a0}\x{3b1}\x{3bd}\x{3c4}\x{3c1}\x{3b5}\x{3c5}\x{3cc}\x{3bc}\x{3b1}\x{3c3}\x{3c4}\x{3b5}\n", 37 | "\x{39f}\x{3bb}\x{3b1} \x{3b5}\x{3af}\x{3bd}\x{3b1}\x{3b9} \x{3b4}\x{3c1}\x{3cc}\x{3bc}\x{3bf}\x{3c2}\n", 38 | "\x{39f}\x{3bc}\x{3b7}\x{3c1}\x{3bf}\x{3c2}\n", 39 | "\x{39f}\x{3be}\x{3c5}\x{3b3}\x{3cc}\x{3bd}\x{3bf}\n", 40 | "\x{39f}\x{3c1}\x{3b1}\x{3c4}\x{3cc}\x{3c4}\x{3b7}\x{3c2} \x{3bc}\x{3b7}\x{3b4}\x{3ad}\x{3bd}\n", 41 | "\x{3c0}\n", 42 | "\x{3c0}\x{3ac}\x{3bd}\x{3c9}, \x{3ba}\x{3ac}\x{3c4}\x{3c9} \x{3ba}\x{3b1}\x{3b9} \x{3c0}\x{3bb}\x{3b1}\x{3b3}\x{3af}\x{3c9}\x{3c2}\n", 43 | "\x{3a4}\x{3bf} \x{39a}\x{3b1}\x{3ba}\x{3cc}\n", 44 | "\x{3a4}\x{3bf} \x{39a}\x{3b1}\x{3ba}\x{3cc} - \x{3a3}\x{3c4}\x{3b7}\x{3bd} \x{395}\x{3c0}\x{3bf}\x{3c7}\x{3ae} \x{3c4}\x{3c9}\x{3bd} \x{397}\x{3c1}\x{3ce}\x{3c9}\x{3bd}\n", 45 | "\x{3a4}\x{3bf} \x{3ba}\x{3bb}\x{3ac}\x{3bc}\x{3b1} \x{3b2}\x{3b3}\x{3ae}\x{3ba}\x{3b5} \x{3b1}\x{3c0}'\x{3c4}\x{3bf}\x{3bd} \x{3c0}\x{3b1}\x{3c1}\x{3ac}\x{3b4}\x{3b5}\x{3b9}\x{3c3}\x{3bf}\n", 46 | "\x{3a4}\x{3bf} \x{3ba}\x{3bf}\x{3c1}\x{3af}\x{3c4}\x{3c3}\x{3b9} \x{3bc}\x{3b5} \x{3c4}\x{3b1} \x{3bc}\x{3b1}\x{3cd}\x{3c1}\x{3b1}\n", 47 | "\x{3a4}\x{3bf} \x{3ba}\x{3bf}\x{3c1}\x{3af}\x{3c4}\x{3c3}\x{3b9} \x{3c4}\x{3bf}\x{3c5} \x{3bb}\x{3bf}\x{3cd}\x{3bd}\x{3b1} \x{3c0}\x{3b1}\x{3c1}\x{3ba}\n", 48 | "\x{3a4}\x{3bf} \x{39e}\x{3cd}\x{3bb}\x{3bf} \x{3b2}\x{3b3}\x{3ae}\x{3ba}\x{3b5} \x{3b1}\x{3c0}\x{3cc} \x{3c4}\x{3bf}\x{3bd} \x{3c0}\x{3b1}\x{3c1}\x{3ac}\x{3b4}\x{3b5}\x{3b9}\x{3c3}\x{3bf}\n", 49 | "\x{3a4}\x{3bf} \x{3c0}\x{3b9}\x{3bf} \x{3bb}\x{3b1}\x{3bc}\x{3c0}\x{3c1}\x{3cc} \x{3b1}\x{3c3}\x{3c4}\x{3ad}\x{3c1}\x{3b9}\n", 50 | "\x{3a4}\x{3bf} \x{3a1}\x{3b5}\x{3bc}\x{3b1}\x{3bb}\x{3b9} \x{3a4}\x{3b7}\x{3c2} \x{391}\x{3b8}\x{3b7}\x{3bd}\x{3b1}\x{3c2}\n", 51 | "\x{3a4}\x{3bf} \x{3a4}\x{3b1}\x{3bd}\x{3b3}\x{3ba}\x{3cc} \x{3c4}\x{3c9}\x{3bd} \x{3a7}\x{3c1}\x{3b9}\x{3c3}\x{3c4}\x{3bf}\x{3c5}\x{3b3}\x{3ad}\x{3bd}\x{3bd}\x{3c9}\x{3bd}\n", 52 | "\x{3a4}\x{3bf} \x{3c4}\x{3b5}\x{3bb}\x{3b5}\x{3c5}\x{3c4}\x{3b1}\x{3af}\x{3bf} \x{3c8}\x{3ad}\x{3bc}\x{3bc}\x{3b1}\n", 53 | "\x{3a4}\x{3bf} \x{3c6}\x{3b9}\x{3bb}\x{3af} \x{3c4}\x{3b7}\x{3c2}... \x{396}\x{3c9}\x{3ae}\x{3c2}\n", 54 | "\x{3a4}\x{3bf} \x{3c7}\x{3ce}\x{3bc}\x{3b1} \x{3b2}\x{3ac}\x{3c6}\x{3c4}\x{3b7}\x{3ba}\x{3b5} \x{3ba}\x{3cc}\x{3ba}\x{3ba}\x{3b9}\x{3bd}\x{3bf}\n", 55 | "\x{3a4}\x{3bf}\x{3c0}\x{3af}\x{3bf} \x{3c3}\x{3c4}\x{3b7}\x{3bd} \x{3bf}\x{3bc}\x{3af}\x{3c7}\x{3bb}\x{3b7}\n", 56 | "\x{3a4}\x{3c1}\x{3b9}\x{3bb}\x{3bf}\x{3b3}\x{3af}\x{3b1} 1: \x{3a4}\x{3bf} \x{39b}\x{3b9}\x{3b2}\x{3ac}\x{3b4}\x{3b9} \x{3c0}\x{3bf}\x{3c5} \x{3b4}\x{3b1}\x{3ba}\x{3c1}\x{3cd}\x{3b6}\x{3b5}\x{3b9}\n" 57 | ); 58 | ok open my $wh, '>:crlf:encoding(ISO-8859-1)', \$out; 59 | print $wh $_ for @arr; 60 | ok close $wh; 61 | -------------------------------------------------------------------------------- /t/rt85489.t: -------------------------------------------------------------------------------- 1 | BEGIN { 2 | if ($ENV{'PERL_CORE'}) { 3 | chdir 't'; 4 | unshift @INC, '../lib'; 5 | } 6 | require Config; Config->import(); 7 | if ($Config{'extensions'} !~ /\bEncode\b/) { 8 | print "1..0 # Skip: Encode was not built\n"; 9 | exit 0; 10 | } 11 | if (ord("A") == 193) { 12 | print "1..0 # Skip: EBCDIC\n"; 13 | exit 0; 14 | } 15 | $| = 1; 16 | } 17 | 18 | use strict; 19 | use warnings; 20 | 21 | use Test::More tests => 8; 22 | 23 | use Encode; 24 | 25 | my $ascii = Encode::find_encoding("ascii"); 26 | my $orig = "str"; 27 | 28 | my $str = $orig; 29 | ok !Encode::is_utf8($str), "UTF8 flag is not set on input string before ascii encode"; 30 | $ascii->encode($str); 31 | ok !Encode::is_utf8($str), "UTF8 flag is not set on input string after ascii encode"; 32 | 33 | $str = $orig; 34 | ok !Encode::is_utf8($str), "UTF8 flag is not set on input string before Encode::encode ascii"; 35 | Encode::encode("ascii", $str); 36 | ok !Encode::is_utf8($str), "UTF8 flag is not set on input string after Encode::encode ascii"; 37 | 38 | $str = $orig; 39 | Encode::_utf8_on($str); 40 | ok Encode::is_utf8($str), "UTF8 flag is set on input string before ascii decode"; 41 | $ascii->decode($str); 42 | ok Encode::is_utf8($str), "UTF8 flag is set on input string after ascii decode"; 43 | 44 | $str = $orig; 45 | Encode::_utf8_on($str); 46 | ok Encode::is_utf8($str), "UTF8 flag is set on input string before Encode::decode ascii"; 47 | Encode::decode("ascii", $str); 48 | ok Encode::is_utf8($str), "UTF8 flag is set on input string after Encode::decode ascii"; 49 | -------------------------------------------------------------------------------- /t/rt86327.t: -------------------------------------------------------------------------------- 1 | BEGIN { 2 | if ($ENV{'PERL_CORE'}) { 3 | chdir 't'; 4 | unshift @INC, '../lib'; 5 | } 6 | require Config; Config->import(); 7 | if ($Config{'extensions'} !~ /\bEncode\b/) { 8 | print "1..0 # Skip: Encode was not built\n"; 9 | exit 0; 10 | } 11 | if (ord("A") == 193) { 12 | print "1..0 # Skip: EBCDIC\n"; 13 | exit 0; 14 | } 15 | $| = 1; 16 | } 17 | 18 | use strict; 19 | use warnings; 20 | 21 | use Encode; 22 | use PerlIO::encoding; 23 | $PerlIO::encoding::fallback &= ~Encode::WARN_ON_ERR; 24 | 25 | use Test::More tests => 3; 26 | 27 | my @t = qw/230 13 90 65 34 239 86 15 8 26 181 25 305 123 22 139 111 6 3 28 | 100 37 1 20 1 166 1 300 19 1 42 153 81 106 114 67 1 32 34/; 29 | my $str; 30 | ok open OUT, '>:encoding(iso-8859-1)', \$str; 31 | my $string = join "\x{fffd}", map { '.'x$_ } @t; 32 | ok print OUT $string; 33 | ok close OUT; 34 | -------------------------------------------------------------------------------- /t/taint.t: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -T 2 | use strict; 3 | use Encode qw(encode decode); 4 | local %Encode::ExtModule = %Encode::Config::ExtModule; 5 | use Scalar::Util qw(tainted); 6 | use Test::More; 7 | use Config; 8 | my $taint = substr($ENV{PATH},0,0); 9 | my $str = "dan\x{5f3e}" . $taint; # tainted string to encode 10 | my $bin = encode('UTF-8', $str); # tainted binary to decode 11 | my $notaint = ""; 12 | my $notaint_str = "dan\x{5f3e}" . $notaint; 13 | my $notaint_bin = encode('UTF-8', $notaint_str); 14 | my @names = Encode->encodings(':all'); 15 | if (exists($Config{taint_support}) && not $Config{taint_support}) { 16 | plan skip_all => "your perl was built without taint support"; 17 | } 18 | else { 19 | plan tests => 4 * @names + 2; 20 | } 21 | for my $name (@names) { 22 | my ($d, $e, $s); 23 | eval { 24 | $e = encode($name, $str); 25 | }; 26 | SKIP: { 27 | skip $@, 1 if $@; 28 | ok tainted($e), "encode $name"; 29 | } 30 | $bin = $e.$taint if $e; 31 | eval { 32 | $d = decode($name, $bin); 33 | }; 34 | SKIP: { 35 | skip $@, 1 if $@; 36 | ok tainted($d), "decode $name"; 37 | } 38 | } 39 | for my $name (@names) { 40 | my ($d, $e, $s); 41 | eval { 42 | $e = encode($name, $notaint_str); 43 | }; 44 | SKIP: { 45 | skip $@, 1 if $@; 46 | ok ! tainted($e), "encode $name"; 47 | } 48 | $notaint_bin = $e.$notaint if $e; 49 | eval { 50 | $d = decode($name, $notaint_bin); 51 | }; 52 | SKIP: { 53 | skip $@, 1 if $@; 54 | ok ! tainted($d), "decode $name"; 55 | } 56 | } 57 | Encode::_utf8_on($bin); 58 | ok(!Encode::is_utf8($bin), "Encode::_utf8_on does not work on tainted values"); 59 | Encode::_utf8_off($str); 60 | ok(Encode::is_utf8($str), "Encode::_utf8_off does not work on tainted values"); 61 | -------------------------------------------------------------------------------- /t/truncated_utf8.t: -------------------------------------------------------------------------------- 1 | BEGIN { 2 | if ($ENV{'PERL_CORE'}) { 3 | chdir 't'; 4 | unshift @INC, '../lib'; 5 | } 6 | require Config; Config->import(); 7 | if ($Config{'extensions'} !~ /\bEncode\b/) { 8 | print "1..0 # Skip: Encode was not built\n"; 9 | exit 0; 10 | } 11 | if (ord("A") == 193) { 12 | print "1..0 # Skip: EBCDIC\n"; 13 | exit 0; 14 | } 15 | if ( $] < 5.009 ) { 16 | print "1..0 # Skip: Perl <= 5.9 or later required\n"; 17 | exit 0; 18 | } 19 | $| = 1; 20 | } 21 | 22 | use strict; 23 | use warnings; 24 | 25 | use Encode; 26 | use PerlIO::encoding; 27 | $PerlIO::encoding::fallback &= ~(Encode::WARN_ON_ERR|Encode::PERLQQ); 28 | 29 | use Test::More tests => 9; 30 | 31 | binmode Test::More->builder->failure_output, ":utf8"; 32 | binmode Test::More->builder->todo_output, ":utf8"; 33 | 34 | is(decode("UTF-8", "\xfd\xfe"), "\x{fffd}" x 2); 35 | is(decode("UTF-8", "\xfd\xfe\xff"), "\x{fffd}" x 3); 36 | is(decode("UTF-8", "\xfd\xfe\xff\xe0"), "\x{fffd}" x 4); 37 | is(decode("UTF-8", "\xfd\xfe\xff\xe0\xe1"), "\x{fffd}" x 5); 38 | is(decode("UTF-8", "\xc1\x9f"), "\x{fffd}"); 39 | is(decode("UTF-8", "\xFF\x80\x90\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80"), "\x{fffd}"); 40 | is(decode("UTF-8", "\xF0\x80\x80\x80"), "\x{fffd}"); 41 | 42 | SKIP: { 43 | # infinite loop due to bug: https://rt.perl.org/Public/Bug/Display.html?id=41442 44 | skip "Perl Version ($]) is older than v5.8.9", 2 if $] < 5.008009; 45 | my $str = ("x" x 1023) . "\xfd\xfe\xffx"; 46 | open my $fh, '<:encoding(UTF-8)', \$str; 47 | my $str2 = <$fh>; 48 | close $fh; 49 | is($str2, ("x" x 1023) . ("\x{fffd}" x 3) . "x"); 50 | 51 | TODO: { 52 | local $TODO = "bug in perlio" if $] < 5.027009; 53 | my $str = ("x" x 1023) . "\xfd\xfe\xff"; 54 | open my $fh, '<:encoding(UTF-8)', \$str; 55 | my $str2 = <$fh>; 56 | close $fh; 57 | is($str2, ("x" x 1023) . ("\x{fffd}" x 3)); 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /t/undef.t: -------------------------------------------------------------------------------- 1 | use strict; 2 | use warnings FATAL => 'all'; 3 | 4 | use Test::More; 5 | 6 | use Encode qw(encode decode find_encoding); 7 | use Encode::Encoder qw(encoder); 8 | 9 | local %Encode::ExtModule = %Encode::Config::ExtModule; 10 | 11 | my @names = Encode->encodings(':all'); 12 | 13 | plan tests => 1 + 4 * @names; 14 | 15 | my $emptyutf8; 16 | eval { my $c = encoder($emptyutf8)->utf8; }; 17 | ok(!$@,"crashed encoding undef variable ($@)"); 18 | 19 | for my $name (@names) { 20 | my $enc = find_encoding($name); 21 | is($enc->encode(undef), undef, "find_encoding('$name')->encode(undef) returns undef"); 22 | is($enc->decode(undef), undef, "find_encoding('$name')->decode(undef) returns undef"); 23 | is(encode($name, undef), undef, "encode('$name', undef) returns undef"); 24 | is(decode($name, undef), undef, "decode('$name', undef) returns undef"); 25 | } 26 | -------------------------------------------------------------------------------- /t/unibench.pl: -------------------------------------------------------------------------------- 1 | #!./perl 2 | 3 | use strict; 4 | use Encode; 5 | use Benchmark qw(:all); 6 | 7 | my $Count = shift @ARGV; 8 | $Count ||= 16; 9 | my @sizes = @ARGV || (1, 4, 16); 10 | 11 | my %utf8_seed; 12 | for my $i (0x00..0xff){ 13 | my $c = chr($i); 14 | $utf8_seed{BMP} .= ($c =~ /^\p{IsPrint}/o) ? $c : " "; 15 | } 16 | utf8::upgrade($utf8_seed{BMP}); 17 | 18 | for my $i (0x00..0xff){ 19 | my $c = chr(0x10000+$i); 20 | $utf8_seed{HIGH} .= ($c =~ /^\p{IsPrint}/o) ? $c : " "; 21 | } 22 | utf8::upgrade($utf8_seed{HIGH}); 23 | 24 | my %S; 25 | for my $i (@sizes){ 26 | my $sz = 256 * $i; 27 | for my $cp (qw(BMP HIGH)){ 28 | $S{utf8}{$sz}{$cp} = $utf8_seed{$cp} x $i; 29 | $S{utf16}{$sz}{$cp} = encode('UTF-16BE', $S{utf8}{$sz}{$cp}); 30 | } 31 | } 32 | 33 | for my $i (@sizes){ 34 | my $sz = $i * 256; 35 | my $count = $Count * int(256/$i); 36 | for my $cp (qw(BMP HIGH)){ 37 | for my $op (qw(encode decode)){ 38 | my ($meth, $from, $to) = ($op eq 'encode') ? 39 | (\&encode, 'utf8', 'utf16') : (\&decode, 'utf16', 'utf8'); 40 | my $XS = sub { 41 | Encode::Unicode::set_transcoder("xs"); 42 | $meth->('UTF-16BE', $S{$from}{$sz}{$cp}) 43 | eq $S{$to}{$sz}{$cp} 44 | or die "$op,$from,$to,$sz,$cp"; 45 | }; 46 | my $modern = sub { 47 | Encode::Unicode::set_transcoder("modern"); 48 | $meth->('UTF-16BE', $S{$from}{$sz}{$cp}) 49 | eq $S{$to}{$sz}{$cp} 50 | or die "$op,$from,$to,$sz,$cp"; 51 | }; 52 | my $classic = sub { 53 | Encode::Unicode::set_transcoder("classic"); 54 | $meth->('UTF-16BE', $S{$from}{$sz}{$cp}) 55 | eq $S{$to}{$sz}{$cp} or 56 | die "$op,$from,$to,$sz,$cp"; 57 | }; 58 | print "---- $op length=$sz/range=$cp ----\n"; 59 | my $r = timethese($count, 60 | { 61 | "XS" => $XS, 62 | "Modern" => $modern, 63 | "Classic" => $classic, 64 | }, 65 | 'none', 66 | ); 67 | cmpthese($r); 68 | } 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /t/use-Encode-Alias.t: -------------------------------------------------------------------------------- 1 | use strict; 2 | use warnings; 3 | 4 | use Encode::Alias; 5 | use open ":std", ":locale"; 6 | 7 | print "1..1\n"; 8 | print "ok 1 - use Encode::Alias works\n"; 9 | -------------------------------------------------------------------------------- /t/utf8ref.t: -------------------------------------------------------------------------------- 1 | # 2 | # $Id: utf8ref.t,v 1.2 2016/10/28 05:03:52 dankogai Exp $ 3 | # 4 | 5 | use strict; 6 | use warnings; 7 | use Encode; 8 | use Test::More; 9 | plan tests => 12; 10 | #plan 'no_plan'; 11 | 12 | # my $a = find_encoding('ASCII'); 13 | my $u = find_encoding('UTF-8'); 14 | my $r = []; 15 | no warnings 'uninitialized'; 16 | is encode_utf8($r), ''.$r; 17 | is $u->encode($r), ''.$r; 18 | $r = {}; 19 | is decode_utf8($r), ''.$r; 20 | is $u->decode($r), ''.$r; 21 | use warnings 'uninitialized'; 22 | 23 | is encode_utf8(undef), undef; 24 | is decode_utf8(undef), undef; 25 | 26 | is encode_utf8(''), ''; 27 | is decode_utf8(''), ''; 28 | 29 | is Encode::encode('utf8', undef), undef; 30 | is Encode::decode('utf8', undef), undef; 31 | 32 | is Encode::encode('utf8', ''), ''; 33 | is Encode::decode('utf8', ''), ''; 34 | -------------------------------------------------------------------------------- /t/utf8strict.t: -------------------------------------------------------------------------------- 1 | #!../perl 2 | our $DEBUG = @ARGV; 3 | our (%ORD, %SEQ, $NTESTS); 4 | BEGIN { 5 | if ($ENV{'PERL_CORE'}){ 6 | chdir 't'; 7 | unshift @INC, '../lib'; 8 | } 9 | require Config; Config->import(); 10 | if ($Config{'extensions'} !~ /\bEncode\b/) { 11 | print "1..0 # Skip: Encode was not built\n"; 12 | exit 0; 13 | } 14 | if ($] <= 5.008 and !$Config{perl_patchlevel}){ 15 | print "1..0 # Skip: Perl 5.8.1 or later required\n"; 16 | exit 0; 17 | } 18 | # http://smontagu.damowmow.com/utf8test.html 19 | # The numbers below, like 2.1.2 are test numbers on this web page 20 | %ORD = ( 21 | 0x00000080 => 0, # 2.1.2 22 | 0x00000800 => 0, # 2.1.3 23 | 0x00010000 => 0, # 2.1.4 24 | 0x00200000 => 1, # 2.1.5 25 | 0x00400000 => 1, # 2.1.6 26 | 0x0000007F => 0, # 2.2.1 -- unmapped okay 27 | 0x000007FF => 0, # 2.2.2 28 | 0x0000FFFF => 1, # 2.2.3 29 | 0x001FFFFF => 1, # 2.2.4 30 | 0x03FFFFFF => 1, # 2.2.5 31 | 0x7FFFFFFF => 1, # 2.2.6 32 | 0x0000D800 => 1, # 5.1.1 33 | 0x0000DB7F => 1, # 5.1.2 34 | 0x0000D880 => 1, # 5.1.3 35 | 0x0000DBFF => 1, # 5.1.4 36 | 0x0000DC00 => 1, # 5.1.5 37 | 0x0000DF80 => 1, # 5.1.6 38 | 0x0000DFFF => 1, # 5.1.7 39 | # 5.2 "Paird UTF-16 surrogates skipped 40 | # because utf-8-strict raises exception at the first one 41 | 0x0000FFFF => 1, # 5.3.1 42 | ); 43 | $NTESTS += scalar keys %ORD; 44 | if (ord('A') == 193) { 45 | %SEQ = ( 46 | qq/dd 64 73 73/ => 0, # 2.3.1 47 | qq/dd 67 41 41/ => 0, # 2.3.2 48 | qq/ee 42 73 73 71/ => 0, # 2.3.3 49 | qq/f4 90 80 80/ => 1, # 2.3.4 -- out of range so NG 50 | # EBCDIC TODO: "3 Malformed sequences" 51 | # EBCDIC TODO: "4 Overlong sequences" 52 | ); 53 | } else { 54 | %SEQ = ( 55 | qq/ed 9f bf/ => 0, # 2.3.1 56 | qq/ee 80 80/ => 0, # 2.3.2 57 | qq/f4 8f bf bd/ => 0, # 2.3.3 58 | qq/f4 90 80 80/ => 1, # 2.3.4 -- out of range so NG 59 | qq/80/ => 1, # 3.1.1 60 | qq/bf/ => 1, # 3.1.2 61 | qq/80 bf/ => 1, # 3.1.3 62 | qq/80 bf 80/ => 1, # 3.1.4 63 | qq/80 bf 80 bf/ => 1, # 3.1.5 64 | qq/80 bf 80 bf 80/ => 1, # 3.1.6 65 | qq/80 bf 80 bf 80 bf/ => 1, # 3.1.7 66 | qq/80 bf 80 bf 80 bf 80/ => 1, # 3.1.8 67 | qq/80 81 82 83 84 85 86 87 88 89 8a 8b 8c 8d 8e 8f 90 91 92 93 94 95 96 97 98 99 9a 9b 9c 9d 9e 9f a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aa ab ac ad ae af b0 b1 b2 b3 b4 b5 b6 b7 b8 b9 ba bb bc bd be bf/ => 1, # 3.1.9 68 | qq/c0 20 c1 20 c2 20 c3 20 c4 20 c5 20 c6 20 c7 20 c8 20 c9 20 ca 20 cb 20 cc 20 cd 20 ce 20 cf 20 d0 20 d1 20 d2 20 d3 20 d4 20 d5 20 d6 20 d7 20 d8 20 d9 20 da 20 db 20 dc 20 dd 20 de 20 df 20/ => 1, # 3.2.1 69 | qq/e0 20 e1 20 e2 20 e3 20 e4 20 e5 20 e6 20 e7 20 e8 20 e9 20 ea 20 eb 20 ec 20 ed 20 ee 20 ef 20/ => 1, # 3.2.2 70 | qq/f0 20 f1 20 f2 20 f3 20 f4 20 f5 20 f6 20 f7 20/ => 1, # 3.2.3 71 | qq/f8 20 f9 20 fa 20 fb 20/ => 1, # 3.2.4 72 | qq/fc 20 fd 20/ => 1, # 3.2.5 73 | qq/c0/ => 1, # 3.3.1 74 | qq/e0 80/ => 1, # 3.3.2 75 | qq/f0 80 80/ => 1, # 3.3.3 76 | qq/f8 80 80 80/ => 1, # 3.3.4 77 | qq/fc 80 80 80 80/ => 1, # 3.3.5 78 | qq/df/ => 1, # 3.3.6 79 | qq/ef bf/ => 1, # 3.3.7 80 | qq/f7 bf bf/ => 1, # 3.3.8 81 | qq/fb bf bf bf/ => 1, # 3.3.9 82 | qq/fd bf bf bf bf/ => 1, # 3.3.10 83 | qq/c0 e0 80 f0 80 80 f8 80 80 80 fc 80 80 80 80 df ef bf f7 bf bf fb bf bf bf fd bf bf bf bf/ => 1, # 3.4.1 84 | qq/fe/ => 1, # 3.5.1 85 | qq/ff/ => 1, # 3.5.2 86 | qq/fe fe ff ff/ => 1, # 3.5.3 87 | qq/c0 af/ => 1, # 4.1.1 88 | qq/e0 80 af/ => 1, # 4.1.2 89 | qq/f0 80 80 af/ => 1, # 4.1.3 90 | qq/f8 80 80 80 af/ => 1, # 4.1.4 91 | qq/fc 80 80 80 80 af/ => 1, # 4.1.5 92 | qq/c1 bf/ => 1, # 4.2.1 93 | qq/e0 9f bf/ => 1, # 4.2.2 94 | qq/f0 8f bf bf/ => 1, # 4.2.3 95 | qq/f8 87 bf bf bf/ => 1, # 4.2.4 96 | qq/fc 83 bf bf bf bf/ => 1, # 4.2.5 97 | qq/c0 80/ => 1, # 4.3.1 98 | qq/e0 80 80/ => 1, # 4.3.2 99 | qq/f0 80 80 80/ => 1, # 4.3.3 100 | qq/f8 80 80 80 80/ => 1, # 4.3.4 101 | qq/fc 80 80 80 80 80/ => 1, # 4.3.5 102 | ); 103 | } 104 | $NTESTS += scalar keys %SEQ; 105 | } 106 | use strict; 107 | use Encode; 108 | use utf8; 109 | use Test::More tests => $NTESTS; 110 | 111 | local($SIG{__WARN__}) = sub { $DEBUG and $@ and print STDERR $@ }; 112 | 113 | my $d = find_encoding("utf-8-strict"); 114 | for my $u (sort keys %ORD){ 115 | my $c = chr($u); 116 | eval { $d->encode($c,1) }; 117 | $DEBUG and $@ and warn $@; 118 | my $t = $@ ? 1 : 0; 119 | is($t, $ORD{$u}, sprintf "U+%04X", $u); 120 | } 121 | for my $s (sort keys %SEQ){ 122 | my $o = pack "C*" => map {hex} split /\s+/, $s; 123 | eval { $d->decode($o,1) }; 124 | $DEBUG and $@ and warn $@; 125 | my $t = $@ ? 1 : 0; 126 | is($t, $SEQ{$s}, "sequence: $s"); 127 | } 128 | 129 | __END__ 130 | 131 | 132 | -------------------------------------------------------------------------------- /t/utf8warnings.t: -------------------------------------------------------------------------------- 1 | use strict; 2 | use warnings; 3 | BEGIN { 'warnings'->unimport('utf8') if $] < 5.014 }; # turn off 'UTF-16 surrogate 0xd800' warnings 4 | 5 | use Test::More; 6 | use Encode qw(encode decode FB_CROAK LEAVE_SRC); 7 | 8 | my $script = quotemeta $0; 9 | 10 | plan tests => 12; 11 | 12 | my @invalid; 13 | 14 | ok ! defined eval { encode('UTF-8', "\x{D800}", FB_CROAK | LEAVE_SRC) }, 'Surrogate codepoint \x{D800} is not encoded to strict UTF-8'; 15 | like $@, qr/^"\\x\{d800\}" does not map to UTF-8 at $script line /, 'Error message contains strict UTF-8 name'; 16 | @invalid = (); 17 | encode('UTF-8', "\x{D800}", sub { @invalid = @_; return ""; }); 18 | is_deeply \@invalid, [ 0xD800 ], 'Fallback coderef contains invalid codepoint 0xD800'; 19 | 20 | ok ! defined eval { decode('UTF-8', "\xed\xa0\x80", FB_CROAK | LEAVE_SRC) }, 'Surrogate UTF-8 byte sequence \xED\xA0\x80 is decoded with strict UTF-8 decoder'; 21 | like $@, qr/^UTF-8 "\\xED\\xA0\\x80" does not map to Unicode at $script line /, 'Error message contains strict UTF-8 name and original (not decoded) invalid sequence'; 22 | @invalid = (); 23 | decode('UTF-8', "\xed\xa0\x80", sub { @invalid = @_; return ""; }); 24 | is_deeply \@invalid, [ 0xED, 0xA0, 0x80 ], 'Fallback coderef contains invalid byte sequence 0xED, 0xA0, 0x80'; 25 | 26 | ok ! defined eval { decode('UTF-8', "\xed\xa0", FB_CROAK | LEAVE_SRC) }, 'Invalid byte sequence \xED\xA0 is not decoded with strict UTF-8 decoder'; 27 | like $@, qr/^UTF-8 "\\xED\\xA0" does not map to Unicode at $script line /, 'Error message contains strict UTF-8 name and original (not decoded) invalid sequence'; 28 | @invalid = (); 29 | decode('UTF-8', "\xed\xa0", sub { @invalid = @_; return ""; }); 30 | is_deeply \@invalid, [ 0xED, 0xA0 ], 'Fallback coderef contains invalid byte sequence 0xED, 0xA0'; 31 | 32 | ok ! defined eval { decode('utf8', "\xed\xa0", FB_CROAK | LEAVE_SRC) }, 'Invalid byte sequence \xED\xA0 is not decoded with non-strict utf8 decoder'; 33 | like $@, qr/^utf8 "\\xED\\xA0" does not map to Unicode at $script line /, 'Error message contains non-strict utf8 name and original (not decoded) invalid sequence'; 34 | decode('utf8', "\xed\xa0", sub { @invalid = @_; return ""; }); 35 | is_deeply \@invalid, [ 0xED, 0xA0 ], 'Fallback coderef contains invalid byte sequence 0xED, 0xA0'; 36 | -------------------------------------------------------------------------------- /t/whatwg-aliases.t: -------------------------------------------------------------------------------- 1 | # This test checks aliases support based on the list in the 2 | # WHATWG Encoding Living Standard 3 | # 4 | # https://encoding.spec.whatwg.org/ 5 | # 6 | # The input of this test is the file whatwg-aliases.json downloaded from 7 | # https://encoding.spec.whatwg.org/encodings.json 8 | # 9 | # To run: 10 | # AUTHOR_TESTING=1 prove -l t/whatwg-aliases.t 11 | 12 | 13 | use Test::More 14 | $ENV{AUTHOR_TESTING} 15 | ? 'no_plan' 16 | : (skip_all => 'For maintainers only'); 17 | use Encode 'find_encoding'; 18 | use JSON::PP 'decode_json'; 19 | use File::Spec; 20 | use FindBin; 21 | 22 | my $encodings = decode_json(do { 23 | # https://encoding.spec.whatwg.org/encodings.json 24 | open my $f, '<', File::Spec->catdir($FindBin::Bin, 'whatwg-aliases.json'); 25 | local $/; 26 | <$f> 27 | }); 28 | 29 | my %IGNORE = map { $_ => '' } qw( 30 | replacement 31 | utf8 32 | ); 33 | 34 | my %TODO = ( 35 | 'ISO-8859-8-I' => 'Not supported', 36 | 'gb18030' => 'Not supported', 37 | '866' => 'Not supported', 38 | 'x-user-defined' => 'Not supported', 39 | # ... 40 | ); 41 | 42 | for my $section (@$encodings) { 43 | for my $enc (@{$section->{encodings}}) { 44 | 45 | my $name = $enc->{name}; 46 | 47 | next if exists $IGNORE{$name}; 48 | 49 | local $TODO = $TODO{$name} if exists $TODO{$name}; 50 | 51 | my $encoding = find_encoding($name); 52 | isa_ok($encoding, 'Encode::Encoding', $name); 53 | 54 | for my $label (@{$enc->{labels}}) { 55 | local $TODO = $TODO{$label} if exists $TODO{$label}; 56 | 57 | my $e = find_encoding($label); 58 | if (isa_ok($e, 'Encode::Encoding', $label)) { 59 | next if exists $IGNORE{$label}; 60 | is($e->name, $encoding->name, "$label ->name is $name") 61 | } 62 | } 63 | } 64 | } 65 | 66 | done_testing; 67 | -------------------------------------------------------------------------------- /t/xml.t: -------------------------------------------------------------------------------- 1 | BEGIN { 2 | if ( $] < 5.009 ) { 3 | print "1..0 # Skip: Perl <= 5.9 or later required\n"; 4 | exit 0; 5 | } 6 | } 7 | use strict; 8 | use warnings; 9 | 10 | use Encode; 11 | use Test::More; 12 | 13 | my $content = String->new("--\x{30c6}--"); 14 | my $text = Encode::encode('latin1', $content, Encode::FB_XMLCREF); 15 | is $text, "--テ--"; 16 | 17 | done_testing(); 18 | 19 | package String; 20 | use overload 21 | '""' => sub { ${$_[0]} }, fallback => 1; 22 | 23 | sub new { 24 | my($class, $str) = @_; 25 | bless \$str, $class; 26 | } 27 | 28 | 1; 29 | -------------------------------------------------------------------------------- /ucm/ascii.ucm: -------------------------------------------------------------------------------- 1 | # 2 | # $Id: ascii.ucm,v 2.0 2004/05/16 20:55:19 dankogai Exp $ 3 | # 4 | "ascii" 5 | "US-ascii" 6 | 1 7 | 1 8 | \x3F 9 | # 10 | CHARMAP 11 | \x00 |0 # 12 | \x01 |0 # 13 | \x02 |0 # 14 | \x03 |0 # 15 | \x04 |0 # 16 | \x05 |0 # 17 | \x06 |0 # 18 | \x07 |0 # 19 | \x08 |0 # 20 | \x09 |0 # 21 | \x0A |0 # 22 | \x0B |0 # 23 | \x0C |0 # 24 | \x0D |0 # 25 | \x0E |0 # 26 | \x0F |0 # 27 | \x10 |0 # 28 | \x11 |0 # 29 | \x12 |0 # 30 | \x13 |0 # 31 | \x14 |0 # 32 | \x15 |0 # 33 | \x16 |0 # 34 | \x17 |0 # 35 | \x18 |0 # 36 | \x19 |0 # 37 | \x1A |0 # 38 | \x1B |0 # 39 | \x1C |0 # 40 | \x1D |0 # 41 | \x1E |0 # 42 | \x1F |0 # 43 | \x20 |0 # SPACE 44 | \x21 |0 # EXCLAMATION MARK 45 | \x22 |0 # QUOTATION MARK 46 | \x23 |0 # NUMBER SIGN 47 | \x24 |0 # DOLLAR SIGN 48 | \x25 |0 # PERCENT SIGN 49 | \x26 |0 # AMPERSAND 50 | \x27 |0 # APOSTROPHE 51 | \x28 |0 # LEFT PARENTHESIS 52 | \x29 |0 # RIGHT PARENTHESIS 53 | \x2A |0 # ASTERISK 54 | \x2B |0 # PLUS SIGN 55 | \x2C |0 # COMMA 56 | \x2D |0 # HYPHEN-MINUS 57 | \x2E |0 # FULL STOP 58 | \x2F |0 # SOLIDUS 59 | \x30 |0 # DIGIT ZERO 60 | \x31 |0 # DIGIT ONE 61 | \x32 |0 # DIGIT TWO 62 | \x33 |0 # DIGIT THREE 63 | \x34 |0 # DIGIT FOUR 64 | \x35 |0 # DIGIT FIVE 65 | \x36 |0 # DIGIT SIX 66 | \x37 |0 # DIGIT SEVEN 67 | \x38 |0 # DIGIT EIGHT 68 | \x39 |0 # DIGIT NINE 69 | \x3A |0 # COLON 70 | \x3B |0 # SEMICOLON 71 | \x3C |0 # LESS-THAN SIGN 72 | \x3D |0 # EQUALS SIGN 73 | \x3E |0 # GREATER-THAN SIGN 74 | \x3F |0 # QUESTION MARK 75 | \x40 |0 # COMMERCIAL AT 76 | \x41 |0 # LATIN CAPITAL LETTER A 77 | \x42 |0 # LATIN CAPITAL LETTER B 78 | \x43 |0 # LATIN CAPITAL LETTER C 79 | \x44 |0 # LATIN CAPITAL LETTER D 80 | \x45 |0 # LATIN CAPITAL LETTER E 81 | \x46 |0 # LATIN CAPITAL LETTER F 82 | \x47 |0 # LATIN CAPITAL LETTER G 83 | \x48 |0 # LATIN CAPITAL LETTER H 84 | \x49 |0 # LATIN CAPITAL LETTER I 85 | \x4A |0 # LATIN CAPITAL LETTER J 86 | \x4B |0 # LATIN CAPITAL LETTER K 87 | \x4C |0 # LATIN CAPITAL LETTER L 88 | \x4D |0 # LATIN CAPITAL LETTER M 89 | \x4E |0 # LATIN CAPITAL LETTER N 90 | \x4F |0 # LATIN CAPITAL LETTER O 91 | \x50 |0 # LATIN CAPITAL LETTER P 92 | \x51 |0 # LATIN CAPITAL LETTER Q 93 | \x52 |0 # LATIN CAPITAL LETTER R 94 | \x53 |0 # LATIN CAPITAL LETTER S 95 | \x54 |0 # LATIN CAPITAL LETTER T 96 | \x55 |0 # LATIN CAPITAL LETTER U 97 | \x56 |0 # LATIN CAPITAL LETTER V 98 | \x57 |0 # LATIN CAPITAL LETTER W 99 | \x58 |0 # LATIN CAPITAL LETTER X 100 | \x59 |0 # LATIN CAPITAL LETTER Y 101 | \x5A |0 # LATIN CAPITAL LETTER Z 102 | \x5B |0 # LEFT SQUARE BRACKET 103 | \x5C |0 # REVERSE SOLIDUS 104 | \x5D |0 # RIGHT SQUARE BRACKET 105 | \x5E |0 # CIRCUMFLEX ACCENT 106 | \x5F |0 # LOW LINE 107 | \x60 |0 # GRAVE ACCENT 108 | \x61 |0 # LATIN SMALL LETTER A 109 | \x62 |0 # LATIN SMALL LETTER B 110 | \x63 |0 # LATIN SMALL LETTER C 111 | \x64 |0 # LATIN SMALL LETTER D 112 | \x65 |0 # LATIN SMALL LETTER E 113 | \x66 |0 # LATIN SMALL LETTER F 114 | \x67 |0 # LATIN SMALL LETTER G 115 | \x68 |0 # LATIN SMALL LETTER H 116 | \x69 |0 # LATIN SMALL LETTER I 117 | \x6A |0 # LATIN SMALL LETTER J 118 | \x6B |0 # LATIN SMALL LETTER K 119 | \x6C |0 # LATIN SMALL LETTER L 120 | \x6D |0 # LATIN SMALL LETTER M 121 | \x6E |0 # LATIN SMALL LETTER N 122 | \x6F |0 # LATIN SMALL LETTER O 123 | \x70 |0 # LATIN SMALL LETTER P 124 | \x71 |0 # LATIN SMALL LETTER Q 125 | \x72 |0 # LATIN SMALL LETTER R 126 | \x73 |0 # LATIN SMALL LETTER S 127 | \x74 |0 # LATIN SMALL LETTER T 128 | \x75 |0 # LATIN SMALL LETTER U 129 | \x76 |0 # LATIN SMALL LETTER V 130 | \x77 |0 # LATIN SMALL LETTER W 131 | \x78 |0 # LATIN SMALL LETTER X 132 | \x79 |0 # LATIN SMALL LETTER Y 133 | \x7A |0 # LATIN SMALL LETTER Z 134 | \x7B |0 # LEFT CURLY BRACKET 135 | \x7C |0 # VERTICAL LINE 136 | \x7D |0 # RIGHT CURLY BRACKET 137 | \x7E |0 # TILDE 138 | \x7F |0 # 139 | END CHARMAP 140 | -------------------------------------------------------------------------------- /ucm/ctrl.ucm: -------------------------------------------------------------------------------- 1 | # 2 | # $Id: ctrl.ucm,v 2.0 2004/05/16 20:55:23 dankogai Exp $ 3 | # 4 | "ascii-ctrl" 5 | 1 6 | 1 7 | \x3F 8 | # 9 | CHARMAP 10 | \x00 |0 # 11 | \x01 |0 # 12 | \x02 |0 # 13 | \x03 |0 # 14 | \x04 |0 # 15 | \x05 |0 # 16 | \x06 |0 # 17 | \x07 |0 # 18 | \x08 |0 # 19 | \x09 |0 # 20 | \x0A |0 # 21 | \x0B |0 # 22 | \x0C |0 # 23 | \x0D |0 # 24 | \x0E |0 # 25 | \x0F |0 # 26 | \x10 |0 # 27 | \x11 |0 # 28 | \x12 |0 # 29 | \x13 |0 # 30 | \x14 |0 # 31 | \x15 |0 # 32 | \x16 |0 # 33 | \x17 |0 # 34 | \x18 |0 # 35 | \x19 |0 # 36 | \x1A |0 # 37 | \x1B |0 # 38 | \x1C |0 # 39 | \x1D |0 # 40 | \x1E |0 # 41 | \x1F |0 # 42 | END CHARMAP 43 | -------------------------------------------------------------------------------- /ucm/macUkraine.ucm: -------------------------------------------------------------------------------- 1 | # 2 | # $Id: macUkraine.ucm,v 2.0 2004/05/16 20:55:28 dankogai Exp $ 3 | # 4 | # Original table can be obtained at 5 | # http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/UKRAINE.TXT 6 | # 7 | "MacUkrainian" 8 | "MacUkraine" 9 | 1 10 | 1 11 | \x3F 12 | CHARMAP 13 | \x00 |0 # 14 | \x01 |0 # 15 | \x02 |0 # 16 | \x03 |0 # 17 | \x04 |0 # 18 | \x05 |0 # 19 | \x06 |0 # 20 | \x07 |0 # 21 | \x08 |0 # 22 | \x09 |0 # 23 | \x0A |0 # 24 | \x0B |0 # 25 | \x0C |0 # 26 | \x0D |0 # 27 | \x0E |0 # 28 | \x0F |0 # 29 | \x10 |0 # 30 | \x11 |0 # 31 | \x12 |0 # 32 | \x13 |0 # 33 | \x14 |0 # 34 | \x15 |0 # 35 | \x16 |0 # 36 | \x17 |0 # 37 | \x18 |0 # 38 | \x19 |0 # 39 | \x1A |0 # 40 | \x1B |0 # 41 | \x1C |0 # 42 | \x1D |0 # 43 | \x1E |0 # 44 | \x1F |0 # 45 | END CHARMAP 46 | -------------------------------------------------------------------------------- /ucm/null.ucm: -------------------------------------------------------------------------------- 1 | # 2 | # $Id: null.ucm,v 2.0 2004/05/16 20:55:28 dankogai Exp $ 3 | # 4 | "null" 5 | 1 6 | 1 7 | \x3F 8 | # 9 | CHARMAP 10 | END CHARMAP 11 | --------------------------------------------------------------------------------