├── t ├── sjis.html ├── 00_compile.t └── 01_guess.t ├── MANIFEST ├── MANIFEST.SKIP ├── Makefile.PL ├── Changes ├── README └── lib └── Apache └── GuessCharset.pm /t/sjis.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/miyagawa/Apache-GuessCharset/master/t/sjis.html -------------------------------------------------------------------------------- /t/00_compile.t: -------------------------------------------------------------------------------- 1 | use strict; 2 | use Test::More tests => 1; 3 | 4 | BEGIN { use_ok 'Apache::GuessCharset' } 5 | -------------------------------------------------------------------------------- /MANIFEST: -------------------------------------------------------------------------------- 1 | Changes 2 | MANIFEST 3 | Makefile.PL 4 | README 5 | lib/Apache/GuessCharset.pm 6 | t/00_compile.t 7 | t/01_guess.t 8 | t/sjis.html 9 | -------------------------------------------------------------------------------- /MANIFEST.SKIP: -------------------------------------------------------------------------------- 1 | \bRCS\b 2 | \bCVS\b 3 | ^MANIFEST\. 4 | ^Makefile$ 5 | ~$ 6 | \.old$ 7 | ^blib/ 8 | ^pm_to_blib 9 | ^MakeMaker-\d 10 | \.gz$ 11 | \.cvsignore 12 | -------------------------------------------------------------------------------- /Makefile.PL: -------------------------------------------------------------------------------- 1 | use ExtUtils::MakeMaker; 2 | require 5.7.3; 3 | WriteMakefile( 4 | 'NAME' => 'Apache::GuessCharset', 5 | 'VERSION_FROM' => 'lib/Apache/GuessCharset.pm', # finds $VERSION 6 | 'PREREQ_PM' => { 7 | 'Test::More' => 0.32, 8 | 'Encode::Guess' => 0, 9 | 'Apache::File' => 0, 10 | 'I18N::Charset' => 1.16, 11 | }, 12 | ); 13 | -------------------------------------------------------------------------------- /Changes: -------------------------------------------------------------------------------- 1 | Revision history for Perl extension Apache::GuessCharset 2 | 3 | 0.03 Sat May 11 13:31:31 JST 2002 4 | * handles any text/* requests without charset attribute 5 | * charset attribute is now lowercased 6 | - returns DECLINED if not in main request 7 | - added mock test code 8 | 9 | 0.02 Fri May 3 22:27:43 JST 2002 10 | - uses $r->finfo and fixed some style issue 11 | (Thanks to Geoffrey Young ) 12 | - CPAN release 13 | 14 | 0.01 Tue Apr 30 14:33:20 2002 15 | - original version 16 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | NAME 2 | Apache::GuessCharset - adds HTTP charset by guessing file's encoding 3 | 4 | SYNOPSIS 5 | PerlModule Apache::GuessCharset 6 | SetHandler perl-script 7 | PerlFixupHandler Apache::GuessCharset 8 | 9 | # how many bytes to read for guessing (default 512) 10 | PerlSetVar GuessCharsetBufferSize 1024 11 | 12 | # list of encoding suspects 13 | PerlSetVar GuessCharsetSuspects euc-jp 14 | PerlAddVar GuessCharsetSuspects shiftjis 15 | PerlAddVar GuessCharsetSuspects 7bit-jis 16 | 17 | DESCRIPTION 18 | Apache::GuessCharset is an Apache handler which adds HTTP charset 19 | attribute by automaticaly guessing file' encodings via Encode::Guess. 20 | 21 | CONFIGURATION 22 | This module uses following configuration variables. 23 | 24 | GuessCharsetSuspects 25 | a list of encodings for "Encode::Guess" to check. See the 26 | Encode::Guess manpage for details. 27 | 28 | GuessCharsetBufferSize 29 | specifies how many bytes for this module to read from source file, 30 | to properly guess encodings. default is 512. 31 | 32 | AUTHOR 33 | Tatsuhiko Miyagawa 34 | 35 | This library is free software; you can redistribute it and/or modify it 36 | under the same terms as Perl itself. 37 | 38 | SEE ALSO 39 | the Encode::Guess manpage, the Apache::File manpage 40 | 41 | -------------------------------------------------------------------------------- /t/01_guess.t: -------------------------------------------------------------------------------- 1 | use Test::More tests => 6; 2 | 3 | no warnings 'once'; 4 | use Apache::GuessCharset; 5 | use Apache::FakeRequest; 6 | use FileHandle; 7 | 8 | @Apache::File::ISA = qw(FileHandle); 9 | 10 | package Mock::Apache::Table; 11 | sub get { @{$_[0]->{$_[1]}} } 12 | 13 | local *Apache::FakeRequest::finfo = sub { 14 | FileHandle->new(shift->{filename}); 15 | }; 16 | 17 | local *Apache::FakeRequest::dir_config = sub { 18 | my $self = shift; 19 | return @_ ? @{$self->{dir_config}->{$_[0]}} 20 | : bless $self->{dir_config}, 'Mock::Apache::Table'; 21 | }; 22 | 23 | package main; 24 | 25 | { 26 | my $r = Apache::FakeRequest->new( 27 | is_main => 1, 28 | filename => "t/sjis.html", 29 | content_type => 'text/html', 30 | dir_config => { 31 | GuessCharsetSuspects => [ qw(euc-jp shiftjis 7bit-jis) ], 32 | }, 33 | ); 34 | 35 | my $code = Apache::GuessCharset::handler($r); 36 | is $code, Apache::Constants::OK, 'status code is OK'; 37 | is $r->content_type, 'text/html; charset=shift_jis', 'encoding is shift_jis'; 38 | } 39 | 40 | { 41 | my $r = Apache::FakeRequest->new( 42 | is_main => 1, 43 | filename => "t/sjis.html", 44 | content_type => 'text/plain', 45 | dir_config => { 46 | GuessCharsetSuspects => [ qw(shiftjis) ], 47 | }, 48 | ); 49 | 50 | my $code = Apache::GuessCharset::handler($r); 51 | is $code, Apache::Constants::OK, 'status code is OK: should work with text/plain'; 52 | is $r->content_type, 'text/plain; charset=shift_jis', 'encoding is shift_jis'; 53 | } 54 | 55 | { 56 | my $r = Apache::FakeRequest->new( 57 | is_main => 1, 58 | filename => "t", 59 | content_type => 'text/plain', 60 | dir_config => { 61 | GuessCharsetSuspects => [ qw(shiftjis) ], 62 | }, 63 | ); 64 | 65 | my $code = Apache::GuessCharset::handler($r); 66 | is $code, Apache::Constants::DECLINED, 'DECLINED for directory'; 67 | } 68 | 69 | { 70 | my $r = Apache::FakeRequest->new( 71 | is_main => 0, 72 | ); 73 | 74 | my $code = Apache::GuessCharset::handler($r); 75 | is $code, Apache::Constants::DECLINED, 'DECLINED for subreq'; 76 | } 77 | -------------------------------------------------------------------------------- /lib/Apache/GuessCharset.pm: -------------------------------------------------------------------------------- 1 | package Apache::GuessCharset; 2 | 3 | use strict; 4 | our $VERSION = 0.03; 5 | our $DEBUG = 0; 6 | 7 | use Apache::Constants; 8 | use Apache::File; 9 | use Encode::Guess; 10 | use I18N::Charset; 11 | 12 | # generated from http://www.iana.org/assignments/character-sets 13 | # '/Name: (\S*)/ and $name = $1; /Alias: (\S*).*preferred MIME/ and print qq("$name" => "$1",)' 14 | our %Prefered_MIME = ( 15 | "ANSI_X3.4-1968" => "US-ASCII", 16 | "ISO_8859-1:1987" => "ISO-8859-1", 17 | "ISO_8859-2:1987" => "ISO-8859-2", 18 | "ISO_8859-3:1988" => "ISO-8859-3", 19 | "ISO_8859-4:1988" => "ISO-8859-4", 20 | "ISO_8859-6:1987" => "ISO-8859-6", 21 | "ISO_8859-6-E" => "ISO-8859-6-E", 22 | "ISO_8859-6-I" => "ISO-8859-6-I", 23 | "ISO_8859-7:1987" => "ISO-8859-7", 24 | "ISO_8859-8:1988" => "ISO-8859-8", 25 | "ISO_8859-8-E" => "ISO-8859-8-E", 26 | "ISO_8859-8-I" => "ISO-8859-8-I", 27 | "ISO_8859-5:1988" => "ISO-8859-5", 28 | "ISO_8859-9:1989" => "ISO-8859-9", 29 | "Extended_UNIX_Code_Packed_Format_for_Japanese" => "EUC-JP", 30 | ); 31 | 32 | sub handler { 33 | my $r = shift; 34 | return DECLINED if 35 | ! $r->is_main or 36 | $r->content_type !~ m@^text/@ or 37 | $r->content_type =~ /charset=/ or 38 | ! -e $r->finfo or 39 | -d _ or 40 | !(my $chunk = read_chunk($r)); 41 | 42 | my @suspects = $r->dir_config->get('GuessCharsetSuspects'); 43 | my $enc = guess_encoding($chunk, @suspects); 44 | unless (ref $enc) { 45 | warn "Couldn't guess encoding: $enc" if $DEBUG; 46 | return DECLINED; 47 | } 48 | 49 | my $iana = iana_charset_name($enc->name); 50 | my $charset = lc($Prefered_MIME{$iana} || $iana); # lowercased 51 | warn "Guessed: $charset" if $DEBUG; 52 | $r->content_type($r->content_type . "; charset=$charset"); 53 | return OK; 54 | } 55 | 56 | sub read_chunk { 57 | my $r = shift; 58 | my $fh = Apache::File->new($r->filename) or return; 59 | my $buffer_size = $r->dir_config('GuessCharsetBufferSize') || 512; 60 | read $fh, my($chunk), $buffer_size; 61 | return $chunk; 62 | } 63 | 64 | 1; 65 | __END__ 66 | 67 | =head1 NAME 68 | 69 | Apache::GuessCharset - adds HTTP charset by guessing file's encoding 70 | 71 | =head1 SYNOPSIS 72 | 73 | SetHandler perl-script 74 | PerlFixupHandler +Apache::GuessCharset 75 | 76 | # how many bytes to read for guessing (default 512) 77 | PerlSetVar GuessCharsetBufferSize 1024 78 | 79 | # list of encoding suspects 80 | PerlSetVar GuessCharsetSuspects euc-jp 81 | PerlAddVar GuessCharsetSuspects shiftjis 82 | PerlAddVar GuessCharsetSuspects 7bit-jis 83 | 84 | =head1 DESCRIPTION 85 | 86 | Apache::GuessCharset is an Apache fix-up handler which adds HTTP 87 | charset attribute by automaticaly guessing text files' encodings via 88 | Encode::Guess. 89 | 90 | =head1 CONFIGURATION 91 | 92 | This module uses following configuration variables. 93 | 94 | =over 4 95 | 96 | =item GuessCharsetSuspects 97 | 98 | a list of encodings for C to check. See 99 | L for details. 100 | 101 | =item GuessCharsetBufferSize 102 | 103 | specifies how many bytes for this module to read from source file, to 104 | properly guess encodings. default is 512. 105 | 106 | =back 107 | 108 | =head1 AUTHOR 109 | 110 | Tatsuhiko Miyagawa Emiyagawa@bulknews.netE 111 | 112 | This library is free software; you can redistribute it and/or modify 113 | it under the same terms as Perl itself. 114 | 115 | =head1 SEE ALSO 116 | 117 | L, L 118 | 119 | =cut 120 | --------------------------------------------------------------------------------