├── t
├── sjis.html
├── 00_compile.t
└── 01_guess.t
├── MANIFEST
├── MANIFEST.SKIP
├── Makefile.PL
├── Changes
├── README
└── lib
└── Apache
└── GuessCharset.pm
/t/sjis.html:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/miyagawa/Apache-GuessCharset/master/t/sjis.html
--------------------------------------------------------------------------------
/t/00_compile.t:
--------------------------------------------------------------------------------
1 | use strict;
2 | use Test::More tests => 1;
3 |
4 | BEGIN { use_ok 'Apache::GuessCharset' }
5 |
--------------------------------------------------------------------------------
/MANIFEST:
--------------------------------------------------------------------------------
1 | Changes
2 | MANIFEST
3 | Makefile.PL
4 | README
5 | lib/Apache/GuessCharset.pm
6 | t/00_compile.t
7 | t/01_guess.t
8 | t/sjis.html
9 |
--------------------------------------------------------------------------------
/MANIFEST.SKIP:
--------------------------------------------------------------------------------
1 | \bRCS\b
2 | \bCVS\b
3 | ^MANIFEST\.
4 | ^Makefile$
5 | ~$
6 | \.old$
7 | ^blib/
8 | ^pm_to_blib
9 | ^MakeMaker-\d
10 | \.gz$
11 | \.cvsignore
12 |
--------------------------------------------------------------------------------
/Makefile.PL:
--------------------------------------------------------------------------------
1 | use ExtUtils::MakeMaker;
2 | require 5.7.3;
3 | WriteMakefile(
4 | 'NAME' => 'Apache::GuessCharset',
5 | 'VERSION_FROM' => 'lib/Apache/GuessCharset.pm', # finds $VERSION
6 | 'PREREQ_PM' => {
7 | 'Test::More' => 0.32,
8 | 'Encode::Guess' => 0,
9 | 'Apache::File' => 0,
10 | 'I18N::Charset' => 1.16,
11 | },
12 | );
13 |
--------------------------------------------------------------------------------
/Changes:
--------------------------------------------------------------------------------
1 | Revision history for Perl extension Apache::GuessCharset
2 |
3 | 0.03 Sat May 11 13:31:31 JST 2002
4 | * handles any text/* requests without charset attribute
5 | * charset attribute is now lowercased
6 | - returns DECLINED if not in main request
7 | - added mock test code
8 |
9 | 0.02 Fri May 3 22:27:43 JST 2002
10 | - uses $r->finfo and fixed some style issue
11 | (Thanks to Geoffrey Young )
12 | - CPAN release
13 |
14 | 0.01 Tue Apr 30 14:33:20 2002
15 | - original version
16 |
--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
1 | NAME
2 | Apache::GuessCharset - adds HTTP charset by guessing file's encoding
3 |
4 | SYNOPSIS
5 | PerlModule Apache::GuessCharset
6 | SetHandler perl-script
7 | PerlFixupHandler Apache::GuessCharset
8 |
9 | # how many bytes to read for guessing (default 512)
10 | PerlSetVar GuessCharsetBufferSize 1024
11 |
12 | # list of encoding suspects
13 | PerlSetVar GuessCharsetSuspects euc-jp
14 | PerlAddVar GuessCharsetSuspects shiftjis
15 | PerlAddVar GuessCharsetSuspects 7bit-jis
16 |
17 | DESCRIPTION
18 | Apache::GuessCharset is an Apache handler which adds HTTP charset
19 | attribute by automaticaly guessing file' encodings via Encode::Guess.
20 |
21 | CONFIGURATION
22 | This module uses following configuration variables.
23 |
24 | GuessCharsetSuspects
25 | a list of encodings for "Encode::Guess" to check. See the
26 | Encode::Guess manpage for details.
27 |
28 | GuessCharsetBufferSize
29 | specifies how many bytes for this module to read from source file,
30 | to properly guess encodings. default is 512.
31 |
32 | AUTHOR
33 | Tatsuhiko Miyagawa
34 |
35 | This library is free software; you can redistribute it and/or modify it
36 | under the same terms as Perl itself.
37 |
38 | SEE ALSO
39 | the Encode::Guess manpage, the Apache::File manpage
40 |
41 |
--------------------------------------------------------------------------------
/t/01_guess.t:
--------------------------------------------------------------------------------
1 | use Test::More tests => 6;
2 |
3 | no warnings 'once';
4 | use Apache::GuessCharset;
5 | use Apache::FakeRequest;
6 | use FileHandle;
7 |
8 | @Apache::File::ISA = qw(FileHandle);
9 |
10 | package Mock::Apache::Table;
11 | sub get { @{$_[0]->{$_[1]}} }
12 |
13 | local *Apache::FakeRequest::finfo = sub {
14 | FileHandle->new(shift->{filename});
15 | };
16 |
17 | local *Apache::FakeRequest::dir_config = sub {
18 | my $self = shift;
19 | return @_ ? @{$self->{dir_config}->{$_[0]}}
20 | : bless $self->{dir_config}, 'Mock::Apache::Table';
21 | };
22 |
23 | package main;
24 |
25 | {
26 | my $r = Apache::FakeRequest->new(
27 | is_main => 1,
28 | filename => "t/sjis.html",
29 | content_type => 'text/html',
30 | dir_config => {
31 | GuessCharsetSuspects => [ qw(euc-jp shiftjis 7bit-jis) ],
32 | },
33 | );
34 |
35 | my $code = Apache::GuessCharset::handler($r);
36 | is $code, Apache::Constants::OK, 'status code is OK';
37 | is $r->content_type, 'text/html; charset=shift_jis', 'encoding is shift_jis';
38 | }
39 |
40 | {
41 | my $r = Apache::FakeRequest->new(
42 | is_main => 1,
43 | filename => "t/sjis.html",
44 | content_type => 'text/plain',
45 | dir_config => {
46 | GuessCharsetSuspects => [ qw(shiftjis) ],
47 | },
48 | );
49 |
50 | my $code = Apache::GuessCharset::handler($r);
51 | is $code, Apache::Constants::OK, 'status code is OK: should work with text/plain';
52 | is $r->content_type, 'text/plain; charset=shift_jis', 'encoding is shift_jis';
53 | }
54 |
55 | {
56 | my $r = Apache::FakeRequest->new(
57 | is_main => 1,
58 | filename => "t",
59 | content_type => 'text/plain',
60 | dir_config => {
61 | GuessCharsetSuspects => [ qw(shiftjis) ],
62 | },
63 | );
64 |
65 | my $code = Apache::GuessCharset::handler($r);
66 | is $code, Apache::Constants::DECLINED, 'DECLINED for directory';
67 | }
68 |
69 | {
70 | my $r = Apache::FakeRequest->new(
71 | is_main => 0,
72 | );
73 |
74 | my $code = Apache::GuessCharset::handler($r);
75 | is $code, Apache::Constants::DECLINED, 'DECLINED for subreq';
76 | }
77 |
--------------------------------------------------------------------------------
/lib/Apache/GuessCharset.pm:
--------------------------------------------------------------------------------
1 | package Apache::GuessCharset;
2 |
3 | use strict;
4 | our $VERSION = 0.03;
5 | our $DEBUG = 0;
6 |
7 | use Apache::Constants;
8 | use Apache::File;
9 | use Encode::Guess;
10 | use I18N::Charset;
11 |
12 | # generated from http://www.iana.org/assignments/character-sets
13 | # '/Name: (\S*)/ and $name = $1; /Alias: (\S*).*preferred MIME/ and print qq("$name" => "$1",)'
14 | our %Prefered_MIME = (
15 | "ANSI_X3.4-1968" => "US-ASCII",
16 | "ISO_8859-1:1987" => "ISO-8859-1",
17 | "ISO_8859-2:1987" => "ISO-8859-2",
18 | "ISO_8859-3:1988" => "ISO-8859-3",
19 | "ISO_8859-4:1988" => "ISO-8859-4",
20 | "ISO_8859-6:1987" => "ISO-8859-6",
21 | "ISO_8859-6-E" => "ISO-8859-6-E",
22 | "ISO_8859-6-I" => "ISO-8859-6-I",
23 | "ISO_8859-7:1987" => "ISO-8859-7",
24 | "ISO_8859-8:1988" => "ISO-8859-8",
25 | "ISO_8859-8-E" => "ISO-8859-8-E",
26 | "ISO_8859-8-I" => "ISO-8859-8-I",
27 | "ISO_8859-5:1988" => "ISO-8859-5",
28 | "ISO_8859-9:1989" => "ISO-8859-9",
29 | "Extended_UNIX_Code_Packed_Format_for_Japanese" => "EUC-JP",
30 | );
31 |
32 | sub handler {
33 | my $r = shift;
34 | return DECLINED if
35 | ! $r->is_main or
36 | $r->content_type !~ m@^text/@ or
37 | $r->content_type =~ /charset=/ or
38 | ! -e $r->finfo or
39 | -d _ or
40 | !(my $chunk = read_chunk($r));
41 |
42 | my @suspects = $r->dir_config->get('GuessCharsetSuspects');
43 | my $enc = guess_encoding($chunk, @suspects);
44 | unless (ref $enc) {
45 | warn "Couldn't guess encoding: $enc" if $DEBUG;
46 | return DECLINED;
47 | }
48 |
49 | my $iana = iana_charset_name($enc->name);
50 | my $charset = lc($Prefered_MIME{$iana} || $iana); # lowercased
51 | warn "Guessed: $charset" if $DEBUG;
52 | $r->content_type($r->content_type . "; charset=$charset");
53 | return OK;
54 | }
55 |
56 | sub read_chunk {
57 | my $r = shift;
58 | my $fh = Apache::File->new($r->filename) or return;
59 | my $buffer_size = $r->dir_config('GuessCharsetBufferSize') || 512;
60 | read $fh, my($chunk), $buffer_size;
61 | return $chunk;
62 | }
63 |
64 | 1;
65 | __END__
66 |
67 | =head1 NAME
68 |
69 | Apache::GuessCharset - adds HTTP charset by guessing file's encoding
70 |
71 | =head1 SYNOPSIS
72 |
73 | SetHandler perl-script
74 | PerlFixupHandler +Apache::GuessCharset
75 |
76 | # how many bytes to read for guessing (default 512)
77 | PerlSetVar GuessCharsetBufferSize 1024
78 |
79 | # list of encoding suspects
80 | PerlSetVar GuessCharsetSuspects euc-jp
81 | PerlAddVar GuessCharsetSuspects shiftjis
82 | PerlAddVar GuessCharsetSuspects 7bit-jis
83 |
84 | =head1 DESCRIPTION
85 |
86 | Apache::GuessCharset is an Apache fix-up handler which adds HTTP
87 | charset attribute by automaticaly guessing text files' encodings via
88 | Encode::Guess.
89 |
90 | =head1 CONFIGURATION
91 |
92 | This module uses following configuration variables.
93 |
94 | =over 4
95 |
96 | =item GuessCharsetSuspects
97 |
98 | a list of encodings for C to check. See
99 | L for details.
100 |
101 | =item GuessCharsetBufferSize
102 |
103 | specifies how many bytes for this module to read from source file, to
104 | properly guess encodings. default is 512.
105 |
106 | =back
107 |
108 | =head1 AUTHOR
109 |
110 | Tatsuhiko Miyagawa Emiyagawa@bulknews.netE
111 |
112 | This library is free software; you can redistribute it and/or modify
113 | it under the same terms as Perl itself.
114 |
115 | =head1 SEE ALSO
116 |
117 | L, L
118 |
119 | =cut
120 |
--------------------------------------------------------------------------------