[Catalyst-commits] r11665 - in
Catalyst-Plugin-Unicode-Encoding/branches: .
attempt_at_understanding_incoming_browser_charsets
attempt_at_understanding_incoming_browser_charsets/lib/Catalyst/Plugin/Unicode
dnm at dev.catalyst.perl.org
dnm at dev.catalyst.perl.org
Thu Oct 22 20:18:45 GMT 2009
Author: dnm
Date: 2009-10-22 20:18:45 +0000 (Thu, 22 Oct 2009)
New Revision: 11665
Added:
Catalyst-Plugin-Unicode-Encoding/branches/attempt_at_understanding_incoming_browser_charsets/
Modified:
Catalyst-Plugin-Unicode-Encoding/branches/attempt_at_understanding_incoming_browser_charsets/lib/Catalyst/Plugin/Unicode/Encoding.pm
Log:
Adding & committing branch for configurable incoming charset.
- Changed default FB_CROAK to FB_DEFAULT, since as it stands, any out-of-the-box
Catalyst site using Unicode::Encoding will just 500 if sent non-($c->encoding)
data in a request param.
- Added incoming_charset configuration parameter to specify a list of known
possible incoming charsets for linking sites.
Copied: Catalyst-Plugin-Unicode-Encoding/branches/attempt_at_understanding_incoming_browser_charsets (from rev 11661, Catalyst-Plugin-Unicode-Encoding/trunk)
Property changes on: Catalyst-Plugin-Unicode-Encoding/branches/attempt_at_understanding_incoming_browser_charsets
___________________________________________________________________
Name: svn:ignore
+ META.yml
inc
Makefile.old
Makefile
MANIFEST
MANIFEST.bak
blib
pm_to_blib
Name: svn:mergeinfo
+
Modified: Catalyst-Plugin-Unicode-Encoding/branches/attempt_at_understanding_incoming_browser_charsets/lib/Catalyst/Plugin/Unicode/Encoding.pm
===================================================================
--- Catalyst-Plugin-Unicode-Encoding/trunk/lib/Catalyst/Plugin/Unicode/Encoding.pm 2009-10-21 13:19:59 UTC (rev 11661)
+++ Catalyst-Plugin-Unicode-Encoding/branches/attempt_at_understanding_incoming_browser_charsets/lib/Catalyst/Plugin/Unicode/Encoding.pm 2009-10-22 20:18:45 UTC (rev 11665)
@@ -3,12 +3,14 @@
use strict;
use base 'Class::Data::Inheritable';
+use utf8;
use Carp ();
use Encode 2.21 ();
+use Encode::Guess;
use MRO::Compat;
our $VERSION = '0.3';
-our $CHECK = Encode::FB_CROAK | Encode::LEAVE_SRC;
+our $CHECK = Encode::FB_DEFAULT;
__PACKAGE__->mk_classdata('_encoding');
@@ -79,14 +81,70 @@
my $enc = $c->encoding;
- for my $value ( values %{ $c->request->{parameters} } ) {
+ my @possible_incoming_charsets;
+ if (my $charsets = $c->config->{ incoming_charset }) {
+ @possible_incoming_charsets
+ = map { split /\s+/, $_ }
+ ref $charsets ? @{ $charsets } : ( $charsets );
+ }
+
+ # If we have a list of possible charsets to search through, use them. If
+ # not, assume all input has to be valid based on $c->encoding. Bad chars
+ # will be replaced with a substitution character (see our $CHECK above).
+ my $found_encoding = @possible_incoming_charsets ? undef : $enc;
+
+ PASSED_PARAMETER:
+ foreach my $parameter ( keys %{ $c->request->{ parameters } } ) {
+ my $value = $c->request->param( $parameter );
+
# TODO: Hash support from the Params::Nested
if ( ref $value && ref $value ne 'ARRAY' ) {
- next;
+ next PASSED_PARAMETER;
}
- $_ = $enc->decode( $_, $CHECK ) for ( ref($value) ? @{$value} : $value );
+ PARAMETER_VALUE:
+ for $value ( ref($value) ? @{ $value } : $value ) {
+ # If it doesn't have a high byte character, decoding is going to
+ # work regardless of what encoding we think it might be.
+ my $has_highbyte_char = grep { $_ > 127 }
+ map { ord( $_ ) }
+ split //, $value;
+
+ if (!$has_highbyte_char) {
+ next PARAMETER_VALUE;
+ }
+
+ if ( !defined $found_encoding ) {
+ eval { $enc->decode( $value, Encode::FB_CROAK ) };
+
+ if ($@) {
+ $c->log->info(
+ 'Params were not sent in '
+ . $c->encoding->name . '. '
+ . 'Attempting to guess.'
+ );
+
+ $found_encoding = guess_encoding(
+ $value, @possible_incoming_charsets
+ );
+
+ if (!ref $found_encoding) {
+ $found_encoding = $enc;
+
+ $c->log->warn(
+ 'Failed finding encoding on input -- will put a '
+ . 'substitution character on failed conversions'
+ );
+ }
+ }
+ else {
+ $found_encoding = $enc;
+ }
+ }
+
+ $value = $found_encoding->decode( $value, $CHECK );
+ }
}
}
@@ -115,6 +173,7 @@
use Catalyst qw[Unicode::Encoding];
MyApp->config( encoding => 'UTF-8' ); # A valid Encode encoding
+ MyApp->config( incoming_charset => [qw( big5 iso8859-1 )]);
=head1 DESCRIPTION
@@ -122,6 +181,38 @@
On request, decodes all params from encoding into a sequence of
logical characters. On response, encodes body into encoding.
+=head1 CONFIGURATION
+
+=over 2
+
+=item incoming_charset
+
+You may configure your Catalyst application with one or more incoming_charset
+values. When request parameters cannot be decoded using C<encoding>, this
+module will attempt to guess their charset from this list and decode from it.
+
+ MyApp->config( incoming_charset => [qw( big5 )] );
+
+This is especially useful when your website receives form submissions from
+another website that may use a different encoding than yours. Because browsers
+don't generally send their charset type with their request, it's up to you to
+have some idea which charsets those requests might use.
+
+Be careful using any iso8859 charset, as it's likely to match just about
+everything as a series of 8 bit characters.
+
+If a match isn't successfully found, this module will log a warning and replace
+invalid characters in accordance with C<Encode>'s FB_DEFAULT check value.
+
+If this isn't what you want or expect, set C<$Catalyst::Plugin::Unicode::Encoding::CHECK>
+to the appropriate FB_ (or other) constant from the C<Encode> module.
+
+=item encoding
+
+Set your Catalyst application's default encoding.
+
+=back
+
=head1 METHODS
=over 4
More information about the Catalyst-commits
mailing list