[Catalyst-commits] r11665 - in Catalyst-Plugin-Unicode-Encoding/branches: . attempt_at_understanding_incoming_browser_charsets attempt_at_understanding_incoming_browser_charsets/lib/Catalyst/Plugin/Unicode

Thu Oct 22 20:18:45 GMT 2009

Author: dnm
Date: 2009-10-22 20:18:45 +0000 (Thu, 22 Oct 2009)
New Revision: 11665

Added:
   Catalyst-Plugin-Unicode-Encoding/branches/attempt_at_understanding_incoming_browser_charsets/
Modified:
   Catalyst-Plugin-Unicode-Encoding/branches/attempt_at_understanding_incoming_browser_charsets/lib/Catalyst/Plugin/Unicode/Encoding.pm
Log:
Adding & committing branch for configurable incoming charset.

- Changed default FB_CROAK to FB_DEFAULT, since as it stands, any out-of-the-box
Catalyst site using Unicode::Encoding will just 500 if sent non-($c->encoding)
data in a request param.
- Added incoming_charset configuration parameter to specify a list of known
possible incoming charsets for linking sites.



Copied: Catalyst-Plugin-Unicode-Encoding/branches/attempt_at_understanding_incoming_browser_charsets (from rev 11661, Catalyst-Plugin-Unicode-Encoding/trunk)


Property changes on: Catalyst-Plugin-Unicode-Encoding/branches/attempt_at_understanding_incoming_browser_charsets
___________________________________________________________________
Name: svn:ignore
   + META.yml
inc
Makefile.old
Makefile
MANIFEST
MANIFEST.bak
blib
pm_to_blib


Name: svn:mergeinfo
   + 

Modified: Catalyst-Plugin-Unicode-Encoding/branches/attempt_at_understanding_incoming_browser_charsets/lib/Catalyst/Plugin/Unicode/Encoding.pm
===================================================================

--- Catalyst-Plugin-Unicode-Encoding/trunk/lib/Catalyst/Plugin/Unicode/Encoding.pm	2009-10-21 13:19:59 UTC (rev 11661)
+++ Catalyst-Plugin-Unicode-Encoding/branches/attempt_at_understanding_incoming_browser_charsets/lib/Catalyst/Plugin/Unicode/Encoding.pm	2009-10-22 20:18:45 UTC (rev 11665)
@@ -3,12 +3,14 @@
 use strict;
 use base 'Class::Data::Inheritable';
 
+use utf8;
 use Carp ();
 use Encode 2.21 ();
+use Encode::Guess;
 
 use MRO::Compat;
 our $VERSION = '0.3';
-our $CHECK   = Encode::FB_CROAK | Encode::LEAVE_SRC;
+our $CHECK   = Encode::FB_DEFAULT;
 
 __PACKAGE__->mk_classdata('_encoding');
 
@@ -79,14 +81,70 @@
 
     my $enc = $c->encoding;
 
-    for my $value ( values %{ $c->request->{parameters} } ) {
+    my @possible_incoming_charsets;
 
+    if (my $charsets = $c->config->{ incoming_charset }) {
+        @possible_incoming_charsets
+             = map { split /\s+/, $_ }
+               ref $charsets ? @{ $charsets } : ( $charsets );
+    }
+
+    # If we have a list of possible charsets to search through, use them.  If
+    # not, assume all input has to be valid based on $c->encoding.  Bad chars
+    # will be replaced with a substitution character (see our $CHECK above).
+    my $found_encoding = @possible_incoming_charsets ? undef : $enc;
+
+    PASSED_PARAMETER:
+    foreach my $parameter ( keys %{ $c->request->{ parameters } } ) {
+        my $value = $c->request->param( $parameter );
+
         # TODO: Hash support from the Params::Nested
         if ( ref $value && ref $value ne 'ARRAY' ) {
-            next;
+            next PASSED_PARAMETER;
         }
 
-        $_ = $enc->decode( $_, $CHECK ) for ( ref($value) ? @{$value} : $value );
+        PARAMETER_VALUE:
+        for $value ( ref($value) ? @{ $value } : $value ) {
+            # If it doesn't have a high byte character, decoding is going to
+            # work regardless of what encoding we think it might be.
+            my $has_highbyte_char = grep { $_ > 127  }
+                                     map { ord( $_ ) }
+                                   split //, $value;
+
+            if (!$has_highbyte_char) {
+                next PARAMETER_VALUE;
+            }
+
+            if ( !defined $found_encoding ) {
+                eval { $enc->decode( $value, Encode::FB_CROAK ) };
+
+                if ($@) {
+                    $c->log->info(
+                        'Params were not sent in '
+                      . $c->encoding->name . '. '
+                      . 'Attempting to guess.'
+                    );
+
+                    $found_encoding = guess_encoding(
+                        $value, @possible_incoming_charsets
+                    );
+
+                    if (!ref $found_encoding) {
+                        $found_encoding = $enc;
+
+                        $c->log->warn(
+                            'Failed finding encoding on input -- will put a '
+                          . 'substitution character on failed conversions'
+                        );
+                    }
+                }
+                else {
+                    $found_encoding = $enc;
+                }
+            }
+
+            $value = $found_encoding->decode( $value, $CHECK );
+        }
     }
 }
 
@@ -115,6 +173,7 @@
     use Catalyst qw[Unicode::Encoding];
 
     MyApp->config( encoding => 'UTF-8' ); # A valid Encode encoding
+    MyApp->config( incoming_charset => [qw( big5 iso8859-1 )]);
 
 
 =head1 DESCRIPTION
@@ -122,6 +181,38 @@
 On request, decodes all params from encoding into a sequence of
 logical characters. On response, encodes body into encoding.
 
+=head1 CONFIGURATION
+
+=over 2
+
+=item incoming_charset
+
+You may configure your Catalyst application with one or more incoming_charset
+values. When a request parameter is not valid in the configured C<encoding>,
+this module will attempt to decode it from one of these charsets instead.
+
+    MyApp->config( incoming_charset => [qw( big5 )] );
+
+This is especially useful when a form on another website, which may use a
+different encoding than yours, submits to your application.  Because browsers
+don't generally send their charset with the request, it's up to you to have
+some idea of which charsets incoming requests might be in.
+
+Be careful using any iso8859 charset, as it's likely to match just about
+everything as a series of 8 bit characters.
+
+If no matching charset is found, this module logs a warning and replaces
+invalid characters according to C<Encode>'s FB_DEFAULT fallback mode.
+
+If this isn't what you want, set C<$Catalyst::Plugin::Unicode::Encoding::CHECK>
+to the appropriate FB_* (or other check) constant from the C<Encode> module.
+
+=item encoding
+
+Set your Catalyst application's default encoding.
+
+=back
+
 =head1 METHODS
 
 =over 4