#! /bin/false

# vim: set autoindent shiftwidth=4 tabstop=4:

# Conversion routines for UTF-8 (perl < 5.8.0).
# Copyright (C) 2002-2017 Guido Flohr <guido.flohr@cantanea.com>,
# all rights reserved.

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

package Locale::RecodeData::UTF_8;

use strict;

require Locale::RecodeData;
use base qw(Locale::RecodeData);

sub _recode
{
    if ($_[0]->{_from} eq 'INTERNAL') {
	return $_[0]->_fromInternal ($_[1]);
    } else {
	return $_[0]->_toInternal ($_[1]);
    }
}

# This routine assumes that the internal representation is always sane
# and contains valid codes only.
sub _fromInternal
{
    $_[1] = join '', map {
	if ($_ <= 0x7f) {
	    chr $_;
	} elsif ($_ <= 0x7ff) {
	    pack ("C2", 
		  (0xc0 | (($_ >> 6) & 0x1f)),
		  (0x80 | ($_ & 0x3f)));
	} elsif ($_ <= 0xffff) {
	    pack ("C3", 
		  (0xe0 | (($_ >> 12) & 0xf)),
		  (0x80 | (($_ >> 6) & 0x3f)),
		  (0x80 | ($_ & 0x3f)));
	} elsif ($_ <= 0x1fffff) {
	    pack ("C4", 
		  (0xf0 | (($_ >> 18) & 0x7)),
		  (0x80 | (($_ >> 12) & 0x3f)),
		  (0x80 | (($_ >> 6) & 0x3f)),
		  (0x80 | ($_ & 0x3f)));
	} elsif ($_ <= 0x3ffffff) {
	    pack ("C5", 
		  (0xf0 | (($_ >> 24) & 0x3)),
		  (0x80 | (($_ >> 18) & 0x3f)),
		  (0x80 | (($_ >> 12) & 0x3f)),
		  (0x80 | (($_ >> 6) & 0x3f)),
		  (0x80 | ($_ & 0x3f)));
	} else {
	    pack ("C6", 
		  (0xf0 | (($_ >> 30) & 0x3)),
		  (0x80 | (($_ >> 24) & 0x1)),
		  (0x80 | (($_ >> 18) & 0x3f)),
		  (0x80 | (($_ >> 12) & 0x3f)),
		  (0x80 | (($_ >> 6) & 0x3f)),
		  (0x80 | ($_ & 0x3f)));
	}
    } @{$_[1]};
    return 1;
}

# Decode UTF-8 into integers.  We do not bother to care about possibly
# configured replacement characters here and simply fall back to 0xfffd.
# Rationale: the internal format is never output directly and the other
# encoders will handle the replacement character correctly.
sub _toInternal
{
    if ($] >= 5.006) {
	$_[1] = [ unpack "U*", $_[1] ];
	return 1;
    }

    # Sigh, we have to decode ourselves.  FIXME: Should be optimized.
    # The routine is awfully slow.
    # It also does not necessarily detect illegal multi-byte sequences.

    my @chars = ();
    my @bytes = unpack "C*", $_[1];

    BYTE: while (@bytes) {
	my $byte = shift @bytes;
        if ($byte < 0x80) {
            push @chars, $byte;
        } elsif ($byte < 0xc0 || $byte > 0xfd) {
            push @chars, 0xfffd;
        } else {
            my $num_bytes;
            my $char;
            if ($byte < 0xe0) {
                $char = $byte & 0x1f;
                $num_bytes = 1;
            } elsif ($byte < 0xf0) {
                $char = $byte & 0xf;
                $num_bytes = 2;
            } elsif ($byte < 0xf8) {
                $char = $byte & 0x7;
                $num_bytes = 3;
            } elsif ($byte < 0xfc) {
                $char = $byte & 0x3;
                $num_bytes = 4;
            } else {
                $char = $byte & 0x1;
                $num_bytes = 5;
            }
            for (my $i = 0; $i < $num_bytes; ++$i) {
                my $next = shift @bytes;
                if (!defined $next || $next < 0x80 || $next > 0xbf) {
                    push @chars, 0xfffd;
                    next BYTE;
                } else {
                    $char <<= 6;
                    $char |= $next & 0x3f;
                }
            }
            push @chars, $char;
        }
    }
    
    $_[1] = \@chars;
    
    return 1;
}

1;

__END__

=head1 NAME

Locale::RecodeData::UTF_8 - Conversion routines for UTF-8

=head1 SYNOPSIS

This module is internal to libintl.  Do not use directly!

=head1 DESCRIPTION

This modules contains the conversion tables for UTF-8.  It is capable of
converting from UTF-8 to the internal format of libintl-perl and vice
versa.   It is only suitable for Perl versions E<lt>= 5.8.0.  However,
you do not have to bother about version checking, Locale::Recode(3)
will do that for you.


=head1 CHARACTER TABLE

See http://www.unicode.org/.

=head1 AUTHOR

Copyright (C) 2002-2017 L<Guido Flohr|http://www.guido-flohr.net/>
(L<mailto:guido.flohr@cantanea.com>), all rights reserved.  See the source
code for details!code for details!

=head1 SEE ALSO

Locale::RecodeData(3), Locale::Recode(3), perl(1)

=cut
Local Variables:
mode: perl
perl-indent-level: 4
perl-continued-statement-offset: 4
perl-continued-brace-offset: 0
perl-brace-offset: -4
perl-brace-imaginary-offset: 0
perl-label-offset: -4
cperl-indent-level: 4
cperl-continued-statement-offset: 2
tab-width: 4
End: