#! /usr/bin/perl
#
#  Copyright (c) 1999, 2000
#     Konstantin Chuguev.  All rights reserved.
# 
#  Redistribution and use in source and binary forms, with or without
#  modification, are permitted provided that the following conditions
#  are met:
#  1. Redistributions of source code must retain the above copyright
#     notice, this list of conditions and the following disclaimer.
#  2. Redistributions in binary form must reproduce the above copyright
#     notice, this list of conditions and the following disclaimer in the
#     documentation and/or other materials provided with the distribution.
# 
#  THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
#  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
#  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
#  ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
#  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
#  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
#  OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
#  HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
#  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
#  OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
#  SUCH DAMAGE.
# 
#     iconv (Charset Conversion Library) v2.0
#

require 'getopts.pl';
use integer;

# Format the bytes of a packed string as a C macro invocation.
#   $_[0]  suffix appended to '_' to form the macro name (callers in this
#          file pass '1l' or '2s')
#   $_[1]  packed string; at most its first four bytes are used
# Returns text of the form "_<suffix>(0xAA, 0xBB, 0xCC, 0xDD)".
# The macros themselves are not defined in this file (presumably they come
# from the endian.h header the generated source includes -- confirm).
sub pack_hex {
    my($suffix, $packed) = @_;
    my @hex;
    foreach my $byte (unpack('C4', $packed)) {
        push(@hex, sprintf("0x%02X", $byte));
    }
    return '_' . $suffix . '(' . join(", ", @hex) . ')';
}

# Serialize the first $size elements of @$array_ref.
#   $size       number of elements to emit
#   $format     pack() template letter; 'N' selects 32-bit values, anything
#               else is treated as 16-bit values in C mode
#   $array_ref  reference to the array of values
# Without -C ($opt_C false) the result is the binary string produced by
# pack("$format$size", ...).
# With -C the result is C initializer text: one line per pair of pack_hex()
# groups (two 32-bit values, or four 16-bit values, per line), and the global
# $array_size is increased by the number of bytes those values represent.
sub pack_array {
    my($size, $format, $array_ref) = @_;

    # Binary output: a single pack() call does the whole job.
    if (!$opt_C) {
        return pack($format . $size, @$array_ref);
    }

    my $wide = ($format eq 'N');
    my $per_line = $wide ? 2 : 4;       # array elements consumed per line
    my $text;
    my $pos = 0;
    while ($pos < $size) {
        my($left, $right);
        if ($wide) {
            $left  = &pack_hex('1l', pack("N", $$array_ref[$pos]));
            $right = &pack_hex('1l', pack("N", $$array_ref[$pos+1]));
        } else {
            $left  = &pack_hex('2s', pack("n2", $$array_ref[$pos],
                                                $$array_ref[$pos+1]));
            $right = &pack_hex('2s', pack("n2", $$array_ref[$pos+2],
                                                $$array_ref[$pos+3]));
        }
        $text .= "\t$left, $right,\n";
        $pos += $per_line;
    }

    # Keep a running total of the bytes emitted as C source.
    $array_size += $size * ($wide ? 4 : 2);
    return $text;
}

# Fill the holes in an array and serialize it.
#   $size       number of leading elements to process
#   $format     pack() template letter handed to pack_array()
#   $default    value stored into every undefined slot
#   $array_ref  reference to the array; undefined slots among the first
#               $size elements are overwritten with $default
# Returns whatever pack_array() returns for the completed array.
sub build_array {
    my($size, $format, $default, $array_ref) = @_;
    foreach my $slot (0 .. $size-1) {
        next if defined($array_ref->[$slot]);
        $array_ref->[$slot] = $default;
    }
    return &pack_array($size, $format, $array_ref);
}

# Build a one-level table of $size 16-bit entries from @$array_ref.
# Slots without a value are filled with 0xFFFE (presumably the "no mapping"
# marker expected by the table's consumer -- confirm against the reader).
sub build_table1 {
    my($size, $array_ref) = @_;
    my $format  = "n";
    my $missing = 0xFFFE;
    return &build_array($size, $format, $missing, $array_ref);
}

# Build a two-level table from an array of row references.
#   $size       number of rows, and number of entries per row
#   $array_ref  reference to an array whose defined elements are references
#               to row arrays
# Layout of the result: an index of $size 32-bit values followed by one
# build_table1() subtable per defined row, in row order.  The index entry of
# a defined row is a running offset that starts at $size * 4 (the size of
# the index) and grows by $size * 2 (the size of a subtable) per defined
# row; the entry of an undefined row is 0.
# Prints the number of subtables to STDERR.
sub build_table2 {
    my($size, $array_ref) = @_;

    # Rows that actually carry data, in ascending order.
    my @present = grep { defined($array_ref->[$_]) } 0 .. $size-1;

    my $next_offset = $size * 4;
    my @offs;
    foreach my $row (@present) {
        $offs[$row] = $next_offset;
        $next_offset += $size * 2;
    }

    # The index goes first, then the subtables.
    my $data = &build_array($size, "N", 0, \@offs);
    foreach my $row (@present) {
        $data .= &build_table1($size, $array_ref->[$row]);
    }

    printf STDERR "%d subtables.\n", scalar(@present);
    return $data;
}

# Charset properties recorded by set_val() while mappings are loaded.
# Each flag is raised to 1 when a matching charset code is seen:
#   $control0  a code with bits 0x60 clear and bit 0x80 clear
#   $control1  a code with bits 0x60 clear and bit 0x80 set
#   $delete    the code 0x7F
$control0 = 0;
$control1 = 0;
$delete   = 0;

# Two-level conversion tables filled by set_val(), indexed as
# [code >> 8][code & 0xFF].  They are initialized explicitly: a bare
# "@to_ucs;" statement does nothing and only triggers a "Useless use of a
# variable in void context" warning under -w.
@to_ucs   = ();     # charset code -> Unicode
@from_ucs = ();     # Unicode -> charset code

# Record one mapping in both conversion tables and update the charset
# property flags.
#   $cs   local charset code
#   $ucs  Unicode value
# With -a ($opt_a) every charset code above 0x7F is ignored.
# Globals written: @to_ucs, @from_ucs, $control0, $control1, $delete,
# $_7bit, $_8bit, $_14bit, $_16bit.
#
sub set_val {
    my($cs, $ucs) = @_;
    if ($opt_a && $cs > 0x7F) {
        return;
    }

    # Both tables are indexed by [high part][low byte] of the key.
    my($cs_row, $cs_col)   = ($cs >> 8, $cs & 0xFF);
    my($ucs_row, $ucs_col) = ($ucs >> 8, $ucs & 0xFF);
    $to_ucs[$cs_row][$cs_col] = $ucs;
    $from_ucs[$ucs_row][$ucs_col] = $cs;

    # Bits 0x60 clear: the low byte lies in 0x00-0x1F or 0x80-0x9F.
    unless ($cs & 0x60) {
        if ($cs & 0x80) {
            $control1 = 1;
        } else {
            $control0 = 1;
        }
    }
    if ($cs == 0x7F) {
        $delete = 1;
    }

    # Classify the code by its width.
    if ($cs >= 0x100) {
        # Two-byte code: any high bit set makes it 16-bit, otherwise 14-bit.
        if ($cs & 0x8080) {
            $_16bit = 1;
        } else {
            $_14bit = 1;
        }
    } elsif ($cs >= 0x80) {
        $_8bit = 1;
    } else {
        $_7bit = 1;
    }
}

# Register identity mappings: every code in the argument list is mapped to
# the Unicode value with the same number, via set_val().
#
sub set_range {
    my @codes = @_;
    foreach my $code (@codes) {
        &set_val($code, $code);
    }
}

# Main program: parse options, load the charset mapping, build the
# "to Unicode" and "from Unicode" tables and write them either as a binary
# table or (with -C) as a C source file.
#
# Getopts() stores each option in a global $opt_<letter>.
# NOTE(review): Getopts comes from the Perl 4 library getopts.pl required at
# the top of this file; newer perls no longer bundle it -- confirm it is
# available (e.g. via Perl4::CoreLibs) or migrate to Getopt::Std.
&Getopts('aCc:Mm:o:p:u:');
#         ||| || | | +- u N:    field number for Unicode character codes
#         ||| || | +--- p str:    prefix
#         ||| || +----- o file:    output file name
#         ||| |+------- m file:    character mnemonic table from RFC1345
#         ||| +-------- M:    Macintosh newline (<CR> only)
#         ||+---------- c N:    field number for charset character codes
#         |+----------- C:    make C source file
#         +------------ a:    ignore 8 bit (for ASCII)

$opt_c = 0 unless defined($opt_c);
$opt_p = '0x' unless defined($opt_p);
$opt_u = 1 unless defined($opt_u);

if ($opt_o) {
    # The file is created under the normalized name ('-' replaced by '_').
    $opt_o =~ tr/-/_/;
    open(STDOUT, '>', $opt_o)
        or die "$0: cannot open $opt_o for writing: $!\n";
    # Strip a literal ".c" suffix only; an unescaped dot would also eat the
    # end of names such as "foo_abc".
    $opt_o =~ s/\.c$//;
}

# mnemonic -> hexadecimal Unicode value, loaded from the -m file
my %map;

if ($opt_M) {
    # Input records are terminated by a carriage return.
    $/ = "\cM";
}

if ($opt_m) {
    # Mnemonic mode: the -m file defines the mnemonics, the input lists them
    # in charset code order.
    open(MAP, '<', $opt_m)
        or die "$0: cannot open mnemonic table $opt_m: $!\n";
    while (<MAP>) {
        chomp;
        # Only lines starting with exactly one space are table entries.
        next unless /^ [^ ]/;
        # split() no longer fills @_ in scalar context on current perls, so
        # the fields are captured explicitly.
        my @fields = split;
        next if @fields < 2;
        $map{$fields[0]} = $fields[1];
    }
    close(MAP);

    my $code = 0;       # charset code of the next mnemonic read
    while (<>) {
        chomp;
        s/^ *//;
        if (/^&[a-z]/) {
            # Directive line; only "&code N" is interpreted (it sets the
            # code of the next mnemonic).
            my @directive = split(' ', substr($_, 1));
            if ($directive[0] eq 'code') {
                $code = $directive[1];
            }
        } else {
            foreach (split) {
                # '??' marks a position without a character.
                # NOTE(review): a mnemonic missing from the -m table makes
                # hex() return 0, i.e. the code is mapped to U+0000 --
                # confirm whether such entries should be skipped instead.
                &set_val($code, hex "0x$map{$_}") if $_ ne '??';
                $code ++;
            }
        }
    }
} else {
    # Table mode: each line holds whitespace-separated fields; field $opt_c
    # is the charset code (with prefix $opt_p), field $opt_u the Unicode
    # value.
    while (<>) {
        s/[#\n].*//;            # drop comments and the line terminator
        my @fields = split;
        next if @fields < 2;    # too few fields
        next if ($fields[$opt_c] =~ s/^$opt_p/0x/o) != 1;
        # local charset code prefix is invalid
        &set_val(hex $fields[$opt_c], hex $fields[$opt_u]);
    }
}

# Classify the charset, add the implied identity mappings and build the
# "to Unicode" table.  $type: 0 = 7bit, 1 = 8bit, 2 = 14bit, 3 = 16bit.
if (!$_16bit && !$_14bit) {
    if ($_8bit) {
        print STDERR "8bit charset";
        if (!$control0) {
            &set_range(0 .. 0x1F);
            print STDERR "; control0 chars added";
        }
        if (!$control1) {
            &set_range(0x80 .. 0x9F);
            print STDERR "; control1 chars added";
        }
        if (!$delete) {
            &set_range(0x7F);
            print STDERR "; delete char added";
        }
        $nbits = 8;
        $type = 1;
    } else {
        print STDERR "7bit charset";
        $nbits = 7;
        $type = 0;
    }
    print STDERR ".\n";
    $to = &build_table1($_8bit ? 256 : 128, $to_ucs[0]);
} elsif ($_16bit) {
    print STDERR "16bit charset";
    if (!$_7bit && !$_8bit) {
        &set_range(0 .. 0x7F);
        print STDERR "; ASCII subset added";
    } elsif (!$control0) {
        &set_range(0 .. 0x1F);
        print STDERR "; control0 chars added";
    }
    print STDERR ".\n";
    $to = &build_table2(256, \@to_ucs);
    $nbits = 16;
    $type = 3;
} else {
    print STDERR "14bit charset.\n";
    $to = &build_table2(128, \@to_ucs);
    $nbits = 14;
    $type = 2;
}

# In C mode $to is source text, so its binary size is taken from the byte
# counter maintained by pack_array(); this must be read before the "from"
# table is built, because that adds to the same counter.
$to_size = $opt_C ? $array_size : length($to);

$from = &build_table2(256, \@from_ucs);

# Both output forms start with the bytes 3, 'C', 'S', 'C', 'T', a byte-order
# byte (0 in binary output, ICONV_ORDER in C output), $nbits and $type,
# followed by the 32-bit values 8 and 8 + $to_size (presumably the offsets
# of the two tables -- confirm against the table reader) and the tables.
if ($opt_C) {
    die "-o option is mandatory with -C" unless $opt_o;
    $opt_o =~ s/\.c$//;
    $opt_o =~ tr/-/_/;
    $name = $opt_o;
    $name =~ tr/a-z/A-Z/;
    print "#include \"../lib/deps.h\"\n\n";
    print "#ifdef _ICONV_CONVERTER_$name\n";
    print "#include \"../lib/endian.h\"\n\n";
    print "_CONST unsigned char _iconv_ccs_table_$opt_o" . "[] = {\n";
    print "\t3, 'C', 'S', 'C', 'T', ICONV_ORDER, $nbits, $type,\n";
    print &pack_array(2, 'N', [8, 8 + $to_size]);
    print $to;
    print $from;
    print "};\n\n";
    print "#endif /* #ifdef _ICONV_CONVERTER_$name */\n\n";
} else {
    # The table is binary data; keep the I/O layer from translating it.
    binmode(STDOUT);
    print pack("A5CCCNN", "\003CSCT", 0, $nbits, $type, 8, 8 + $to_size);
    print $to;
    print $from;
}

# Buffered write errors only surface when the handle is closed.
close(STDOUT) or die "$0: error writing output: $!\n";

