#!/usr/bin/perl -w

# Reencodes the UTF-8 word list $in into charset $cs by running the
# external recode program, and records every resulting word that
# consists solely of a-z and bytes 0x80-0xFF and contains at least one
# such high byte.
#
# Arguments:
#   $_[0]  name of the word list file; dies if it is not a plain file
#   $_[1]  target charset name, passed to recode as is
#
# Results go into two global hashes, both keyed by the encoded word:
#   %forms   the charset that produced the word, or "inconclusive" once
#            the same bytes have been produced by two different charsets
#   %source  the word list file the word was (last) recorded from
sub encode {
    my ( $in, $cs ) = @_;
    die "cannot find $in, stopped" unless ( -f $in );

    # The shell is needed for the two redirections, so this stays a
    # single-string pipe open.
    open( my $pipe, "recode utf-8...$cs < $in 2>/dev/null |" )
        || die "cannot run recode on $in for $cs: $!, stopped";

    # Read line by line instead of pulling all of recode's output into
    # a list first.
    while ( defined( my $word = <$pipe> ) ) {
        chomp $word;
        $word =~ s/\015//;

        # only lower-case words with at least one non-ASCII byte are
        # interesting
        next unless ( $word =~ /^[a-z\200-\377]+$/ &&
                      $word =~ /[\200-\377]/ );

        if ( defined( $forms{$word} ) && $forms{$word} ne $cs ) {
            # the same bytes already came out of another charset
            $forms{$word} = "inconclusive";
        } else {
            $forms{$word} = $cs;
            $source{$word} = $in;
        }
    }

    # The exit status is deliberately ignored: recode's stderr is
    # discarded above as well, so a failing recode simply contributes
    # no words.
    close( $pipe );
    return;
}

# run encode for every interesting pairing of word list and encoding.
# each row is a word list file followed by the encodings to try on it;
# rows and encodings are processed in the order written.
my @wordlists = (
    [ "german.words",    "iso-8859-15", "macroman", "cp437" ],
    [ "french.words",    "iso-8859-15", "macroman", "cp437" ],
    [ "dutch.words",     "iso-8859-15", "macroman", "cp437" ],
    [ "czech.words",     "iso-8859-2" ],
    [ "norwegian.words", "iso-8859-15", "macroman", "cp865" ],
    [ "swedish.words",   "iso-8859-15", "macroman", "cp865" ],
);

foreach my $row ( @wordlists ) {
    my ( $file, @charsets ) = @$row;
    foreach my $charset ( @charsets ) {
        encode( $file, $charset );
    }
}

#
# NOTE: the file names above must match those in jamfile
#

# count the encoded forms that were produced by a single encoding only,
# i.e. every entry that has not been marked "inconclusive"
$numForms = scalar( grep { $forms{$_} ne "inconclusive" } keys %forms );

# an empty result most likely means that recode isn't installed. stop
# here with an error instead of going on to generate an empty table.

die "What went wrong? I am unhappy." unless ( $numForms );


# Maps each charset name used as a value in %forms to the C++ Encoding
# enumerator written to wordlist.inc. The enumerators must also appear
# in the "enum Encoding" declaration printed below.
%enum = (
    "macroman"    => "MacRoman",
    "iso-8859-2"  => "Iso88592",
    "iso-8859-15" => "Iso885915",
    "cp437"       => "Cp437",
    "cp865"       => "Cp865",
);

open( O, "> wordlist.inc" )
    || die "could not open wordlist.inc for writing: $!";

# header comment: the $Id$ keyword and the current UTC date
my ( $mday, $mon, $year ) = ( gmtime() )[ 3, 4, 5 ];
print O "// generated by ",
    '$Id$',
    " at ",
    sprintf( "%04d-%02d-%02d", $year + 1900, $mon + 1, $mday ), "\n";

print O "enum Encoding {\n",
"    Iso88592, Iso885915, MacRoman, Cp437, Cp865, NumEncodings\n",
"};\n",
"static const uint NumForms = ", $numForms, ";\n",
"static const struct {\n",
"    const char * encodedForm;\n",
"    Encoding encoding;\n",
"} forms[", $numForms, "] = {\n";

# one table row per conclusive form, in sorted order
foreach my $form ( sort keys %forms ) {
    my $cs = $forms{$form};
    next if ( $cs eq "inconclusive" );

    # a charset without an enumerator would produce a row that cannot
    # compile; fail here, where the cause is still obvious
    die "no Encoding enumerator known for charset $cs, stopped"
        unless ( defined( $enum{$cs} ) );

    print O "    { \"$form\", $enum{$cs} }, // from $source{$form}\n";
}

print O "};\n";

# buffered write errors (e.g. a full disk) only show up at close time
close( O ) || die "could not finish writing wordlist.inc: $!";
