#!/usr/bin/perl

# >> This script belongs to Andrea Stubbe,      <<
# >> but nice people may borrow and use it.     <<
# >> as79@gmx.de                                <<
#
# Guesses the character set of a file by scoring every non-ASCII byte
# against byte values that are typical for ISO-8859-1, ISO-8859-15,
# Windows-1252, MacRoman and UTF-8, and prints the best-scoring name
# (ASCII, ISO-1, WINDOWS, UTF8, MAC or ISO-15) followed by a newline.
#
# usage: perl charset_guesser.pl <file> [efga]
#
# The optional second argument selects the language specific checks:
# g (german), e (english), f (french), a (all, the default).
#
# UTF-8 layout used by the lead/follower detection below:
#
#   from  to    binary data
#   0000  007F  0xxx.xxxx
#   0080  07FF  110x.xxxx 10xxxxxx
#   0800  FFFF  1110.xxxx 10xxxxxx 10xxxxxx

use strict;
use warnings;
no utf8;

use constant DEBUG => 0;    # 0: off, 1: print the scores, 2: print the file name

# ---------------------------------------------------------------------
# Scores. They are file-level lexicals so that getDif() can read them.
# ---------------------------------------------------------------------
my $iso     = 0;    # ISO-8859-1
my $iso15   = 0;    # ISO-8859-15 specific hits; $iso is added before deciding
my $win     = 0;    # Windows-1252
my $mac     = 0;    # MacRoman
my $utf8    = 0;    # UTF-8
my $utf8_lf = 0;    # incremented per UTF-8 lead byte, never read anywhere
my $noAscii = 0;    # number of bytes with the high bit set

# UTF-8 sequence tracking.
my $follower = 0;   # continuation bytes still expected after a lead byte
my $complete = "";  # bit string of the sequence collected so far

# ---------------------------------------------------------------------
# Byte patterns. Every byte is matched as an 8 character string of 0/1.
# They are compiled once here instead of once per byte.
# ---------------------------------------------------------------------

# German umlauts and sharp s as single bytes.
my $DE_ISO = qr/^((11100100)|(11110110)|(11111100)|(11000100)|(11010110)|(11011100)|(11011111))$/;
my $DE_MAC = qr/^((10001010)|(10011010)|(10011111)|(10000000)|(10000101)|(10000110)|(10100111))$/;

# The same German letters as complete two byte UTF-8 sequences.
my $DE_UTF8 = qr/^((1100001110100100)|(1100001110110110)|(1100001110111100)|(1100001110000100)|(1100001110010110)|(1100001110011100)|(1100001110011111))$/;

# French: accented A, AE, C cedilla, accented E, I circumflex, accented O,
# accented U and the OE ligature where the charset has one.
# "BIG" lists upper case letters, "SM" lower case letters.
my $FR_ISO_BIG   = qr/^(?:1100(0000|0001|0010)|11000110|11000111|1100(1000|1001|1010)|11001110|1101(0010|0011|0100)|1101(1001|1010|1011))$/;
my $FR_ISO_SM    = qr/^(?:1110(0000|0001|0010)|11100110|11100111|1110(1000|1001|1010)|11101110|1111(0010|0011|0100)|1111(1001|1010|1011))$/;
my $FR_ISO15_BIG = qr/^(?:10111100)$/;
my $FR_ISO15_SM  = qr/^(?:10111101)$/;
my $FR_MAC_BIG   = qr/^(?:111001(01|11)|11001010|10011110|10000010|1110(0110|1001)|10000011|11101011|1110111(0|1)|11110001|1111(0010|0011|0100)|11001110)$/;
my $FR_MAC_SM    = qr/^(?:1000(0111|1000|1001)|10111110|10001101|1000(1110|1111)|10010000|10010100|1001(0111|1000|1001)|1001(1100|1101|1110)|11001111)$/;
my $FR_WIN_BIG   = qr/^(?:10001100)$/;
my $FR_WIN_SM    = qr/^(?:10011100)$/;

# French letters as complete UTF-8 sequences.
# NOTE(review): the last alternative (1101010010000000) does not follow the
# 11000011 lead byte of all the others - presumably meant to be the OE
# ligature; verify the intended code point.
my $FR_UTF8 = qr/^(
      1100001110100001 | 1100001110100000 | 1100001110100010
    | 1100001110000001 | 1100001110000000 | 1100001110000010
    | 1100001110101001 | 1100001110101000 | 1100001110101010
    | 1100001110001001 | 1100001110001000 | 1100001110001010
    | 1100001110101101 | 1100001110101100 | 1100001110101110
    | 1100001110001101 | 1100001110001100 | 1100001110001110
    | 1100001110110011 | 1100001110110010 | 1100001110110100
    | 1100001110010011 | 1100001110010010 | 1100001110010100
    | 1100001110111010 | 1100001110111001 | 1100001110111011
    | 1100001110011010 | 1100001110011001 | 1100001110011011
    | 1100001110100111 | 1100001110000111 | 1101010010000000
)$/x;

# Special characters: Euro, TM, (R), typographic quotes, dashes, ...
my $SP_REG       = qr/^(10101110|10111010|10111011)$/;
my $SP_WIN       = qr/^(10000000|10011001|10000010|10000100|10001011|10011011)$/;
my $SP_WIN_ISO   = qr/^(10010001|10010010|10010011|10010100|10010110|10010111|10000101)$/;
my $SP_MAC_DASH  = qr/^(11001010|11001001|11010000|11010001)$/;
my $SP_ISO15_EUR = qr/^10100100$/;
my $SP_MAC       = qr/^(11011011|10101010|10101000|11100010|11100011|11011100|11011101|11010010|10010011|10010100|10010101|11000111|11001000)$/;

# Special characters as complete three byte UTF-8 sequences.
# NOTE(review): two alternatives are 25 bits long and can never match a
# string built from 8 bit groups, and most alternatives start with
# 11101000 while the first two start with 11100010 - these look like
# typos; kept unchanged until the intended code points are confirmed.
my $SP_UTF8 = qr/^(
      111000101000001010101100  | 111000101000010010100010
    | 111010001000001010101101  | 111010001000000010011110
    | 111010001000000010111001  | 111010001000000010011000
    | 111010001000000010011001  | 1110100010000000100011100
    | 1110100010000000100011101 | 111010001000000010111010
)$/x;

# Reset all scores and the UTF-8 sequence state.
sub _reset_scores {
    ( $iso, $iso15, $win, $mac, $utf8, $utf8_lf, $noAscii ) = (0) x 7;
    $follower = 0;
    $complete = "";
    return;
}

# Track UTF-8 lead and continuation bytes and credit completed sequences.
sub _track_utf8 {
    my ($byte) = @_;

    # Lead byte: 110xxxxx announces one follower, 1110xxxx two.
    if ( $byte =~ /^11(0|10)/ ) {
        $complete = $byte;
        $follower = $1 eq "0" ? 1 : 2;
        $utf8_lf += 0.5;
    }

    # Continuation byte. The decrement happens even when no follower is
    # expected, so stray continuation bytes drive $follower below zero and
    # thereby switch off the "$follower == 0" sequence checks until the
    # next lead byte. A three byte sequence earns 1.5 when it reaches 16
    # bits and another 3 when it reaches 24 bits.
    if ( $byte =~ /^10/ && $follower-- > 0 ) {
        $complete .= $byte;
        if ( length($complete) == 24 ) {
            $utf8 += 3;
        }
        elsif ( length($complete) == 16 ) {
            $utf8 += 1.5;
        }
    }
    return;
}

# German: at most one of the three checks scores per byte.
sub _score_german {
    my ($byte) = @_;

    if ( $byte =~ $DE_ISO ) {
        $iso++;
        $win++;
    }
    elsif ( $byte =~ $DE_MAC ) {
        $mac++;
    }
    elsif ( $follower == 0 && $complete =~ $DE_UTF8 ) {
        $utf8++;
    }
    return;
}

# French: every check is independent; upper case letters weigh half as
# much as lower case ones, and Mac hits are weighted lower than the rest.
sub _score_french {
    my ($byte) = @_;

    $mac += 0.4 if $byte =~ $FR_MAC_BIG;
    $mac += 0.8 if $byte =~ $FR_MAC_SM;

    if ( $byte =~ $FR_ISO_BIG ) {
        $iso += 0.5;
        $win += 0.5;
    }
    if ( $byte =~ $FR_ISO_SM ) {
        $iso += 1;
        $win += 1;
    }

    $iso15 += 0.5 if $byte =~ $FR_ISO15_BIG;
    $iso15 += 1   if $byte =~ $FR_ISO15_SM;

    # The Windows score uses the Windows OE ligature bytes (0x8C / 0x9C).
    # These checks used to repeat the ISO-8859-15 patterns, which left the
    # Windows patterns unused and credited Windows for 0xBC / 0xBD.
    $win += 0.5 if $byte =~ $FR_WIN_BIG;
    $win += 1   if $byte =~ $FR_WIN_SM;

    $utf8++ if $follower == 0 && $complete =~ $FR_UTF8;
    return;
}

# Special characters: the first matching branch wins, so bytes listed in
# more than one pattern only count for the earliest one.
sub _score_special {
    my ($byte) = @_;

    if ( $byte =~ $SP_REG ) {              # (R)
        $win++;
        $iso++;
        $utf8++;
    }
    elsif ( $byte =~ $SP_WIN ) {           # Euro, TM, quotes
        $win++;
    }
    elsif ( $byte =~ $SP_WIN_ISO ) {       # quotes, dashes, ellipsis
        $win++;
        $iso++;
    }
    elsif ( $byte =~ $SP_MAC_DASH ) {      # dashes, no-break space, ellipsis
        $mac += 0.8;
    }
    elsif ( $byte =~ $SP_ISO15_EUR ) {     # Euro
        $iso15++;
    }
    elsif ( $byte =~ $SP_MAC ) {           # Euro, TM, (R), quotes
        $mac += 0.8;
    }
    elsif ( $follower == 0 && $complete =~ $SP_UTF8 ) {    # Euro, TM, quotes
        $utf8++;
    }
    return;
}

# Turn the current scores into a charset name. Ties are resolved by the
# order of the checks; ISO-15 is the fallback.
sub _pick_charset {
    return "ASCII" if $noAscii == 0;
    return "ISO-1"
        if $iso >= $win && $iso >= $iso15 && $iso > $utf8 && $iso >= $mac;
    return "WINDOWS" if $win > $iso15 && $win > $utf8 && $win > $mac;
    return "UTF8"    if $utf8 > $iso15 && $utf8 >= $mac;
    return "MAC"     if $mac > $iso15;
    return "ISO-15";
}

# guess_charset($text, $lang)
#
# Scores the bytes of $text and returns the guessed charset name.
#   $text - the raw bytes to inspect (undef is treated as empty)
#   $lang - "g", "e", "f" or "a"; a false value means "a"
# The scores are reset on entry and left in place afterwards, so getDif()
# and the debug output describe the most recent call.
sub guess_charset {
    my ( $text, $lang ) = @_;
    $text = "" unless defined $text;
    $lang ||= "a";

    _reset_scores();

    # ASCII bytes never influence any score, so only bytes with the high
    # bit set are visited.
    while ( $text =~ /([\x80-\xFF])/g ) {
        my $byte = sprintf "%08b", ord $1;
        $noAscii++;

        _track_utf8($byte);
        _score_german($byte) if $lang eq "g" || $lang eq "a";
        _score_french($byte) if $lang eq "f" || $lang eq "a";
        _score_special($byte);
    }

    # Every ISO-8859-1 hit is also valid ISO-8859-15.
    $iso15 += $iso;

    return _pick_charset();
}

# getDif()
#
# Returns the distance between the highest and the second highest of the
# five charset scores (0 when the two best scores are equal).
sub getDif {
    my ( $max, $sec ) = sort { $b <=> $a } ( $utf8, $win, $mac, $iso, $iso15 );
    return $max - $sec;
}

# Command line entry point: read the file named by the first argument and
# print the guessed charset.
sub main {
    my $file = $ARGV[0];
    die "\nPlease specify a file!\n(Usage: perl charset_guesser.pl <file> [efga])\n\n"
        unless defined $file && length $file;

    # Three argument open in raw mode: the guess must see the bytes
    # exactly as they are stored.
    open( my $fh, '<:raw', $file ) or die "could not find $file! ($!)";
    my $text = do { local $/; <$fh> };    # slurp without touching the global $/
    close($fh);

    # get language: g, e, f (german, english, french), a (all, default)
    my $lang = $ARGV[1] || "a";

    my $charset = guess_charset( $text, $lang );

    print "\nISO: $iso, WIN: $win, MAC: $mac, UTF8: $utf8\n" if DEBUG == 1;
    print "$file \t:" if DEBUG == 2;
    print $charset, "\n";
    return;
}

# Run only when executed as a script, so the subs can be loaded on their own.
main() unless caller;

1;