#!/usr/bin/perl -w

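# Detects bilingual documents (e.g. dictionary pages): a file is reported as
# "IS DICT" when a large share of its words is missing from the English word lists.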


# ------------------------  HISTORY -----------------------
# 04.12.05	new
# 24.01.06	changed 	dir -> base

$VERSION = '24.01.06';

#--------------------------------------------------------------
# pragmas & modules

use strict;
use HTML::Entities;
use Encode;
require 5.004;
use utf8;

binmode(STDOUT, ":utf8"); 

#use lib ('../helpers');
#use Astro;

# --------- vars -------------
# Compile-time switches.
use constant DEBUG		=> 0;			# 0: off, 1: on (extra diagnostics on STDOUT/STDERR)
use constant USE_CLEAN 	=> 1;			# 0: off, 1: on (1 skips the charset guessing/decoding step)

# Command line: <base> <dirBase> <oneFile>.
# A plain `$ARGV[n] || default` would treat the legitimate argument "0"
# (e.g. a base directory or a file called "0") as missing, so test for
# defined/non-empty instead of truthiness.
my $base    = ( defined $ARGV[0] && length $ARGV[0] ) ? $ARGV[0] : "5";
my $dirBase = ( defined $ARGV[1] && length $ARGV[1] ) ? $ARGV[1] : "train";
my $oneFile = ( defined $ARGV[2] && length $ARGV[2] ) ? $ARGV[2] : undef;	# undef: process every file
my $dir = "../Korpus/".$dirBase."/".$base."/";

my $filename;					# current directory entry of the main loop
my $text = "";					# not used by the visible code (the main loop declares its own $text)
my $num_files = 50;				# upper bound on the number of files handled per run
my %english = ();				# lowercased known words => 1, filled by initHashes()

# Load the word lists before the main section changes the input record separator.
initHashes();

# One "word" character, and its complement ($nochars is not used by the visible code).
my $chars = qr/[A-Za-zäöüéèáàóòúùíìñßÄÖÜÀÁÉÈÍÌÓÒÚÙç]/;
my $nochars = qr/[^A-Za-zäöüéèáàóòúùíìñßÄÖÜÀÁÉÈÍÌÓÒÚÙç]/;

# ----------- main -----------
# Walks the corpus directory, reduces each regular file to plain text and
# prints one tab-separated line per file:
#   <file>  IS DICT|NO DICT  <words not in lexicon>  <words in lexicon>  <ratio>  <name count>
# "NO DICT" lines are only printed when $base is 5.

# open dir
opendir( my $dh, $dir ) or die "cannot open directory $dir: $!";
print $dir, "\n", "-" x 50, "\n";

# ------ process files -------
# defined(): an entry literally named "0" must not end the loop.
while ( defined( $filename = readdir( $dh ) ) ) {
	next if $filename =~ /^\./;
	next if defined($oneFile) && $filename ne $oneFile;

	# Sub-directories and other non-regular entries cannot be slurped.
	my $path = $dir.$filename;
	next unless -f $path;

	last if $num_files-- == 0;

	print STDERR ".";
	print $filename.": "	if DEBUG;

	# read content
	# Three-argument open: characters in the file name can never be taken for
	# an open mode. $/ is only undefined for this one read (slurp mode).
	open( my $fh, '<:utf8', $path ) or die "cannot open $path: $!";
	my $text = do { local $/; <$fh> };
	close( $fh );
	$text = "" unless defined $text;	# failed read: treat as empty

	# decode
	# NOTE(review): guess_charset is not defined in the visible source
	# (presumably it came from the commented-out Astro helper) and the file
	# was already read through the :utf8 layer -- confirm before setting
	# USE_CLEAN to 0.
	if (!USE_CLEAN) {
		my $charset = guess_charset($path, "e");
		print STDERR "(".$charset.") "			if DEBUG;
		$text = decode_utf8( $text ) 			if $charset eq "UTF8";
		$text = decode( 'ascii', $text )		if $charset eq "ASCII";
		$text = decode( 'iso-8859-1', $text )	if $charset eq "ISO-1";
		$text = decode( 'iso-8859-15', $text )	if $charset eq "ISO-15";
		$text = decode( 'cp1252', $text )		if $charset eq "WINDOWS";
		$text = decode( 'MacRoman', $text )		if $charset eq "MAC";
	}

	# Reduce the markup to plain text. Entities are decoded before the tags
	# are stripped, so escaped markup such as "&lt;b&gt;" is removed as well.
	$text =~ tr/´`/''/;
	$text = decode_entities( $text );
	$text =~s#(?:<(head|script|style|noframes)[^>]*>.*?<\/\1>)|(?:<!--.*?-->)##gsi;
	$text =~ s/<[^>]*>/ /g;
	$text =~ s/\s+/ /g;

	my $not_ge = 0; # words not in the general corpus
	my $ge = 1; 	# words in the general corpus; used as divisor below, starts at 1
	my $name = 1;	# capitalised words that directly follow another capitalised word
	my $uppercase = 0;	# length of the current run of capitalised words

	# A word is a run of two or more $chars; the digit look-arounds are meant
	# to drop ordinals such as 1st, 5th, 2nd.
	while ( $text =~ /(?<!\d)((?:$chars){2,})(?!\d)/g ) {
		my $word = $1;

		if ($word eq ucfirst($word)) {
			$uppercase++;
		} else {
			$uppercase = 0;
		}

		# Consecutive capitalised words are taken to be names; only the first
		# word of such a run is looked up in the lexicon.
		if ($uppercase > 1) {
			$name++;
			next;
		}

		if (exists $english{lc($word)}) {
			$ge++;
		} else {
			$not_ge++;
		}
	}

	# output
	my $ratio = $not_ge/$ge;
	if ($ratio > 0.25 && $name/$ge < 0.5 && $ge > 30) {
		print "$filename    \tIS DICT\t$not_ge\t$ge\t", $ratio,"\t$name\n";
	} elsif ($base == 5) {
		print "$filename    \tNO DICT\t$not_ge\t$ge\t", $ratio,"\t$name\n";
	}
}
closedir( $dh );

# Fills the file-level %english with the lowercased entries of the word lists
# general_english, countries, cities and code under ../Wortlisten/ (UTF-8,
# one entry per line). Takes no arguments; dies if a list cannot be opened.
sub initHashes {
	# Always read line by line, even if the caller has already changed the
	# input record separator (e.g. to slurp whole files).
	local $/ = "\n";

	for my $list ( qw(general_english countries cities code) ) {
		my $path = "../Wortlisten/".$list.".txt";
		open( my $fh, '<:utf8', $path ) or die "cannot open $path: $!";
		while ( defined( my $line = <$fh> ) ) {
			chomp $line;
			$line =~ s/\r\z//;		# tolerate word lists saved with CRLF line ends
			next if $line eq "";	# blank lines carry no word
			$english{lc($line)} = 1;
		}
		close( $fh );
	}
	print STDERR ("initialised hash: ", 0+keys( %english ), "\n") 	if DEBUG;
}