#!/usr/bin/perl -w


# TODO: pronoun mit pronounPT vergleichen, nicht mit i normalisieren, sondern mit länge, suche nach DELETE

# ------------------------  HISTORY -----------------------
# 15.12.05	new			from find_glossary
# 29.01.06	added		pos

$VERSION = '15.12.05';

#--------------------------------------------------------------
# pragmas & modules

use strict;
use HTML::Entities;
use Encode;
require 5.004;
use utf8;

binmode(STDOUT, ":utf8"); 

#use lib ('../helpers');
#use Astro;

# --------- vars -------------
use constant DEBUG		=> 0;			# 0: off, 1: on
use constant USE_CLEAN 	=> 1;			# 0: off, 1: on

my $base = $ARGV[0] || "13";
my $dirBase = $ARGV[1] || "train";
my $oneFile = $ARGV[2] || undef;
my $dir = "../Korpus/".$dirBase."/".$base."/";
my $dirT = "../Korpus/tagged_".$dirBase."/".$base."/";

my $filename;
my $text = "";
my $num_files = 50;

my $nameExp = initNames();
my $archaicExp = initArchaic();
my $livingExp = initLiving();
my $speakExp = initSpeak();
my ($posExp,$negExp,$neutExp) = initAdj();
my $pronExp = qr/I|he|her|hers|herself|him|himself|his|it|its|itself|me|mine|my|myself|our|ours|ourself|ourselves|she|their|theirs|them|themselves|they|us|we|you|your|yours|yourself|yourselves/i;
my $pron1Exp = qr/me|mine|my|myself|our|ours|ourself|ourselves|us|we|I/i;
my $pron2Exp = qr/you|your|yours|yourself|yourselves/i;

# ----------- main -----------
undef $/;

# open dir
opendir( DH, $dir ) || die "dir not found!";
print $dir, "\n", "-" x 50, "\n";

# ------ process files -------
DIR: while ( $filename = readdir( DH ) ) {
	next unless !defined($oneFile) || $filename eq $oneFile;
	next if $filename =~ /^\./;
	last if $num_files-- == 0;
	
	print $filename."\n"	if DEBUG;
	print STDERR ".";
	
	
	# neu: noch machen! i ist anzahl der wörter
#	if ( (100*$pronounPT/$i > 6.0392 && 100*$present/$i <= 2.139)
#		  || (100*$pronounPT/$i <= 6.0392 && 100*$cardDigit/$i <= 0.3273 && 100*$past/$i > 0.9174) ) {
		
	# ------- calculate features  ----------
	my $text = "";
	my $pronoun = 0;
	my $name = 0;
	my $length = 0;
	my $adj= 0;
	my $pos_neg = 0;
	my $archaic = 0;
	my $sentence = 0;
	my $sent_num = 0;
	my $comma = 0;
	my $speech = 0;
	my $living = 0;
	my $number = 0;
	my $chapter = 0;
	my $colon = 0;
	my $speak = 0;
	my $negPT = 0;
	my $cardDigit = 0;
	my $contr = 0;	# 'll, n't, ...
	
	# read content
	open( FH, $dir.$filename ) || die "$filename does not exist";
	binmode(FH, ":utf8"); 
	$text = <FH>;
	close( FH );

	# unfug + select + links
	$text =~ s/(?:<(head|script|style|noframes|select|a)[^>]*>.*?<\/\1>)|(?:<\/?(?:font|i|u|em|div|big|blockquote|small|center|span)(?: [^>]+)?>)|(?:<!--.*?-->)//gsi;
	
	$text =~ s/&nbsp;/ /gio;

	my @subs;
	my @orig;
	# process <pre>
	while ( $text =~ m#<pre[^>]*>(.*?)</pre>#gsi ) {
		my $sub = $1;
		my $org = $1;
		$sub =~ s#[\n\r]#<br>#gs;
		push(@subs, $sub);
		push(@orig, as_quotemeta($org));
	}
	for (my $k=0; $k<@subs; $k++) {
		$text =~ s/$orig[$k]/$subs[$k]/;
	}
	$text =~ s/\s+/ /gio;
	$text =~ s#<([^> ]+)[^>]*>#<$1>#gio;
	
	$length = length($text);
	next DIR if $length < 4000;
	$text = substr( $text, 0, 100000 ) if $length > 100000;
	
	my @lines = split( /<br>|<br\/>|<p>|<\/p>|<div>|<\/div>|<td>|<\/td>/, $text );
	foreach (@lines) {
		$colon++	if m/^.{1,20}?:/;
	}
	my $lines = @lines;
	
	$text =~ s#<br>|<br/>|<p>|</p>|<div>|</div>|<td>|</td>#.#gio;
	
	$text = decode_entities( $text );
	$text =~ s/[`´‘’˚'`]/'/gio;
	$text =~ s/[“”]/"/gio;

	$text =~ s/<([^>]+?)>//gs;
		
	# Länge
	$length = length( $text );
	next DIR if $length < 4000;
	
	# POS
	my $present = 0;
	my $past = 0;
	my $pronounPT = 0;
	
	my $ft = $dirT."/".$filename;
	my $verb = `cat $ft | grep -c '\\WV'` || 0;
	next DIR if 1000*$verb/$length < 20;
	
	$pronounPT = `cat $ft | grep -c 'PP\$'`;
	
	# tense
	$past = `cat $ft | grep -c 'V[VBH]D\$'`;
	$present = `cat $ft | grep -c 'V[VHB]P\$'`;
	
	
	# sent, comma
	my @sent = split( /[.!?;]/, $text );
	foreach (@sent) {
		next if !m/\w+ \w+ \w+/; # satz: min 3 wörter
		$sentence += length;
		$sent_num++;
		while ( m/(,)|((?:\W["'])|(?:["']\W))/g ) {
			$comma++ if defined $1;
			$speech++ if defined $2;
		}
	}
	next DIR if $sent_num == 0;
	$sentence = sprintf( "%.3f", $sentence/$sent_num );

	
	# adj, pronouns, names, archaic, chapter
	my $pos = 0;
	my $neg = 0;
	my $neut = 0;
	my $konj = 0;
	my $pro1 = 0;
	my $pro2 = 0;
	while ( $text =~ /([a-z]+('[a-z]+)?|\d+[thnds.])/iog ) {	# changed 29.01
		$_ = lc($1);
		if (m/^[A-Z]/o && m/^$nameExp$|^$nameExp'/io) {	# changed 29.01
			$name++;
		} elsif (m/^$posExp$/io) {
			$pos++;
		} elsif (m/^$negExp$/io) {
			$neg++;
		} elsif (m/^$neutExp$/io) {
			$neut++;
		} elsif (m/^$pronExp$|^$pronExp'/io) { 	# changed 29.01
			$pronoun++;
			$pro1++ if m/^$pron1Exp$|^$pron1Exp'/io;
			$pro2++ if m/^$pron2Exp$|^$pron2Exp'/io;
		} elsif (m/^chapter$/io) {
			$chapter++;
		} elsif (m/^$speakExp$/io) {
			$speak++;
		} elsif (m/^$archaicExp$/io) {
			$archaic++;
		} elsif (m/^(no|nor|not)$/io) {
			$negPT++;
		} elsif (m/^\d/) {
			$cardDigit++;
		} elsif (m#n't|'ll|'d|'re|'ve#o) {
			$contr++;
		} elsif (m/^(and|or)$/o) {
			$konj++;
		}
	}
	
	$number = $cardDigit;	# DELETE THIS!
	$past = sprintf "%.3f", $past/$verb;
	$present = sprintf "%.3f", $present/$verb;
	
	$comma = sprintf "%.3f", $comma/$length*1000;
	$colon = sprintf "%.3f", $colon/$length*1000;
	$speech = sprintf "%.3f", $speech/$length*1000;
	$adj = sprintf "%.3f", ($pos+$neg+$neut)/$length*1000;
	($pos, $neg) = ($neg, $pos) if $pos < $neg;
	$pos_neg = $neg > 0 ? sprintf("%.3f", $pos/$neg) : sprintf("%.3f", $pos);
	my $pro_abs = $pronoun;
	$pronoun = sprintf "%.3f", $pronoun/$length*1000;
	$pro1 = sprintf "%.3f", $pro1/$length*1000;
	$pro2 = sprintf "%.3f", $pro2/$length*1000;
	$speak = sprintf "%.3f", $speak/$length*1000;
	$archaic = sprintf "%.3f", $archaic/$length*100000;
	$konj = sprintf "%.3f", $konj/$length*1000;
	$verb = sprintf "%.3f", $verb/$length*1000;
	
	# numbers
	$number = sprintf "%.3f", 1000*$number/$length;
	
	# the + living entity
	while ( $text =~ /\bthe ([a-z]{2,})/gi ) {
		$_ = lc($1);
		$living++ if m/^$livingExp$/io;
	}
	$name = sprintf "%.3f", ($name+$living)/$length*1000;
	my $SPN = $name + $pronoun + $speech;
	
	
	# final output
	if ( $length > 4000 && ($sentence > 30 && $comma > 9 && $colon > 0.4 || $sentence > 40) && $sent_num > 70 && 
		( $verb > 28 || $verb > 20 && $archaic > 2 && $comma > 10 ) && $adj > 19 && $pronoun > 5.5 &&  # gesteltzte sprache -> weniger verben
		( $name < 5 && $speak > 2 || $name < 1.5 ) && # viele Namen erlaubt, wenn viel gesprochen wird
		$konj > 3 && $konj+$comma > 9 && $comma > 4 && # nebensätze
		($past > 0.3 || $pro1/$pronoun > 0.4 || $speech > 3) && # ich-form oder dialoge
		$pos_neg < 6 && $number < 0.8 && ($colon < 0.9 && $speak > 2 || $colon < 0.5) && $speak > 0.1 && ($pos_neg > 3 && $SPN > 5.5 || $SPN > 13) ) {		
		print "$filename\t$sentence\t$sent_num\t$lines\t$comma\t$colon\t$number\t$verb\t$adj\t$pos_neg\t\t$konj\t$past\t$present\t$pronoun\t$pro1\t$pro2\t$speech\t$name\t$SPN\t$archaic\t$speak\t$length\n";
	} elsif ( $base == 13 ) {
		print "$filename\t$sentence\t$sent_num\t$lines\t$comma\t$colon\t$number\t$verb\t$adj\t$pos_neg\t\t$konj\t$past\t$present\t$pronoun\t$pro1\t$pro2\t$speech\t$name\t$SPN\t$archaic\t$speak\t$length NO\n";
 	}
}
closedir( DH );

# ++++++++++ FUN, FUN, FUN ++++++++++++ # 

sub initNames {
	my $reg = "(?:";
	open( L1, "../Wortlisten/names.txt" ) || die $!;
	binmode(L1, ":utf8"); 
	while ( <L1> ) {
		chomp;
		$reg .= "$_|";
	}
	chop $reg;
	$reg .= "|Herb)";
	close( L1 );
	return $reg;
}
sub initArchaic {
	my $reg = "(?:";
	open( L1, "../Wortlisten/archaic.txt" ) || die $!;
	binmode(L1, ":utf8"); 
	while ( <L1> ) {
		chomp;
		$_ = lc($_);
		$reg .= "$_|";
	}
	chop $reg;
	$reg .= ")";
	close( L1 );
	return $reg;
}
sub initLiving {
	my $reg = "(?:";
	open( L1, "../Wortlisten/humans.txt" ) || die $!;
	binmode(L1, ":utf8"); 
	while ( <L1> ) {
		chomp;
		$_ = lc($_);
		$reg .= "$_|";
	}
	open( L1, "../Wortlisten/animals.txt" ) || die $!;
	binmode(L1, ":utf8"); 
	while ( <L1> ) {
		chomp;
		$_ = lc($_);
		$reg .= "$_|";
	}
	chop $reg;
	$reg .= ")";
	close( L1 );
	return $reg;
}
sub initSpeak {
	my $reg = "(?:";
	open( L1, "../Wortlisten/speak.txt" ) || die $!;
	binmode(L1, ":utf8"); 
	while ( <L1> ) {
		chomp;
		$_ = lc($_);
		$reg .= "$_|";
	}
	chop $reg;
	$reg .= ")";
	close( L1 );
	return $reg;
}
sub initAdj {
	my $pos = "(?:";
	my $neg = "(?:";
	my $neut = "(?:";
	open( NG, "../Wortlisten/adjectives_neg.txt" ) || die "can't find neg.txt";
	binmode(NG, ":utf8"); 
	while ( <NG> ) {
		chomp;
		$_ = lc($_);
		$neg .= "$_|";
	}
	close( NG );

	open( PS, "../Wortlisten/adjectives_pos.txt" ) || die "can't find pos.txt";
	binmode(PS, ":utf8"); 
	while ( <PS> ) {
		chomp;
		$_ = lc($_);
		$pos .= "$_|";
	}
	close( PS );

	open( NT, "../Wortlisten/adjectives_neut.txt" ) || die "can't find neut.txt";
	binmode(NT, ":utf8"); 
	while ( <NT> ) {
		chomp;
		$_ = lc($_);
		$neut .= "$_|";
	}
	close( NT );
	chop ($pos,$neg,$neut);
	$pos .= ")";
	$neg .= ")";
	$neut .= ")";
	return ($pos,$neg,$neut);
}

sub as_quotemeta {
	my $t = shift;
	$t =~ s/\\/\\\\/gs;
	$t =~ s/\[/\\\[/gs;
	$t =~ s/\]/\\\]/gs;
	$t =~ s/\{/\\\{/gs;
	$t =~ s/\}/\\\}/gs;
	$t =~ s/\?/\\\?/gs;
	$t =~ s/\+/\\\+/gs;
	$t =~ s/\*/\\\*/gs;
	$t =~ s/\./\\\./gs;
	$t =~ s/\-/\\\-/gs;
	$t =~ s/\$/\\\$/gs;
	$t =~ s/\^/\\\^/gs;
	$t =~ s/\//\\\//gs;
	$t =~ s/\(/\\\(/gs;
	$t =~ s/\)/\\\)/gs;
	return $t;
}