#!/usr/bin/perl -w

# erkennt brief, mail, rede


# ------------------------  HISTRORY -----------------------
# 29.12.05	new			created from find_poem.pl, version 29.11.
# 31.01.06	changed 	dir -> base
#			added		POS

$VERSION = '31.01.06';

#------------------------------------------------------------------------------
# Standard pragmas & CPAN modules
#------------------------------------------------------------------------------

use strict;
use HTML::Entities;
use Encode;
use Encode::Byte;
require 5.004;
use utf8;

#use lib ('../helpers');
#use Astro;

binmode(STDOUT, ":utf8"); 

# --------- vars -------------
use constant DEBUG 		=> 0;			# 0: off, 1: on
use constant USE_CLEAN 	=> 1;			# 0: off, 1: on

my $base = $ARGV[0] || "34";
my $dirBase = $ARGV[1] || "train";
my $oneFile = $ARGV[2] || undef;
my $dir = "../Korpus/".$dirBase."/".$base."/";
my $dirT = "../Korpus/tagged_".$dirBase."/".$base."/";

my $filename;
my $num_files = 50;
my $titles = qr/Mr|Mrs|Ms|Dr|Prof|Rev|Bishop|Archbishop|Father|Lord|Earl|Duke|Count|Countess|Baron|Baroness|Governor|President/;
my $pronExp = qr/I|he|her|hers|herself|him|himself|his|it|its|itself|me|mine|my|myself|our|ours|ourself|ourselves|she|their|theirs|them|themselves|they|us|we|you|your|yours|yourself|yourselves/i;
my $pron1Exp = qr/I|my|me|myself|mine/i;
my $pron2Exp = qr/you|your|yourself|yourselves/i;
my $pron3Exp = qr/he|her|hers|herself|him|himself|his|it|its|itself|she/i;
my $pron1PExp = qr/our|ours|ourself|ourselves|us|we/i;
my $pron3PExp = qr/their|theirs|them|themselves|they/i;
my $measureExp = qr#tablespoons?|cups?|teaspoons?|oz|tbsp|tsp#io; ## incomplete
my $keyAnleitungExp = qr#how to|tutorials?|guidelines?|using|recipes?|FAQ#io;

undef $/;

# open dir
opendir( DH, $dir ) or die "$dir not found!";
print $dir, "\n", "-" x 50, "\n";

# ------ process files -------
DIR: while ($filename = readdir( DH )) {
	next unless !defined($oneFile) || $filename eq $oneFile;
	next if $filename =~ /^\./;
	last if $num_files-- == 0;
	print STDERR  ".";
	
	# ------- calculate features  ----------
	# read content
	open( FH, $dir.$filename ) || die $!;
	binmode(FH, ":utf8"); 
	my $text = <FH>;
	close( FH );
	
	# remove several tags (_not_ usual nonsense)
	$text =~ s#(?:<(head|script|style|noframes|select|textarea|ul|ol|dl)[^>]*>.*?<\/\1>)|(?:<!--.*?-->)|(?:</?(?:big|small|font|b|i|u|em|strong)(?:[ :=][^>]+)?/?>)##gsio;
	$text =~ s/&nbsp;/ /goi;
	$text =~ s/<([a-z]+) [a-z][^>]*>/<$1>/giso;
	$text =~ s/<a>.*?<\/a>//giso;	
	
	next if length ($text) > 200000;
	
	# process <pre>
	my @subs;
	my @orig;
	while ( $text =~ m#<pre[^>]*>(.*?)</pre>#gsi ) {
		my $sub = $1;
		my $org = $1;
		$sub =~ s#[\n\r]#<br>#gs;
		push(@subs, $sub);
		push(@orig, as_quotemeta($org));
	}
	for (my $k=0; $k<@subs; $k++) {
		$text =~ s/$orig[$k]/$subs[$k]/;
	}
	$text =~ s/\s+/ /gio;
	
	# get lines (ending with: <br>, <p>, </div>)
	my @lines = split /<br>|<p>|<\/p>|<\/div>|<\/tr>|<tr>|<br\/>|<td>|<\/td>/i, $text;
	
	my $endOfStart = @lines < 50 ? @lines/2 : 25;
	my $startOfEnd = @lines < 50 ? @lines/2 : @lines-10;
	my $i = 0;
	
	# get features - words
	my $hi = 0;
	my $bye = 0;
	my $pro1 = 0;
	my $pro1P = 0;
	my $pro2 = 0;
	my $pro3 = 0;
	my $pro3P = 0;
	my $pro_all = 0;
	my $official = 0;
	my $len = 0;
	my $hereNow = 0;
	my $excl = 0;
	my $quest = 0;
	my $ing = 0;
	my $keywords = 0;
	my $keyAnleitung = 0;
	my $num = 0;
	my $measure = 0;
	
	foreach (@lines) {
 		s/<[^>]+>//gio;	# delete tags
 		$_ = decode_entities( $_ );
 		if ( $_ !~ m/[a-z]{4,}/i ) {
 			$startOfEnd--;
 			next; # skip blank lines
		}
 		$i++;
 		
		# am textanfang
		if ( $i < $endOfStart ) {
			if ( m/^ *(Dear|Hi|Hello|Good (morning|afternoon|evening))\b/o ) {
				$hi++;
			} elsif (m /^ *(To the Editor|To whom it may concern|(an )?open letters?|Address on|Madame|Mister|President)\b/oi ) {
				$hi++;
				$official++;
			} elsif ( m /\b$keyAnleitungExp\b/io ) {
				$keyAnleitung++;
			}
		}
		
		# pronomen, deixis
 		while ( m/([a-z]+|\d+[thrds.]?)/gio ) {
 			my $m = $1;
 			$pro1++ 	if $m =~ m/^($pron1Exp)$/io;
 			$pro2++ 	if $m =~ m/^($pron2Exp)$/io;
 			$pro3++ 	if $m =~ m/^($pron3Exp)$/io;
 			$pro1P++ 	if $m =~ m/^($pron1PExp)$/io;
 			$pro3P++ 	if $m =~ m/^($pron3PExp)$/io;
 			$pro_all++ 	if $m =~ m/^($pronExp)$/io;
 			$hereNow++	if $m =~ m/^(here|now|today)$/io;
 			$ing++		if $m =~ m/ing$/io;
 			$num++ 		if $m =~ m/^\d+$/io;
 		 	$measure++	if $m =~ m/^$measureExp$/io;
 			$keywords++ if $m =~ m/^(writing|lines|readers|written|editors?|response|received|saying)$/io;
 		}

		# !?
		while (m/(\?+)/go) {
			$quest++;
		}
		while (m/(!+)/go) {
			$excl++;
		}
		
		# am textende
		if ($i > $startOfEnd && m/(^ *(Thank you|Sincerely|Regards|Best [Rr]egards|Yours [Ss]incerely|Yours faithfully|Best [Ww]ishes|Best to all)(?:\W|$))|^ ?(Signed|SIGNED) ?$/o) {
			$bye++;
		}
	}

	$text =~ s/<[^>]+>//gio;
	$len = length ($text);
	next DIR if $len < 500 || $pro_all == 0;
		
	# get features - POS, zeit
	my $ft = $dirT."/".$filename;
	my $verb = `cat $ft | grep -c '\\WV'` || 0;
	next DIR if $verb == 0;
	
	if (1000*$verb/$len > 35) {	# sehr viele verben: diverse sachen abwerten
		$pro1 *= 0.6;
		$pro2 *= 0.6;
		$pro3 *= 0.6;
		$pro1P *= 0.6;
		$pro3P *= 0.6;
		$pro_all *= 0.6;
		$quest *= 0.8;
		$excl *= 0.6;
	}
	
	my $past = 0;
	my $hasBeen = 0;
	my $arePP = 0;
	my $present3 = 0;
	my $present = 0;
	my $det = 0;
	my $pronounT = 0;
	
	$det = `cat $ft | grep -c 'DT\$'`;
	$pronounT = `cat $ft | grep -c 'PP\$'`;

	$hasBeen = `cat $ft | grep -A 1 '^has' | grep -c '^been'`;
	$arePP = `cat $ft | grep -A 1 '^are' | grep -c 'V[VBH][ND]\$'`;
	$past = `cat $ft | grep -c 'V[VBH][ND]\$'`;
	$present3 = `cat $ft | grep -c 'V[VHB]Z\$'`;
	$present = `cat $ft | grep -c 'V[VHB]P\$'`;
	
	$past = sprintf "%.3f", $past/$verb;
	$present = sprintf "%.3f", $present/$verb;
	$present3 = sprintf "%.3f", $present3/$verb;
	$hasBeen = sprintf "%.3f", $hasBeen/$verb;
	$arePP = sprintf "%.3f", $arePP/$verb;
	$ing = sprintf "%.3f", $ing/$verb;
	
	$num = sprintf("%.3f", $num/$len*1000);
	$pro1 = sprintf("%.3f", $pro1/$len*1000);
	$pro2 = sprintf("%.3f", $pro2/$len*1000);
	$pro3 = sprintf("%.3f", $pro3/$len*1000);
	$pro1P = sprintf("%.3f", $pro1P/$len*1000);
	$pro3P = sprintf("%.3f", $pro3P/$len*1000);
	$pro_all = sprintf("%.3f", $pro_all/$len*1000);
	$hereNow = sprintf("%.3f", $hereNow/$len*10000);
	$excl = sprintf("%.3f", $excl/$len*10000);
	$quest = sprintf("%.3f", $quest/$len*10000);

	
	$det = sprintf "%.3f", 1000*$det/$len;
	$verb = sprintf "%.3f", 1000*$verb/$len;
	$pronounT = sprintf "%.3f", 1000*$pronounT/$len;
	
	# if & out
	$filename = substr($filename,0,10);	

	if ( ($hi > 0 || $bye > 0 || $keywords > 0) && ($pro1 > 1 || $official > 0) && # offizieller text oder in 1sing
		$pro3+$pro3P < 6 && $pro3P < 2 && $hasBeen < 0.02 && $arePP < 0.08 && $present3 < 0.3 && 
		($excl+$quest > 4 || $hereNow > 2 || $official > 0) && $verb > 16 && ($num < 4 || $num < 9 && $len < 10000 || $num < 12 && $len < 2000) && 
		($pro2 > 0.5 || $official > 0) &&
		($quest < 15 || $quest < 22 && $pro2 < 3 ) && # vs. Interview, FAQ,...
		$measure < 3 && $keyAnleitung < 4 && # vs Anleitung
		1) {
		
		print "$filename\t$hi\t$official\t$bye\t$quest\t$excl\t$pro1\t$pro2\t$pro3\t$pro1P\t$pro3P\t$pro_all\t$verb\t$past\t$present\t$present3\t$ing\t$hasBeen\t$arePP\t$det\t$hereNow\t$num\t$measure\t$keywords\t$keyAnleitung\t$len\n";
	} elsif ($base == 34) {
		print "$filename\t$hi\t$official\t$bye\t$quest\t$excl\t$pro1\t$pro2\t$pro3\t$pro1P\t$pro3P\t$pro_all\t$verb\t$past\t$present\t$present3\t$ing\t$hasBeen\t$arePP\t$det\t$hereNow\t$num\t$measure\t$keywords\t$keyAnleitung\t$len NO\n";
	}
}
close( DH );

sub as_quotemeta {
	my $t = shift;
	$t =~ s/\\/\\\\/gs;
	$t =~ s/\[/\\\[/gs;
	$t =~ s/\]/\\\]/gs;
	$t =~ s/\{/\\\{/gs;
	$t =~ s/\}/\\\}/gs;
	$t =~ s/\?/\\\?/gs;
	$t =~ s/\+/\\\+/gs;
	$t =~ s/\*/\\\*/gs;
	$t =~ s/\./\\\./gs;
	$t =~ s/\-/\\\-/gs;
	$t =~ s/\$/\\\$/gs;
	$t =~ s/\^/\\\^/gs;
	$t =~ s/\//\\\//gs;
	$t =~ s/\(/\\\(/gs;
	$t =~ s/\)/\\\)/gs;
	return $t;
}
