#!/usr/bin/perl -w

# erkennt offizielle reports

# ------------------------  HISTRORY -----------------------
# 06.02.06	new			created from find_minutes.pl, version 07.01.06

$VERSION = '06.02.06';

#------------------------------------------------------------------------------
# Standard pragmas & CPAN modules
#------------------------------------------------------------------------------

use strict;
use HTML::Entities;
use Encode;
use Encode::Byte;
require 5.004;
use utf8;

#use lib ('../helpers');
#use Astro;

binmode(STDOUT, ":utf8"); 

# --------- vars -------------
use constant DEBUG 		=> 0;			# 0: off, 1: on
use constant USE_CLEAN 	=> 1;			# 0: off, 1: on

my $base = $ARGV[0] || 41;
my $dirBase = $ARGV[1] || "train";
my $oneFile = $ARGV[2] || undef;
my $dir = "../Korpus/".$dirBase."/".$base."/";
my $dirT = "../Korpus/tagged_".$dirBase."/".$base."/";

my $filename;
my $num_files = 50;

# exp
my $pro1Exp = qr#I|my|me|myself|mine#io;
my $pro2Exp = qr#you|your|yours|yourself|yourselves#io;
my $pro3Exp = qr#he|she|his|her|hers|herself|himself#io;

my $timeNameExp = qr#year|annual|month|monthly#io;

my $day_e_ord = qr/([1-3]?(1st|2nd|3rd|[04-9]th)|11th|12th|13th)/;
my $month_e = qr/(?:January|February|March|April|May|June|July|August|September|October|November|December|Jan\.?|Feb\.?|Mar\.?|Apr\.?|May|Jun\.?|Jul\.?|Aug\.?|Sep\.?|Oct\.?|Nov\.?|Dec\.?|Sept\.?)/;
my $year = qr/((?:19|20)[0-9]{2}|'\d{2}|[901]\d)/;
my $sep_e = qr/[.\/-]/;
my $day_num = qr/([1-9]|0[1-9]|[12][0-9]|3[01])/;
my $month_num = qr/0[1-9]|10|11|12|[1-9]|Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|Sept/;
my $time = qr#([01][0-9]|2[0-3])[-.:][0-5][0-9]#;

my $dateExp = qr/($day_e_ord $month_e( $year)?|$month_e $day_e_ord(, $year)?|$month_e $day_num(, $year)?|$day_num$sep_e$month_num$sep_e$year|$month_num$sep_e$day_num$sep_e$year|$year)/;

my $moneyExp = qr/(?:(\$|£|€|eur) ?\d+(,\d{3})*([,\.]\d{2})?)|(?:\d+(,\d{3})*(\.\d{2})? eur)/io;

undef $/;

# open dir
opendir( DH, $dir ) || die "$dir not found!";
print $dir, "\n", "-" x 50, "\n";

# ------ process files -------
DIR: while ($filename = readdir( DH )) {
	next unless !defined($oneFile) || $filename eq $oneFile;
	next if $filename =~ /^\./;
	last if $num_files-- == 0;
	print STDERR ".";
	print $filename.": " if DEBUG;
	
	# ------- calculate features  ----------
	my $bibs = 0;
	
	# read content
	open( FH, $dir.$filename ) || die "$filename does not exist";
	binmode(FH, ":utf8"); 
	my $text = <FH>;
	close( FH );
	

	# remove several tags (usual nonsense + select,input-hidden)
	$text =~ s/\s+/ /gsio;
	$text =~ s/<input[^>]*(hidden|submit|reset|radio|checkbox)[^>]*>//gsio;
	$text =~ s/<(\w+)[ :][^>]*>/<$1>/gio;
	$text =~s#(?:<(head|script|style|noframes|select)[^>]*>.*?<\/\1>)|(?:</?(?:font|b|i|u|em|div|center|span|strong|o|yle|st1|![^>]+)(?:[ :=][^>]+)?/?>)|(?:<!--.*?-->)##gsio;
	$text =~ s/&nbsp;/ /gsoi;
	$text =~ s/\s+/ /gsio;
	$text = decode_entities( $text );
	$text =~ s/[`´‘’˚']/'/gio;
	$text =~ s/[“”]/"/gio;
	
	my $noLinks = $text;
	$noLinks =~ s#<a>.*?</a>##io;
	$noLinks =~ s#<br( [^>]*)?>|<p( [^>]*)?># #io;
	$noLinks =~ s/<[^>]+>//go;
		
	# get Features
	my $length = 0;
	my $you = 0;
	my $ing = 0;
	my $ord = 0;
	my $pro1 = 0;
	my $pro2 = 0;
	my $pro3 = 0;
	my $form = 0;
	my $verb = 0;
	my $past = 0;
	my $present = 0;
	my $time = 0;
	my $kontr = 0;	# kontraktion 'll etc.
	my $quest = 0;
	my $timeName = 0;
	my $keywords = 0;
	
	
	# ordinalzahlen, form-elemente	
	$form = ($text =~ s/<input[^>]*>//gio) || 0;
	$ord = ($text =~ s/<li[^>]*>//gio) || 0;
	
	next DIR if $form > 8;
	
	# länge
	$text =~ s/<[^>]+>//go;
	$length = length( $text );
	next DIR if $length < 1000;
	
	# ?
	{
	my $k = 0;
	while ($k >= 0) {
		$k = index($text,'?',$k+1);
		$quest++ if $k > 0;
	}
	}
	
	# date
	while ( $noLinks =~ m#(?<=[^\w.:/-])$dateExp(?=[^\w.:/-])#gio ) {
		$time++;
	}
	
	# keywords am anfang
	$noLinks = substr( $noLinks, 0, 1000 );
	
	my $keyNeg = 0;
	while ($noLinks =~ /([a-z]+)/gio) {
		my $m = $1;
		if ($m =~ /^report$/io) {
			$keywords++;
		} elsif ($m =~ /^(Minutes|Meetings?|MEETINGS?|MINUTES)$/ && $keywords == 0) {
			$keyNeg++;
		}
	}
	$keywords--	if $keyNeg > 0;
	next DIR if $keywords < 0;
	
	# words
	my $badwords = 0;
	my $vocab = 0;
	my $vague = 0;
	my $cond = 0;
	my $money = 0;
	
	while ( $text =~ /([a-z0-9'.%\$£€]+)/iog ) {
		my $m = $1;
		$kontr++ if $m =~ /n't|'ll|'d|'re|'ve/o;
		$cond++ if $m =~ /^(would|should|could|will)$/io;
		$vocab++ if $m =~ /^And$/o;
		$vocab++ if $m =~ /^(well|really|folks|great|dreamed|fear|feared|guess)$|^".*?"$|^'.*?'$/io;
		$badwords++ if $m =~ /^(fool|fooled|stupid|stupidity|crazy|crazyness|foolish|laughable|faulty|bloated|coward|liar|blame|sick)$/io;
		$vague++ if $m =~ /^(something|somehow|however|mysterious|perhaps)$/io;
		$pro1++ if $m =~ /^($pro1Exp)$/io;
		$pro2++ if $m =~ /^($pro2Exp)$/io;
		$pro3++ if $m =~ /^($pro3Exp)$/io;
		$ord++ 	if $m =~ /^\d+\.$|^II+\.?$/io;
		$money++ if $m =~ /^($moneyExp)$/io;
		$timeName++ if $m =~ /^($timeNameExp)$/io;
	}
	
	# pos + zeiten
	my $ft = $dirT."/".$filename;
	$verb = `cat $ft | grep -c '\\WV'`;
	next DIR if $verb == 0;
	$past = `cat $ft | grep -c 'V[VBH][ND]\$'`;
	$present = `cat $ft | grep -c 'VV[PZ]\$'`; # ohne have und be, wegen "has said" etc.

	$past = sprintf "%.3f", $past/$verb;
	$present = sprintf "%.3f", $present/$verb;

	
	my $norm = $length;
	
	$kontr = sprintf "%.3f", 1000*$kontr/$norm;
	$quest = sprintf "%.3f", 1000*$quest/$norm;
	$cond = sprintf "%.3f", 1000*$cond/$norm;
	$vocab = sprintf "%.3f", 1000*$vocab/$norm;
	$badwords = sprintf "%.3f", 1000*$badwords/$norm;
	$vague = sprintf "%.3f", 1000*$vague/$norm;
	$pro1 = sprintf "%.3f", 1000*$pro1/$norm;
	$pro2 = sprintf "%.3f", 1000*$pro2/$norm;
	$pro3 = sprintf "%.3f", 1000*$pro3/$norm;
	$time = sprintf "%.3f", 1000*$time/$norm;
	$timeName = sprintf "%.3f", 1000*$timeName/$norm;
	$verb = sprintf "%.3f", 1000*$verb/$norm;
	$money = sprintf "%.3f", 1000*$money/$norm;
	my $ordR = sprintf "%.3f", 1000*$ord/$norm;
	
	my $lang = sprintf "%.3f", $kontr+$vague+$vocab;
	
	$filename = substr( $filename,0,10 );
	
	if 	($length > 1000 && $past > $present && $past > 0.25 && $verb < 30 &&
		( $verb > 14 || $verb > 10 && $money > 1 && $ordR > 1) && # viele stats -> weniger verben
		$lang < 1.2 && $badwords < 0.03 && $kontr < 0.03 && $quest < 0.2 && $vague < 0.6 &&
		( $ordR > 0.15 || $ord > 15 || $money > 0 ) &&
		( $pro1 < 0.5 || $pro3 < 0.3 && $pro2 < 0.05 ) && 
		$pro2 < 0.25 && $pro3 < 2.1 && $time > 0.1 && $cond < 2.5 &&
		( $keywords > 0 || $time > 2 && $timeName > 0.1 && $ordR > 0.8 ) && # strenger, wenn keyword fehlt
	1) {		
		print "$filename:\t$verb\t$money\t$past\t$present\t$pro1\t$pro2\t$pro3\t$lang\t$badwords\t$kontr\t$cond\t$vague\t$vocab\t$time\t$timeName\t$ord\t$ordR\t$quest\t$keywords\t$length\n";
	} elsif ($base == 41) {
		print "$filename:\t$verb\t$money\t$past\t$present\t$pro1\t$pro2\t$pro3\t$lang\t$badwords\t$kontr\t$cond\t$vague\t$vocab\t$time\t$timeName\t$ord\t$ordR\t$quest\t$keywords\t$length NO\n";
	}
}

close( DH );

