#!/usr/bin/perl -w

# erkennt glosse

# ------------------------  HISTRORY -----------------------
# 19.01.06	new			created from find_comment.pl, version 15.01.06
# 22.01.06	added		count hi and bye (filter letter)

$VERSION = '19.01.06';

#------------------------------------------------------------------------------
# Standard pragmas & CPAN modules
#------------------------------------------------------------------------------

use strict;
use HTML::Entities;
use Encode;
use Encode::Byte;
require 5.004;
use utf8;

#use lib ('../helpers');
#use Astro;

binmode(STDOUT, ":utf8"); 

# --------- vars -------------
use constant DEBUG 		=> 0;			# 0: off, 1: on
use constant USE_CLEAN 	=> 1;			# 0: off, 1: on

my $base = $ARGV[0] || "18";
my $dirBase = $ARGV[1] || "train";
my $oneFile = $ARGV[2] || undef;
my $dir = "../Korpus/".$dirBase."/".$base."/";
my $dirT = "../Korpus/tagged_".$dirBase."/".$base."/";

my $filename;
my $num_files = 50;

# exp
my $ordExp = qr#first|second|third|fourth|fifth|sixth|seventh|eighth|ninth|\d*1st|\d*2nd|\d*3rd|\d+th|\d{1,2}\.|11th|12th|13th|II+#oi;
my $pro1Exp = qr#I|my|me|myself|mine#io;
my $nameExp = initNames();
my $causalExp = initCause();
my ($posExp,$negExp,$neutExp) = initAdj();

my $day_e_ord = qr/([1-3]?((1st)|(2nd)|(3rd)|([04-9]th)))/;
my $month_e = qr/(?:January|February|March|April|May|June|July|August|September|October|November|December|Jan\.?|Feb\.?|Mar\.?|Apr\.?|May|Jun\.?|Jul\.?|Aug\.?|Sep\.?|Oct\.?|Nov\.?|Dec\.?|Sept\.?)/;
my $year = qr/((?:19|20)[0-9]{2}|'\d{2}|[901]\d)/;
my $sep_e = qr/[.\/-]/;
my $day_num = qr/([1-9]|0[1-9]|[12][0-9]|3[01])/;
my $month_num = qr/0[1-9]|10|11|12|[1-9]|Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|Sept/;
my $time = qr#([01][0-9]|2[0-3])[-.:][0-5][0-9]#;
my $day = qr#yesterday|today#i;

my $dateExp = qr/($day, ?$time|$day_e_ord $month_e( $year)?|$month_e $day_e_ord(, $year)?|$month_e $day_num(, $year)?|$day_num$sep_e$month_num$sep_e$year|$month_num$sep_e$day_num$sep_e$year)/;

undef $/;

# open dir
opendir( DH, $dir ) || die "$dir not found!";
print $dir, "\n", "-" x 50, "\n";

# ------ process files -------
DIR: while ($filename = readdir( DH )) {
	next unless !defined($oneFile) || $filename eq $oneFile;
	next if $filename =~ /^\./;
	last if $num_files-- == 0;
	print STDERR ".";
	print $filename.": " if DEBUG;
	
	# ------- calculate features  ----------
	my $bibs = 0;
	
	# read content
	open( FH, $dir.$filename ) || die "$filename does not exist";
	binmode(FH, ":utf8"); 
	my $text = <FH>;
	close( FH );
		
	# decode
	if (!USE_CLEAN) {
		my $charset = guess_charset($dir.$filename, "e");
		print STDERR "(".$charset.") "			if DEBUG;
		$text = decode_utf8( $text ) 			if $charset eq "UTF8";
		$text = decode( 'ascii', $text )		if $charset eq "ASCII";
		$text = decode( 'iso-8859-1', $text )	if $charset eq "ISO-1";
		$text = decode( 'iso-8859-15', $text )	if $charset eq "ISO-15";
		$text = decode( 'cp1252', $text )		if $charset eq "WINDOWS";
		$text = decode( 'MacRoman', $text )		if $charset eq "MAC";
	}
	
	# remove several tags (usual nonsense + select,input-hidden)
	$text =~ s/\s+/ /gsio;
	$text =~ s/<input[^>]*(hidden|submit|reset|radio|checkbox)[^>]*>//gsio;
	$text =~ s/<(\w+)[ :][^>]*>/<$1>/gio;
	$text =~s#(?:<(head|script|style|noframes|select)[^>]*>.*?<\/\1>)|(?:</?(?:font|b|i|u|em|div|center|span|strong|o|yle|st1|![^>]+)(?:[ :=][^>]+)?/?>)|(?:<!--.*?-->)##gsio;
	$text =~ s/&nbsp;/ /gsoi;
	$text =~ s/\s+/ /gsio;
	$text = decode_entities( $text );
	$text =~ s/[`´‘’˚']/'/gio;
	$text =~ s/[“”]/"/gio;
	
	
	# get Features
	my $length = 0;
	my $you = 0;
	my $ord = 0;
	my $name = 0;
	my $pro1 = 0;
	my $form = 0;
	my $verb = 0;
	my $past = 0;
	my $conj = 0;
	my $present = 0;
	my $causal = 0;
	my $time = 0;
	my $adj = 0;
	my $adjP = 0;
	my $adjN = 0;
	my $kontr = 0;	# kontraktion 'll etc.
	my $quest = 0;
	my $sent = 0; # anzahl . am wortende
	my $cond = 0; # konditional: would,should, could
	my $digit = 0;
	my $review = 0;
	my $he = 0;
	my $badwords = 0;
	my $argue = 0;
	my $vocab = 0;
	my $vague = 0;
	my $common = 0;
	my $neg = 0;
	my $keywords = 0;
	my $voc2 = 0;
	
	# ordinalzahlen, form-elemente, links
	my $noLinks = $text;
	$noLinks =~ s#<a>.*?</a>##gio;
	$text =~ s/<[^>]+>//go;
	$length = length( $text );
	next DIR if $length > 40000 || $length < 1500;
	
	# ?
	{
	my $k = 0;
	while ($k >= 0) {
		$k = index($text,'?',$k+1);
		$quest++ if $k > 0;
	}
	}
	
	# date
	while ( $noLinks =~ /\s$dateExp\s/go ) {
		$time++;
		next DIR if 1000*$time/$length > 0.9;
	}
	
	# hi und bye
	my $start = substr($noLinks,0,1000);
	my $end = substr($noLinks,length($noLinks)-1000,1000);
	
	while ( $start =~ /\b(Dear|Hi|Hello|Madame|Mister|Good (morning|afternoon|evening)|To the Editor|To whom it may concern|(an )?open letters?|Address on)\b/gio ) {
		next DIR;
	}
	
	while ( $end =~ /\b(Thank you|Sincerely|Regards|Best [Rr]egards|Yours [Ss]incerely|Yours faithfully|Best [Ww]ishes|Best to all|Signed|SIGNED)\b/o ) {
		next DIR;
	}
	
	# words
	
	while ( $text =~ /([a-z0-9]+('[a-z]+)?)/iog ) {
		my $m = $1;
		$causal++ if $m =~ /^$causalExp$/io;
		$adj++	if $m =~ /^$neutExp$/io;
		$adjN++ if $m =~ /^($negExp)$/io;
		$adjP++ if $m =~ /^($posExp)$/io;
		$kontr++ if $m =~ /n't|'ll|'d|'re|'ve/o;
		$cond++ if $m =~ /^(would|should|could|will)$/io;
		$argue++ if $m =~ /^But$/o;
		$argue++ if $m =~ /^(finally|neveretheless|admit|actually|yes|despite|indeed|unless|either|question|aside|way|theory|furthermore)$/io;
		$vocab++ if $m =~ /^And$/o;
		$vocab++ if $m =~ /^(well|really|folks|great|only|this|dreamed|fear|feared|enough|guess)$|^'.*?'$/io;
		$voc2++ if $m =~ /^(saw|recognized?|recognizing|wonder(ed|ing)?|noticed?|noticing|hear(d|ing)?|see(ing)?|noted?|noting|spot|spott(ed|ing)|realized?|realizing|perceived?|perceiving|glimpse|discover(ed|ing)?|overhear(d|ing)?|listen(ed|ing)?|detect(ed|ing)?|observed?|observing)$/;
		$badwords++ if $m =~ /^(drunk|drunken|wasteful|irresponsibility|irresponsible|fool|fooled|stupid|stupidity|crazy|crazyness|foolish|laughable|faulty|bloated|coward|liar|blame|sick)$/io;
		$common++ if $m =~ /^(they|their|always|never|every|worst|worse|good|best|better|bad)$/io;
		$vague++ if $m =~ /^(something|somehow|however|mysterious|perhaps)$/io;
		$neg++ if $m =~ /^(no|nor|not)$|n't$/io;
		$pro1++ if $m =~ /^($pro1Exp)$/io;
		$you++ if $m =~ /^(your?|yourself|yourselves)$/io;
		$review++ if $m =~ /^(book|film|cd|disc)$/io; # gegen 20
		$name++ if $m =~ /^($nameExp)$/o;
		$he++ if $m =~ /^(he|she|his|her|himself|herself|they|their|themselves)$/io; # changed here: 3 pl
		$keywords++ if $m =~ /^(columns?|columnist|irony|sarcasm|sarcastic(ally)?|ironic(ally)?)$/io;
		$ord++ if $m =~ /^($ordExp)$/io;
		next DIR if 1000*$adj/$length > 28 || 1000*$kontr/$length > 5 || $review > 3 || 1000*$review/$length > 0.9;
		next DIR if 1000*$vague/$length > 0.8 || 1000*$ord/$length > 1;
	}
	$adj += $adjP + $adjN;
	
	# pos + zeiten
	my $ft = $dirT."/".$filename;
	$verb = `cat $ft | grep -c '\\WV'`;
	next DIR if $verb == 0;
	$past = `cat $ft | grep -c 'V[VBH][ND]\$'`;
	$present = `cat $ft | grep -c 'V[VHB][PZ]\$'`;
	$present -= `cat $ft | grep -A 1 'V[BH][PZ]\$' | grep -c 'V[VBH][ND]'`; # ohne "has said"

	$past = sprintf "%.3f", $past/$verb;
	$present = sprintf "%.3f", $present/$verb;

	
	my $norm = $length;
	
	$causal = sprintf "%.3f", 1000*$causal/$norm;
	$adj = sprintf "%.3f", 1000*$adj/$norm;
	$adjP = sprintf "%.3f", 1000*$adjP/$norm;
	$adjN = sprintf "%.3f", 1000*$adjN/$norm;
	$kontr = sprintf "%.3f", 1000*$kontr/$norm;
	$quest = sprintf "%.3f", 1000*$quest/$norm;
	$cond = sprintf "%.3f", 1000*$cond/$norm;
	$vocab = sprintf "%.3f", 1000*$vocab/$norm;
	$voc2 = sprintf "%.3f", 1000*$voc2/$norm;
	$keywords = sprintf "%.3f", 1000*$keywords/$norm;
	$argue = sprintf "%.3f", 1000*$argue/$norm;
	$badwords = sprintf "%.3f", 1000*$badwords/$norm;
	$common = sprintf "%.3f", 1000*$common/$norm;
	$vague = sprintf "%.3f", 1000*$vague/$norm;
	$neg = sprintf "%.3f", 1000*$neg/$norm;
	$pro1 = sprintf "%.3f", 1000*$pro1/$norm;
	$you = sprintf "%.3f", 1000*$you/$norm;
	$time = sprintf "%.3f", 1000*$time/$norm;
	$verb = sprintf "%.3f", 1000*$verb/$norm;
	$name = sprintf "%.3f", 1000*$name/$norm;
	$he = sprintf "%.3f", 1000*$he/$norm;
	$ord = sprintf "%.3f", 1000*$ord/$norm;
	my $review_rel = sprintf "%.3f", 1000*$review/$norm;
	
	my $bad = sprintf "%.3f", 4*$badwords+$adjN-$adjP+$common;
	$bad = sprintf( "%.3f", 0 ) if $bad < 0;
	my $arg = 2*$quest+$causal+$cond+$argue+$common+$neg;
	my $lang = $kontr+$common+$vague+$vocab;
	
	$filename = substr($filename,0,10);
	
	if ( $adj > 15 && $adj < 28 && $verb > 17 && $kontr < 5 && $review < 3 && $review_rel < 0.9 && $neg > 0 && $lang > 2 && $time < 0.9 &&
		( $bad > 4 || $arg > 4 ) && # böser comment oder argumentativ
		$vague < 0.8 && $adjP+$adjN > 1.5 && $quest > 0 && ( $keywords + $voc2 ) > 0 && $ord < 1 &&
		( $cond > 0.1 || $quest > 2 ) && # fragen sind ja irgendwie auch conditional.
		$pro1 > 0.2 && $name < 6 && $he > 1 && 
		( $argue < 1 && $quest < 1 || $pro1 > 3.5 && $lang > 4) && # wenn argumentierend, dann aber ich-Form und locker
		( $name > 1 || $you > 5 ) && # entweder direkte ansprache oder personen kommen vor
		1) {

		print "$filename\t$name,\t$you,\t$he,\t$quest,\t$causal,\t$kontr,\t$adj,\t$adjP,\t$adjN,\t$cond,\t$keywords,\t$vocab,\t$voc2,\t$argue,\t$badwords,\t$common,\t$vague,\t$neg,\t$pro1,\t$verb,\t$time,\t$past,\t$present,\t($bad,\t$arg,\t$lang), $length\n";
	} elsif ($base eq "18") {
		print "NO $filename\t$name,\t$you,\t$he,\t$quest,\t$causal,\t$kontr,\t$adj,\t$adjP,\t$adjN,\t$cond,\t$keywords,\t$vocab,\t$voc2,\t$argue,\t$badwords,\t$common,\t$vague,\t$neg,\t$pro1,\t$verb,\t$time\t$past,\t$present,\t($bad,\t$arg,\t$lang), $length\n";
	}
}

close( DH );

# ================  FUN  ================ #

sub initNames {
	my $reg = "(?:";
	open( L1, "../Wortlisten/names.txt" ) || die $!;
	binmode(L1, ":utf8"); 
	while ( <L1> ) {
		chomp;
		$reg .= "$_|";
	}
	close( L1 );
	chop $reg;
	$reg .= ")";
	return $reg;
}

sub initCause {
	my $reg = "(?:";
	open( L1, "../Wortlisten/kausal.txt" ) || die $!;
	binmode(L1, ":utf8"); 
	while ( <L1> ) {
		chomp;
		$reg .= "$_|";
	}
	close( L1 );
	chop $reg;
	$reg .= ")";
	return $reg;
}

sub initAdj {
	my $pos = "(?:";
	my $neg = "(?:";
	my $neut = "(?:";
	open( NG, "../Wortlisten/adjectives_neg.txt" ) || die "can't find neg.txt";
	binmode(NG, ":utf8"); 
	while ( <NG> ) {
		chomp;
		$_ = lc($_);
		$neg .= "$_|";
	}
	close( NG );

	open( PS, "../Wortlisten/adjectives_pos.txt" ) || die "can't find pos.txt";
	binmode(PS, ":utf8"); 
	while ( <PS> ) {
		chomp;
		$_ = lc($_);
		$pos .= "$_|";
	}
	close( PS );

	open( NT, "../Wortlisten/adjectives_neut.txt" ) || die "can't find neut.txt";
	binmode(NT, ":utf8"); 
	while ( <NT> ) {
		chomp;
		$_ = lc($_);
		$neut .= "$_|";
	}
	close( NT );
	chop ($pos,$neg,$neut);
	$pos .= ")";
	$neg .= ")";
	$neut .= ")";
	return ($pos,$neg,$neut);
}
