#!/usr/bin/perl -w

# erkennt rezensionen

# ------------------------  HISTRORY -----------------------
# 15.01.06	new			created from find_comment.pl, version 14.01.06
# 22.01.06	debug

$VERSION = '22.01.06';

#------------------------------------------------------------------------------
# Standard pragmas & CPAN modules
#------------------------------------------------------------------------------

use strict;
use HTML::Entities;
use Encode;
use Encode::Byte;
require 5.004;
use utf8;

#use lib ('../helpers');
#use Astro;

binmode(STDOUT, ":utf8"); 

# --------- vars -------------
use constant DEBUG 		=> 0;			# 0: off, 1: on
use constant USE_CLEAN 	=> 1;			# 0: off, 1: on

my $base = $ARGV[0] || "20";
my $dirBase = $ARGV[1] || "train";
my $oneFile = $ARGV[2] || undef;
my $dir = "../Korpus/".$dirBase."/".$base."/";
my $dirT = "../Korpus/tagged_".$dirBase."/".$base."/";

my $filename;
my $num_files = 50;

# exp
my $ordExp = qr#first|second|third|fourth|fifth|sixth|seventh|eighth|ninth|\d*1st|\d*2nd|\d*3rd|\d+th|\d{1,2}\.|11th|12th|13th|II+#oi;
my $pro1Exp = qr#I|my|me|myself|mine#io;
# my $nameExp = initNames();
my $causalExp = initCause();
my ($posExp,$negExp,$neutExp) = initAdj();

my $day_e_ord = qr/([2-3]?(1st|2nd|3rd|[04-9]th)|1\dth)/;
my $month_e = qr/(?:January|February|March|April|May|June|July|August|September|October|November|December|Jan\.?|Feb\.?|Mar\.?|Apr\.?|May|Jun\.?|Jul\.?|Aug\.?|Sep\.?|Oct\.?|Nov\.?|Dec\.?|Sept\.?)/;
my $year = qr/((?:19|20)[0-9]{2}|'\d{2}|[901]\d)/;
my $sep_e = qr/[.\/-]/;
my $day_num = qr/([1-9]|0[1-9]|[12][0-9]|3[01])/;
my $month_num = qr/0[1-9]|10|11|12|[1-9]|Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|Sept/;
my $time = qr#([01][0-9]|2[0-3])[.:-][0-5][0-9]#;
my $day = qr#yesterday|today#i;
my $weekday = qr#(monday|tuesday|wednsday|thursday|friday|saturday|sunday)#i;

my $dateExp = qr/($time|$day_e_ord $month_e|$month_e $day_e_ord|$month_e $day_num|$day_num$sep_e$month_num$sep_e$year|$month_num$sep_e$day_num$sep_e$year)/;

undef $/;

# open dir
opendir( DH, $dir ) || die "$dir not found!";
print $dir, "\n", "-" x 50, "\n";

# ------ process files -------
DIR: while ($filename = readdir( DH )) {
	next unless !defined($oneFile) || $filename eq $oneFile;
	next if $filename =~ /^\./;
	last if $num_files-- == 0;
	print STDERR ".";
	print $filename.": " if DEBUG;
	
	# ------- calculate features  ----------
	my $bibs = 0;
	
	# read content
	open( FH, $dir.$filename ) || die "$filename does not exist";
	binmode(FH, ":utf8"); 
	my $text = <FH>;
	close( FH );
	
	# decode
	if (!USE_CLEAN) {
		my $charset = guess_charset($dir.$filename, "e");
		print STDERR "(".$charset.") "			if DEBUG;
		$text = decode_utf8( $text ) 			if $charset eq "UTF8";
		$text = decode( 'ascii', $text )		if $charset eq "ASCII";
		$text = decode( 'iso-8859-1', $text )	if $charset eq "ISO-1";
		$text = decode( 'iso-8859-15', $text )	if $charset eq "ISO-15";
		$text = decode( 'cp1252', $text )		if $charset eq "WINDOWS";
		$text = decode( 'MacRoman', $text )		if $charset eq "MAC";
	}
	
	# remove several tags (usual nonsense + select,input-hidden)
	$text =~ s/\s+/ /gsio;
	$text =~ s/<input[^>]*(hidden|submit|reset|radio|checkbox)[^>]*>//gsio;
	$text =~ s/<(\w+)[ :][^>]*>/<$1>/gio;
	$text =~s#(?:<(head|script|style|noframes|select)[^>]*>.*?<\/\1>)|(?:</?(?:font|b|i|u|em|div|center|span|strong|o|yle|st1|![^>]+)(?:[ :=][^>]+)?/?>)|(?:<!--.*?-->)##gsio;
	$text =~ s/&nbsp;/ /gsoi;
	$text =~ s/\s+/ /gsio;
	$text = decode_entities( $text );
	$text =~ s/[`´‘’˚']/'/gio;
	$text =~ s/[“”]/"/gio;
	
	
	# get Features
	my $length = 0;
	my $you = 0;
	my $ing = 0;
	my $ord = 0;
	my $name = 0;
	my $pro1 = 0;
	my $form = 0;
	my $verb = 0;
	my $past = 0;
	my $conj = 0;
	my $present = 0;
	my $causal = 0;
	my $time = 0;
	my $adj = 0;
	my $adjP = 0;
	my $adjN = 0;
	my $kontr = 0;	# kontraktion 'll etc.
	my $quest = 0;
	my $sent = 0; # anzahl . am wortende
	my $cond = 0; # konditional: would,should, could
	my $digit = 0;
	my $review = 0;
	
	# ordinalzahlen, form-elemente, links
	my $noLinks = $text;
	$noLinks =~ s#<a>.*?</a>##gio;
	$text =~ s/<[^>]+>//go;
	$length = length( $text );
	next DIR if $length > 80000 || $length < 1300;
	
	# fragezeichzen _am satzende_ (kann auch in buchtitel vorkommen)
	while ( $text =~ /\? [A-Z]/go ) {
		$quest++;
	}
	
	# date
	while ( $noLinks =~ /\s$dateExp\s/go ) {
		$time++;
	}
	next DIR if 1000*$time/$length > 0.5;
	
	# words
	my $badwords = 0;
	my $argue = 0;
	my $vocab = 0;
	my $vague = 0;
	my $common = 0;
	my $neg = 0;
	my $detDef = 0;
	my $detIndef = 0;
	my $timeDeixis = 0;
	my $it = 0;
	my $he = 0;
	
	while ( $text =~ /([a-z0-9]+('[a-z]+)?)/iog ) {
		my $m = $1;
		$adj++	if $m =~ /^$neutExp$/io;
		$adjN++ if $m =~ /^($negExp)$/io;
		$adjP++ if $m =~ /^($posExp)$/io;
		$kontr++ if $m =~ /n't|'ll|'d|'re|'ve/o;
		$badwords++ if $m =~ /^(fool|fooled|stupid|stupidity|crazy|crazyness|foolish|laughable|faulty|bloated|coward|liar|blame|sick)$/io;
		$detDef++	if $m =~ /^(the|this)$/io;
		$it++	if $m =~ /^(it|its|itself)$/io;
		$he++ if $m =~ /^(he|she|his|her|himself|herself)$/io;
		$detIndef++ if $m =~ /^(a|an)$/io;
		$vague++ if $m =~ /^(something|somehow|however|perhaps)$/io;
		$neg++ if $m =~ /^(no|nor|not)$|n't$/io;
		$pro1++ if $m =~ /^($pro1Exp)$/io;
		$timeDeixis++ if $m =~ /^(yesterday|today|tomorrow|month|week)$/io;
		$ing++ if $m =~ /ing$/o;
		$review++ if $m =~ /^(book|film|movie|cd|album|anthology|designer|band|author|musician|restaurant|food|wine)$/io; # was wird rezensiert? + verfasser
		next DIR if 1000*$badwords/$length > 0.1 || 1000*$timeDeixis/$length > 0.9;
	}
	$adj += $adjP + $adjN;
	next DIR if 1000*$adj/$length < 16.5 || 1000*($adjP+$adjN)/$length < 1;
	
	# pos + zeiten
	my $ft = $dirT."/".$filename;
	$verb = `cat $ft | grep -c '\\WV'`;
	next DIR if $verb == 0;
	$past = `cat $ft | grep -c 'V[VBH][ND]\$'`;
	$present = `cat $ft | grep -c 'VV[PZ]\$'`; # ohne have und be, wegen "has said" etc.
	my $present3 = `cat $ft | grep -c 'VV[Z]\$'`; # ohne have und be, wegen "has said" etc.

	$past = sprintf "%.3f", $past/$verb;
	$ing = sprintf "%.3f", $ing/$verb;
	$present = sprintf "%.3f", $present/$verb;
	$present3 = sprintf "%.3f", $present3/$verb;

	
	my $norm = $length;
	
	#$adj = sprintf "%.3f", 1000*$adj/$norm;
	print " ---  $adj\n";
	$adjP = sprintf "%.3f", 1000*$adjP/$norm;
	$adjN = sprintf "%.3f", 1000*$adjN/$norm;
	$kontr = sprintf "%.3f", 1000*$kontr/$norm;
	$quest = sprintf "%.3f", 1000*$quest/$norm;
	$badwords = sprintf "%.3f", 1000*$badwords/$norm;
	$vague = sprintf "%.3f", 1000*$vague/$norm;
	$neg = sprintf "%.3f", 1000*$neg/$norm;
	$pro1 = sprintf "%.3f", 1000*$pro1/$norm;
	$time = sprintf "%.3f", 1000*$time/$norm;
	$timeDeixis = sprintf "%.3f", 1000*$timeDeixis/$norm;
	$verb = sprintf "%.3f", 1000*$verb/$norm;
	$detDef = sprintf "%.3f", 1000*$detDef/$norm;
	$detIndef = sprintf "%.3f", 1000*$detIndef/$norm;
	$it = sprintf "%.3f", 1000*$it/$norm;
	$he = sprintf "%.3f", 1000*$he/$norm;
	my $review_rel = sprintf "%.3f", 1000*$review/$norm;
	
	my $adjPN = sprintf( "%.3f", $adjP > $adjN ? $adjN/$adjP : $adjP/$adjN);
	
	my $arg = 2*$quest+$causal+$cond+$argue+$common+$neg;
	my $lang = $kontr+$common+$vague+$vocab;
	
if ( $length < 80000 && $length > 1000 && $adj > 18.2 && $adjP+$adjN > 1 && $badwords < 0.1 && $verb > 15 && $it > 0.5 && $review > 0 && $review_rel > 0.15 &&
		$timeDeixis < 0.35 && $time < 0.3 && $detDef > 5 && $present3 > 0.045 && # weil ja "bla" _ist_ toll
		( $pro1 < 1 || $adj > 23 ) && # von weka. unsinn?
		( $vague < 0.5 || $adjPN < 0.1) && # sehr positiv/negativ: vague-words vermutlich in anderem zusammenhang
		( $quest < 0.7 || $quest < 2 && $kontr > 1.5 && $adj > 20) && # mehr fragen erlaubt bei informellen texten: viele adj,kontr
		( $detDef > $detIndef || $quest < 0.1 && $kontr < 0.1 && $vague < 0.1 && $adj < 20) && # formellen texten: weniger detDef erlaubt
		( $past >= $present || $present3 >= 0.5*$present ) &&
		1 ) {
		print "$filename $quest, $it, $he, $kontr, $adj, $adjP, $adjN, $adjPN, $verb, $past, $present, $present3, $detDef, $detIndef, $time, $timeDeixis, $vague, $neg, $pro1, $review_rel, $length, YES\n";
	} elsif ($base eq "20") {
		print "$filename $quest, $it, $he, $kontr, $adj, $adjP, $adjN, $adjPN, $verb, $past, $present, $present3, $detDef, $detIndef, $time, $timeDeixis, $vague, $neg, $pro1, $review_rel, $length, NOPE\n";
	}
}

close( DH );

# ================  FUN  ================ #

sub initNames {
	my $reg = "(?:";
	open( L1, "../Wortlisten/names.txt" ) || die $!;
	binmode(L1, ":utf8"); 
	while ( <L1> ) {
		chomp;
		$reg .= "$_|";
	}
	close( L1 );
	chop $reg;
	$reg .= ")";
	return $reg;
}

sub initCause {
	my $reg = "(?:";
	open( L1, "../Wortlisten/kausal.txt" ) || die $!;
	binmode(L1, ":utf8"); 
	while ( <L1> ) {
		chomp;
		$reg .= "$_|";
	}
	close( L1 );
	chop $reg;
	$reg .= ")";
	return $reg;
}

sub initAdj {
	my $pos = "(?:";
	my $neg = "(?:";
	my $neut = "(?:";
	open( NG, "../Wortlisten/adjectives_neg.txt" ) || die "can't find neg.txt";
	binmode(NG, ":utf8"); 
	while ( <NG> ) {
		chomp;
		$_ = lc($_);
		$neg .= "$_|";
	}
	close( NG );

	open( PS, "../Wortlisten/adjectives_pos.txt" ) || die "can't find pos.txt";
	binmode(PS, ":utf8"); 
	while ( <PS> ) {
		chomp;
		$_ = lc($_);
		$pos .= "$_|";
	}
	close( PS );

	open( NT, "../Wortlisten/adjectives_neut.txt" ) || die "can't find neut.txt";
	binmode(NT, ":utf8"); 
	while ( <NT> ) {
		chomp;
		$_ = lc($_);
		$neut .= "$_|";
	}
	close( NT );
	chop ($pos,$neg,$neut);
	$pos .= ")";
	$neg .= ")";
	$neut .= ")";
	return ($pos,$neg,$neut);
}
