#!/usr/bin/perl -w

# erkennt anleitungen

# ------------------------  HISTRORY -----------------------
# 07.01.06	new			created from find_literatur.pl, version 07.01.06

$VERSION = '07.01.06';

#------------------------------------------------------------------------------
# Standard pragmas & CPAN modules
#------------------------------------------------------------------------------

use strict;
use HTML::Entities;
use Encode;
use Encode::Byte;
require 5.004;
use utf8;

#use lib ('../helpers');
#use Astro;

binmode(STDOUT, ":utf8"); 

# --------- vars -------------
use constant DEBUG 		=> 0;			# 0: off, 1: on
use constant USE_CLEAN 	=> 1;			# 0: off, 1: on

my $base = $ARGV[0] || "9";
my $dirBase = $ARGV[1] || "train";
my $oneFile = $ARGV[2] || undef;
my $dir = "../Korpus/".$dirBase."/".$base."/";
my $dirT = "../Korpus/tagged_".$dirBase."/".$base."/";

my $filename;
my $num_files = 50;

# exp
my $ordExp = qr#first|second|third|fourth|fifth|sixth|seventh|eighth|ninth|\d*1st|\d*2nd|\d*3rd|\d+th|\d{1,2}\.|11th|12th|13th|II+#oi;
my $pro1Exp = qr#I|my|me|myself|mine#io;
my $causalExp = initCause();
my ($posExp,$negExp,$neutExp) = initAdj();

my $day_e_ord = qr/([1-3]?(1st|2nd|3rd|[04-9]th)|11th|12th|13th)/;
my $month_e = qr/(?:January|February|March|April|May|June|July|August|September|October|November|December|Jan\.?|Feb\.?|Mar\.?|Apr\.?|May|Jun\.?|Jul\.?|Aug\.?|Sep\.?|Oct\.?|Nov\.?|Dec\.?|Sept\.?)/;
my $year = qr/((?:19|20)[0-9]{2}|'\d{2}|[901]\d)/;
my $sep_e = qr/[.\/-]/;
my $day_num = qr/([1-9]|0[1-9]|[12][0-9]|3[01])/;
my $month_num = qr/0[1-9]|10|11|12|[1-9]|Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|Sept/;
my $time = qr#([01][0-9]|2[0-3])[-.:][0-5][0-9]#;
my $day = qr#yesterday|today#i;

my $dateExp = qr/($day, ?$time|$day_e_ord $month_e( $year)?|$month_e $day_e_ord(, $year)?|$month_e $day_num(, $year)?|$day_num$sep_e$month_num$sep_e$year|$month_num$sep_e$day_num$sep_e$year)/;

undef $/;

# open dir
opendir( DH, $dir ) || die "$dir not found!";
print $dir, "\n", "-" x 50, "\n";

# ------ process files -------
DIR: while ($filename = readdir( DH )) {
	next unless !defined($oneFile) || $filename eq $oneFile;
	next if $filename =~ /^\./;
	last if $num_files-- == 0;
	print STDERR ".";
	print $filename.": " if DEBUG;
	
	# ------- calculate features  ----------
	my $bibs = 0;
	
	# read content
	open( FH, $dir.$filename ) || die "$filename does not exist";
	binmode(FH, ":utf8"); 
	my $text = <FH>;
	close( FH );
	
	# decode
	if (!USE_CLEAN) {
		my $charset = guess_charset($dir.$filename, "e");
		print STDERR "(".$charset.") "			if DEBUG;
		$text = decode_utf8( $text ) 			if $charset eq "UTF8";
		$text = decode( 'ascii', $text )		if $charset eq "ASCII";
		$text = decode( 'iso-8859-1', $text )	if $charset eq "ISO-1";
		$text = decode( 'iso-8859-15', $text )	if $charset eq "ISO-15";
		$text = decode( 'cp1252', $text )		if $charset eq "WINDOWS";
		$text = decode( 'MacRoman', $text )		if $charset eq "MAC";
	}
	
	# remove several tags (usual nonsense + select,input-hidden)
	$text =~ s/\s+/ /gsio;
	$text =~ s/<input[^>]*(hidden|submit|reset|radio|checkbox)[^>]*>//gsio;
	$text =~ s/<(\w+)[ :][^>]*>/<$1>/gio;
	$text =~s#(?:<(head|script|style|noframes|select)[^>]*>.*?<\/\1>)|(?:</?(?:font|b|i|u|em|div|center|span|strong|o|yle|st1|![^>]+)(?:[ :=][^>]+)?/?>)|(?:<!--.*?-->)##gsio;
	$text =~ s/&nbsp;/ /gsoi;
	$text =~ s/\s+/ /gsio;
	$text = decode_entities( $text );
	$text =~ s/[`´‘’˚']/'/gio;
	$text =~ s/[“”]/"/gio;
	
	
	# get Features
	my $length = 0;
	my $you = 0;
	my $ing = 0;
	my $ord = 0;
	my $pro1 = 0;
	my $form = 0;
	my $verb = 0;
	my $past = 0;
	my $conj = 0;
	my $present = 0;
	my $causal = 0;
	my $time = 0;
	my $adj = 0;
	my $adjP = 0;
	my $adjN = 0;
	my $kontr = 0;	# kontraktion 'll etc.
	my $quest = 0;
	my $sent = 0; # anzahl . am wortende
	my $cond = 0; # konditional: would,should, could
	my $digit = 0;
	my $review = 0;
	
	# ordinalzahlen, form-elemente, links
	my $noLinks = $text;
	$noLinks =~ s#<a>.*?</a>##io;
	$form = ($text =~ s/<input[^>]*>//gio) || 0;
	next DIR if $form > 6;
	
	$text =~ s/<[^>]+>//go;
	$length = length( $text );
	next DIR if $length > 20000 || $length < 300;
	
	# ?
	{
	my $k = 0;
	while ($k >= 0) {
		$k = index($text,'?',$k+1);
		$quest++ if $k > 0;
	}
	}
	
	# date
	while ( $noLinks =~ /\s$dateExp\s/go ) {
		$time++;
	}
	next DIR if 1000*$time/$length > 0.2;
	
	# words
	my $badwords = 0;
	my $argue = 0;
	my $vocab = 0;
	my $vague = 0;
	my $common = 0;
	my $neg = 0;
	
	while ( $text =~ /([a-z0-9'.%\$]+)/iog ) {
		my $m = $1;
		$causal++ if $m =~ /^$causalExp$/io;
		$adj++	if $m =~ /^$neutExp$/io;
		$adjN++ if $m =~ /^($negExp)$/io;
		$adjP++ if $m =~ /^($posExp)$/io;
		$kontr++ if $m =~ /n't|'ll|'d|'re|'ve/o;
		$cond++ if $m =~ /^(would|should|could|will)$/io;
		$argue++ if $m =~ /^But$/o;
		$argue++ if $m =~ /^(finally|neveretheless|admit|actually|yes|despite|indeed|unless|either|question|aside|way|theory|furthermore)$/io;
		$vocab++ if $m =~ /^And$/o;
		$vocab++ if $m =~ /^(well|really|folks|great|only|this|dreamed|fear|feared|enough|guess)$|^".*?"$|^'.*?'$/io;
		$badwords++ if $m =~ /^(fool|fooled|stupid|stupidity|crazy|crazyness|foolish|laughable|faulty|bloated|coward|liar|blame|sick)$/io;
		$common++ if $m =~ /^(they|their|always|never|every|worst|worse|good|best|better|bad)$/io;
		$vague++ if $m =~ /^(something|somehow|however|mysterious|perhaps)$/io;
		$neg++ if $m =~ /^(no|nor|not)$|n't$/io;
		$pro1++ if $m =~ /^($pro1Exp)$/io;
		$ing++ if $m =~ /ing$/o;
		$review++ if $m =~ /^(book|film|cd|disc)$/io; # gegen 20
	}
	$adj += $adjP + $adjN;
	
	# pos + zeiten
	my $ft = $dirT."/".$filename;
	$verb = `cat $ft | grep -c '\\WV'`;
	next DIR if $verb == 0;
	$past = `cat $ft | grep -c 'V[VBH][ND]\$'`;
	$present = `cat $ft | grep -c 'VV[PZ]\$'`; # ohne have und be, wegen "has said" etc.

	$past = sprintf "%.3f", $past/$verb;
	$ing = sprintf "%.3f", $ing/$verb;
	$present = sprintf "%.3f", $present/$verb;

	
	my $norm = $length;
	
	$causal = sprintf "%.3f", 1000*$causal/$norm;
	$adj = sprintf "%.3f", 1000*$adj/$norm;
	$adjP = sprintf "%.3f", 1000*$adjP/$norm;
	$adjN = sprintf "%.3f", 1000*$adjN/$norm;
	$kontr = sprintf "%.3f", 1000*$kontr/$norm;
	$quest = sprintf "%.3f", 1000*$quest/$norm;
	$cond = sprintf "%.3f", 1000*$cond/$norm;
	$vocab = sprintf "%.3f", 1000*$vocab/$norm;
	$argue = sprintf "%.3f", 1000*$argue/$norm;
	$badwords = sprintf "%.3f", 1000*$badwords/$norm;
	$common = sprintf "%.3f", 1000*$common/$norm;
	$vague = sprintf "%.3f", 1000*$vague/$norm;
	$neg = sprintf "%.3f", 1000*$neg/$norm;
	$pro1 = sprintf "%.3f", 1000*$pro1/$norm;
	$time = sprintf "%.3f", 1000*$time/$norm;
	$verb = sprintf "%.3f", 1000*$verb/$norm;
	
	my $bad = 4*$badwords+$adjN-$adjP+$common;
	$bad = 0 if $bad < 0;
	my $arg = 2*$quest+$causal+$cond+$argue+$common+$neg;
	my $lang = $kontr+$common+$vague+$vocab;
	
	if ( $causal > 0.4 && $adj > 1 && $kontr < 3.8 && $lang > 1.5 && $time < 0.1 && $review < 2 && $verb > 11 && $past >= $present &&
		( $bad > 4 || $arg > 6 ) && # böser comment oder argumentativ
		( $arg > 15 || $adjP < 2.4 ) && # mit so großem arg: weniger streng mit adjP
		( $adjN > 0 || $lang < 3 ) && # nüchterne Sprache -> kein geschimpfe
		( $adjP < 1 || $adjN > 0.5 ) && # wenn viele positive adj., dann auch negative (argumentierend)
		( $length < 7000 || $lang > 3.4 ) && # lange texte sind eher features, deswegen strenger
		( $pro1 < 2 || $bad > 7.5 ) ) { # böse kommentare sind persönlicher (1 ausnahme in train)
		
		print "$filename $past,\t$present,\t$time,\t$length,\t$adjP,\t$adjN,\t$cond,\t$vocab,\t$argue,\t$badwords,\t$common,\t$vague,\t$neg,\t$pro1,\t$review,\tYES ($bad,\t$arg,\t$lang)\n";
	} elsif ($base eq "9") {
		print "$filename $past,\t$present,\t$time,\t$length,\t$adjP,\t$adjN,\t$cond,\t$vocab,\t$argue,\t$badwords,\t$common,\t$vague,\t$neg,\t$pro1,\t$review,\tNOPE ($bad,\t$arg,\t$lang)\n";
	}
}

close( DH );

# ================  FUN  ================ #

sub initCause {
	my $reg = "(?:";
	open( L1, "../Wortlisten/kausal.txt" ) || die $!;
	binmode(L1, ":utf8"); 
	while ( <L1> ) {
		chomp;
		$reg .= "$_|";
	}
	close( L1 );
	chop $reg;
	$reg .= ")";
	return $reg;
}

sub initAdj {
	my $pos = "(?:";
	my $neg = "(?:";
	my $neut = "(?:";
	open( NG, "../Wortlisten/adjectives_neg.txt" ) || die "can't find neg.txt";
	binmode(NG, ":utf8"); 
	while ( <NG> ) {
		chomp;
		$_ = lc($_);
		$neg .= "$_|";
	}
	close( NG );

	open( PS, "../Wortlisten/e_adjectives_pos.txt" ) || die "can't find pos.txt";
	binmode(PS, ":utf8"); 
	while ( <PS> ) {
		chomp;
		$_ = lc($_);
		$pos .= "$_|";
	}
	close( PS );

	open( NT, "../Wortlisten/e_adjectives_neut.txt" ) || die "can't find neut.txt";
	binmode(NT, ":utf8"); 
	while ( <NT> ) {
		chomp;
		$_ = lc($_);
		$neut .= "$_|";
	}
	close( NT );
	chop ($pos,$neg,$neut);
	$pos .= ")";
	$neg .= ")";
	$neut .= ")";
	return ($pos,$neg,$neut);
}
