#!/usr/bin/perl -w

# erkennt erklärung

# ------------------------  HISTRORY -----------------------
# 15.01.06	new			created from find_comment.pl, version 14.01.06

$VERSION = '15.01.06';

#------------------------------------------------------------------------------
# Standard pragmas & CPAN modules
#------------------------------------------------------------------------------

use strict;
use HTML::Entities;
use Encode;
use Encode::Byte;
require 5.004;
use utf8;

#use lib ('../helpers');
#use Astro;

binmode(STDOUT, ":utf8"); 

# --------- vars -------------
use constant DEBUG 		=> 0;			# 0: off, 1: on
use constant USE_CLEAN 	=> 1;			# 0: off, 1: on

# regex
my $ordExp = qr#second|third|fourth|fifth|sixth|seventh|eighth|ninth|\d*1st|\d*2nd|\d*3rd|\d+th|\d{1,2}\.#oi;
my $measureExp = qr#tablespoons?|cups?|teaspoons?|oz|tbsp|tsp#io; ## incomplete
my $nameExp = initNames();
my $adjExp = initAdj();


my $day_e_ord = qr/([2-3]?(1st|2nd|3rd|[04-9]th)|1\dth)/;
my $month_e = qr/(?:January|February|March|April|May|June|July|August|September|October|November|December|Jan\.?|Feb\.?|Mar\.?|Apr\.?|May|Jun\.?|Jul\.?|Aug\.?|Sep\.?|Oct\.?|Nov\.?|Dec\.?|Sept\.?)/;
my $year = qr/((?:19|20)[0-9]{2}|'\d{2}|[901]\d)/;
my $sep_e = qr/[.\/-]/;
my $day_num = qr/([1-9]|0[1-9]|[12][0-9]|3[01])/;
my $month_num = qr/0[1-9]|10|11|12|[1-9]|Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|Sept/;
my $timeExp = qr#([01][0-9]|2[0-3])([.:-][0-5][0-9]){1,2}#;
my $day = qr#yesterday|today#i;

my $dateExp = qr/($timeExp|$day_e_ord $month_e|$month_e $day_e_ord|$month_e $day_num|$day_num$sep_e$month_num$sep_e$year|$month_num$sep_e$day_num$sep_e$year)/;


my $base = $ARGV[0] || "30";
my $dirBase = $ARGV[1] || "train";
my $oneFile = $ARGV[2] || undef;
my $dir = "../Korpus/".$dirBase."/".$base."/";
my $dirT = "../Korpus/tagged_".$dirBase."/".$base."/";

my $filename;
my $num_files = 50;

undef $/;

# open dir
opendir( DH, $dir ) || die "$dir not found!";
print $dir, "\n", "-" x 50, "\n";

# ------ process files -------
DIR: while ($filename = readdir( DH )) {
	next unless !defined($oneFile) || $filename eq $oneFile;
	next if $filename =~ /^\./;
	last if $num_files-- == 0;
	print STDERR ".";
	print $filename.": " if DEBUG;
	
	# ------- calculate features  ----------
	my $bibs = 0;
	
	# read content
	open( FH, $dir.$filename ) || die "$filename does not exist";
	binmode(FH, ":utf8"); 
	my $text = <FH>;
	close( FH );
		
	# decode
	if (!USE_CLEAN) {
		my $charset = guess_charset($dir.$filename, "e");
		print STDERR "(".$charset.") "			if DEBUG;
		$text = decode_utf8( $text ) 			if $charset eq "UTF8";
		$text = decode( 'ascii', $text )		if $charset eq "ASCII";
		$text = decode( 'iso-8859-1', $text )	if $charset eq "ISO-1";
		$text = decode( 'iso-8859-15', $text )	if $charset eq "ISO-15";
		$text = decode( 'cp1252', $text )		if $charset eq "WINDOWS";
		$text = decode( 'MacRoman', $text )		if $charset eq "MAC";
	}
	
	
	# remove several tags (usual nonsense + select,input-hidden)
	$text =~ s/\s+/ /gsio;
	$text =~ s/<input[^>]*(hidden|submit|reset|radio|checkbox)[^>]*>//gsio;
	$text =~ s/<(\w+)[ :][^>]*>/<$1>/gio;
	$text =~s#(?:<(head|script|style|noframes|select)[^>]*>.*?<\/\1>)|(?:</?(?:i|u|em|div|center|span|strong|o|yle|st1|![^>]+)(?:[ :=][^>]+)?/?>)|(?:<!--.*?-->)##gsio;
	$text =~ s/&nbsp;/ /gsoi;
	$text =~ s/\s+/ /gsio;
	$text = decode_entities( $text );
	$text =~ s/[`´‘’˚']/'/gio;
	$text =~ s/[“”]/"/gio;
	
	# Features
	my $length = 0;
	my $name = 0;
	my $quest = 0;
	my $kontr = 0;
	my $it = 0;
	my $we = 0;
	my $he = 0;
	my $time = 0;
	my $time_abs = 0;
	my $timeDeixis = 0;
	my $timeLink = 0;
	my $detDef = 0;
	my $detIndef = 0;
	my $num = 0;
	my $verb = 0;
	my $konj = 0;
	my $vague = 0;
	my $adjPN = 0;
	my $measure = 0;
	my $ord = 0;
	my $bigrams = 0;
	my $past = 0;
	my $thePast = 0; # century, decade, jahre vor 1950
	my $present = 0;
	my $headlines = 0;
	my $form = 0;

	my $kontr2 = 0;

	# headline & form: exclude this (38,1)
	$form = ($text =~ s/<input[^>]*>//gio);
	while ( $text =~ m#(<h\d[^>]*>.*?</h\d>|<b(?: [^>]*)?>.*?</b>|<center(?: [^>]*)?>.*?</center>|<font(?: [^>]*)?>.*?</font>)#gio ) {
		my $headline = $1 if defined $1;
		$headline =~ s#<[^>]+>##go;
		if ( $headline =~ /^ ?(Results|Appendix|Summary|Evaluating|Evaluation|Bibliography|Biography|Footnotes|Synthesis|Description|Suggested Reading|Ressources|Previous Studies|Abstract|Synopsis|Acknowledgments|References|Introduction|Discussion|Conclusion) ?$/i) {
			$headlines++;
		}
	}
	if ($headlines > 1) {
		print "$filename: Headlines ($headlines)\n" if $base == 30;
		next DIR;
	}
	
	# replace tags, length
	my $noLinks = $text;
	$noLinks =~ s#<a>.*?</a>##gio;
	$text =~ s/<[^>]+>//go;
	$length = length( $text );
	if ($length < 1300) {
		print "$filename: Length ($length)\n" if $base == 30;
		next DIR;
	}
	
	# Frage
	while ( $text =~ /\? [A-Z]/go ) {
		$quest++;
	}
	if (1000*$quest/$length > 1) {
		printf ("$filename Quest (%.3f)\n", 1000*$quest/$length)  if $base == 30;
		next DIR;
	}
	
	# Time
	while ( $text =~ /\s($timeExp)\s/go ) {
		$timeLink++;
	}
	while ( $noLinks =~ /\s($dateExp)\s/go ) {
		$time++;
	}
	$time_abs = $time;
	if (1000*$time/$length > 0.3 && $time > 2) {
		printf ("$filename Time ($time, %.3f)\n", 1000*$time/$length)  if $base == 30;
		next DIR ;
	}
	
	# Text
	while ( $text =~ /([a-z0-9]+('[a-z]+)?)/iog ) {
		my $m = $1;
		$timeDeixis++ if $m =~ /^(yesterday|today|tomorrow|month|week)$/io;
		$we++ if $m =~ /^(we|us|our|ourselves)$/io;
		$num++ if $m =~ /^\d+$/io;
		$konj++ if $m =~ /^(and|or)$/o;
		$thePast++ if $m =~ /^(century|decade|1[0-8]\d\d|19[0-7]\d)$/io; # century oder jahreszahl vor 1980
		$ord++ if $m =~ /^($ordExp)$/gio;
		$measure++ if $m =~ /^($measureExp)$/gio;
		$it++	if $m =~ /^(it|its|itself)$/io;
		$he++ if $m =~ /^(he|she|his|her|himself|herself)$/io;
		$name++ if $m =~ /^($nameExp)$/o;
		$adjPN++ if $m =~ /^($adjExp)$/io;
		$kontr++ if $m =~ /n't|'ll|'d|'re|'ve/o;
		$kontr2++ if $m =~ /'ll|'d|'re|'ve/o;
		$detDef++	if $m =~ /^(the|this)$/io;
		$detIndef++ if $m =~ /^(a|an)$/io;
		$vague++ if $m =~ /^(something|somehow|however|perhaps)$/io;
#		next DIR if 1000*$name/$length > 2.5 || 1000*$kontr/$length > 1 || 1000*$kontr2/$length > 0.1 || 1000*$it/$length > 2.6;
#		next DIR if 1000*$we/$length > 3.2 || 1000*$num/$length > 6 || 1000*$measure/$length > 0.2;
	}


	# POS, Zeiten
	my $ft = $dirT."/".$filename;
	$verb = `cat $ft | grep -c '\\WV'`;
	if (1000*$verb/$length < 10) {
		printf ("$filename Verb (%.3f)\n", 1000*$verb/$length) if $base == 30;
		next DIR;
	}
	$past = `cat $ft | grep -c 'V[VBH][ND]\$'`;
	$present = `cat $ft | grep -c 'VV[PZ]\$'`; # ohne have und be, wegen "has said" etc.

	# Bigramme
	while ( $text =~ /\b(our results|our notion|our estimate|our analysis|our application|our conclusion|our data|our definition|our evaluation|our finding|our heuristic|our measure|our method|our methodology|our model|our research|our sample|we chose|we assume|we expect|we compare|we conclude|we propose|we suggest|we note|we estimate|we omit|we discuss|we test|we studied|we examine|we analyze|we calculate|we compute|we define|we encountered|we exclude|we focus|we include)\b/gio ) {
		$bigrams++;
	}
	
	# Normalize
	$past = sprintf "%.3f", $past/$verb;
	$present = sprintf "%.3f", $present/$verb;
	
	$name = sprintf "%.3f", 1000*$name/$length;
	$quest = sprintf "%.3f", 1000*$quest/$length;
	$kontr = sprintf "%.3f", 1000*$kontr/$length;
	$it = sprintf "%.3f", 1000*$it/$length;
	$we = sprintf "%.3f", 1000*$we/$length;
	$he = sprintf "%.3f", 1000*$he/$length;
	$time = sprintf "%.3f", 1000*$time/$length;
	$timeDeixis = sprintf "%.3f", 1000*$timeDeixis/$length;
	$detDef = sprintf "%.3f", 1000*$detDef/$length;
	$detIndef = sprintf "%.3f", 1000*$detIndef/$length;
	$num = sprintf "%.3f", 1000*$num/$length;
	$verb = sprintf "%.3f", 1000*$verb/$length;
	$konj = sprintf "%.3f", 1000*$konj/$length;
	$vague = sprintf "%.3f", 1000*$vague/$length;
	$adjPN = sprintf "%.3f", 1000*$adjPN/$length;
	$measure = sprintf "%.3f", 1000*$measure/$length;
	$ord = sprintf "%.3f", 1000*$ord/$length;
	$bigrams = sprintf "%.3f", 1000*$bigrams/$length;
	$thePast = sprintf "%.3f", 1000*$thePast/$length;
	
	$kontr2 = sprintf "%.3f", 1000*$kontr2/$length;
	
	# if & print
	$filename = substr($filename,0,10) if length($filename) > 10;
	
	if ( $name < 2.5 && ($name+$he < 5 && $past > 0.3 && $thePast > 0.5 || $name+$he < 1.3) && # geschichtliche darstellung: mehr namen
		$kontr < 0.9 && $kontr2 < 0.1 && $it < 2.6 && $konj > 3 && $adjPN < 2.6 && $we < 3.1 && $vague < 0.4 && 
		($verb > 10 && ($kontr < 0.01 && $adjPN < 0.05 || $length < 9000 && $konj < 5) || $verb > 15) && # wenige verben, wenn offiziell (kein kontr, adjPN < 0.05) oder kurz (length < 8000, konj < 5)
		( $time < 0.2 || $time_abs < 2 ) && $timeDeixis < 0.4 && $timeLink < 3 &&
		( $detDef > 10 || $name < 1.3) && # weka (vielleicht: wenige "the <person>", dann mehr namen erlaubt und vv.) 
		$num > 0.9 && $num < 6 && $measure < 0.2 && $detDef > 1.5*$detIndef && $bigrams < 0.01 &&
		($thePast > 1 || $ord < 0.5) && # (weil: 18th century etc. zählt nicht als ord)
		($past < 0.4 || ($present > 0.07 || $thePast > 1) ) && $present < 0.25 && $quest < 0.45 &&
	1) {
		print "$filename\t$name,\t$quest,\t$kontr,\t$kontr2,\t$it,\t$we,\t$he,\t$time,\t$time_abs,\t$timeDeixis,\t$timeLink,\t$detDef,\t$detIndef,\t$num,\t$verb,\t$konj,\t$vague,\t$adjPN,\t$measure,\t$ord,\t$bigrams,\t$past,\t$thePast,\t$present,\t$length\n";
	} elsif ($base == 30) {
		print "NO $filename\t$name,\t$quest,\t$kontr,\t$kontr2,\t$it,\t$we,\t$he,\t$time,\t$time_abs,\t$timeDeixis,\t$timeLink,\t$detDef,\t$detIndef,\t$num,\t$verb,\t$konj,\t$vague,\t$adjPN,\t$measure,\t$ord,\t$bigrams,\t$past,\t$thePast,\t$present,\t$length\n";
	}

}

close( DH );


# ================  FUN  ================ #

sub initNames {
	my $reg = "(?:";
	open( L1, "../Wortlisten/names.txt" ) || die $!;
	binmode(L1, ":utf8");
	while ( <L1> ) {
		chomp;
		$reg .= "$_|";
	}
	close( L1 );
	chop $reg;
	$reg .= ")";
	return $reg;
}

sub initCause {
	my $reg = "(?:";
	open( L1, "../Wortlisten/kausal.txt" ) || die $!;
	binmode(L1, ":utf8"); 
	while ( <L1> ) {
		chomp;
		$reg .= "$_|";
	}
	close( L1 );
	chop $reg;
	$reg .= ")";
	return $reg;
}

sub initAdj {
	my $adj = "(?:";

	open( NG, "../Wortlisten/adjectives_neg.txt" ) || die "can't find neg.txt";
	binmode(NG, ":utf8"); 
	while ( <NG> ) {
		chomp;
		$_ = lc($_);
		$adj .= "$_|";
	}
	close( NG );

	open( PS, "../Wortlisten/e_adjectives_pos.txt" ) || die "can't find pos.txt";
	binmode(PS, ":utf8"); 
	while ( <PS> ) {
		chomp;
		$_ = lc($_);
		$adj .= "$_|";
	}
	close( PS );

	chop ($adj);
	$adj .= ")";

	return $adj;
}

