#!/usr/bin/perl -w

# erkennt präsentation

# ------------------------  HISTRORY -----------------------
# 17.01.06	new			created from find_erklaerung.pl, version 17.01.06
# 22.01.06	debug

$VERSION = '22.01.06';

#------------------------------------------------------------------------------
# Standard pragmas & CPAN modules
#------------------------------------------------------------------------------

use strict;
use HTML::Entities;
use Encode;
use Encode::Byte;
require 5.004;
use utf8;

#use lib ('../helpers');
#use Astro;

binmode(STDOUT, ":utf8"); 

# --------- vars -------------
use constant DEBUG 		=> 0;			# 0: off, 1: on
use constant USE_CLEAN 	=> 1;			# 0: off, 1: on

# regex
my $ordExp = qr#second|third|fourth|fifth|sixth|seventh|eighth|ninth|\d*1st|\d*2nd|\d*3rd|\d+th|11th|12th|13th|\d{1,2}\.#oi;
my $measureExp = qr#tablespoons?|cups?|teaspoons?|oz|tbsp|tsp#io; ## incomplete
my $nameExp = initNames();
my $adjExp = initAdj();


my $day_e_ord = qr/([2-3]?(1st|2nd|3rd|[04-9]th)|1\dth)/;
my $month_e = qr/(?:January|February|March|April|May|June|July|August|September|October|November|December|Jan\.?|Feb\.?|Mar\.?|Apr\.?|May|Jun\.?|Jul\.?|Aug\.?|Sep\.?|Oct\.?|Nov\.?|Dec\.?|Sept\.?)/;
my $year = qr/((?:19|20)[0-9]{2}|'\d{2}|[901]\d)/;
my $sep_e = qr/[.\/-]/;
my $day_num = qr/([1-9]|0[1-9]|[12][0-9]|3[01])/;
my $month_num = qr/0[1-9]|10|11|12|[1-9]|Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|Sept/;
my $timeExp = qr#([01][0-9]|2[0-3])([.:-][0-5][0-9]){1,2}#;
my $day = qr#yesterday|today#i;

my $dateExp = qr/($timeExp|$day_e_ord $month_e|$month_e $day_e_ord|$month_e $day_num|$day_num$sep_e$month_num$sep_e$year|$month_num$sep_e$day_num$sep_e$year)/;


my $base = $ARGV[0] || "31";
my $dirBase = $ARGV[1] || "train";
my $oneFile = $ARGV[2] || undef;
my $dir = "../Korpus/".$dirBase."/".$base."/";
my $dirT = "../Korpus/tagged_".$dirBase."/".$base."/";

my $filename;
my $num_files = 50;

undef $/;

# open dir
opendir( DH, $dir ) || die "$dir not found!";
print $dir, "\n", "-" x 50, "\n";

# ------ process files -------
DIR: while ($filename = readdir( DH )) {
	next unless !defined($oneFile) || $filename eq $oneFile;
	next if $filename =~ /^\./;
	last if $num_files-- == 0;
	print STDERR ".";
	print $filename.": " if DEBUG;
	
	# ------- calculate features  ----------
	my $bibs = 0;
	
	# read content
	open( FH, $dir.$filename ) || die "$filename does not exist";
	binmode(FH, ":utf8"); 
	my $text = <FH>;
	close( FH );
	
	# decode
	if (!USE_CLEAN) {
		my $charset = guess_charset($dir.$filename, "e");
		print STDERR "(".$charset.") "			if DEBUG;
		$text = decode_utf8( $text ) 			if $charset eq "UTF8";
		$text = decode( 'ascii', $text )		if $charset eq "ASCII";
		$text = decode( 'iso-8859-1', $text )	if $charset eq "ISO-1";
		$text = decode( 'iso-8859-15', $text )	if $charset eq "ISO-15";
		$text = decode( 'cp1252', $text )		if $charset eq "WINDOWS";
		$text = decode( 'MacRoman', $text )		if $charset eq "MAC";
	}
	
	
	# remove several tags (usual nonsense + select,input-hidden)
	$text =~ s/\s+/ /gsio;
	$text =~ s/<input[^>]*(hidden|submit|reset|radio|checkbox)[^>]*>//gsio;
	$text =~ s/<(?!img)(\w+)[ :][^>]*>/<$1>/gio;
	$text =~s#(?:<(head|script|style|noframes|select)[^>]*>.*?<\/\1>)|(?:</?(?:i|u|em|div|center|span|strong|o|yle|st1|![^>]+)(?:[ :=][^>]+)?/?>)|(?:<!--.*?-->)##gsio;
	$text =~ s/&nbsp;/ /gsoi;
	$text =~ s/\s+/ /gsio;
	$text = decode_entities( $text );
	$text =~ s/[`´‘’˚']/'/gio;
	$text =~ s/[“”]/"/gio;

	
	# Features
	my $length = 0;
	my $name = 0;
	my $quest = 0;
	my $we = 0;
	my $i = 0;
	my $he = 0;
	my $time = 0;
	my $time_abs = 0;
	my $timeDeixis = 0;
	my $timeLink = 0;
	my $detDef = 0;
	my $detIndef = 0;
	my $verb = 0;
	my $konj = 0;
	my $vague = 0;
	my $adjPN = 0;
	my $measure = 0;
	my $ord = 0;
	my $bigrams = 0;
	my $past = 0;
	my $thePast = 0; # century, decade, jahre vor 1950
	my $present = 0;
	my $present3 = 0;
	my $headlines = 0;
	my $form = 0;
	my $keywords = 0;
	my $they = 0;
	
	my $hasBeen = 0;
	my $arePP = 0;

	# headline & form: exclude this (38,1)
	my $noLinks = $text;
	$noLinks =~ s#<a>(?!</a>).*?</a>##gio;
	
	$form = ($text =~ s/<input[^>]*>//gio) || 0;
	while ( $noLinks =~ m#(<h\d[^>]*>.*?</h\d>|<b(?: [^>]*)?>.*?</b>|<center(?: [^>]*)?>.*?</center>|<font(?: [^>]*)?>.*?</font>)#gio ) {
		my $headline = $1 if defined $1;
		$headline =~ s#<[^>]+>##go;
		if ( $headline =~ /^ ?(Results|Appendix|Summary|Evaluating|Evaluation|Bibliography|Biography|Footnotes|Synthesis|Description|Suggested Reading|Ressources|Previous Studies|Abstract|Synopsis|Acknowledgments|References|Introduction|Discussion|Conclusion) ?$/i) {
			$headlines++;
		}
	}
#	next DIR if $form > 5 || $headlines > 1;
	
	# keywords-image
	while ($noLinks =~ m#<img ([^>]+)>#gio ) {
		my $m = $1;
		$keywords++ if $m =~ /welcome|about|information|philosophy|mission|goals?|innovations?/io;
		$headlines++ if $m =~ /^(Results|Appendix|Summary|Evaluating|Evaluation|Bibliography|Biography|Footnotes|Synthesis|Description|Suggested.Reading|Ressources|Previous.Studies|Abstract|Synopsis|Acknowledgments|References|Introduction|Discussion|Conclusion)$/io;
	}
	
	# replace tags, length
	$text =~ s/<[^>]+>//go;
	$length = length( $text );
	next DIR if $length < 1000 || $length > 30000;
	my $textStart = substr($text,0,800);
	
	# Keywords
	while ( $textStart =~ /([a-z]+)/go ) {
		$keywords++ if $1 =~ /^(welcome|new|philosophy|information|mission|founded|goals?|innovative|innovations?|global)$/io; #information
	}
	
	# Frage
	while ( $text =~ /\? [A-Z]/go ) {
		$quest++;
	}
#	next DIR if 1000*$quest/$length > 0.4;
	
	# Time
	while ( $text =~ /\s($dateExp)\s/go ) {
		$time++;
	}
#	next DIR if 1000*$time/$length > 0.3 && $time > 2;
	
	# Text
	while ( $text =~ /([A-Za-z]+('[A-Za-z]+)?|(?<!\.)\d+[THNDSthnds.](?!\d| [A-Z]))/og ) { # \d. nicht in Zahl oder am Satzende
		my $m = $1;
		$we++ if $m =~ /^(we|us|our|ourselves)$/io;
		$konj++ if $m =~ /^(and|or)$/o;
		$thePast++ if $m =~ /^(century|decade|1[0-8]\d\d|19[0-7]\d)$/io; # century oder jahreszahl vor 1980
		$ord++ if $m =~ /^($ordExp)$/gio;
		$measure++ if $m =~ /^($measureExp)$/gio;
		$he++ if $m =~ /^(he|she|his|her|himself|herself)$/io;
		$name++ if $m =~ /^($nameExp)$/o;
		$adjPN++ if $m =~ /^($adjExp)$/io;
		$detDef++	if $m =~ /^(the|this)$/io;
		$detIndef++ if $m =~ /^(a|an)$/io;
		$vague++ if $m =~ /^(something|somehow|however|perhaps)$/io;
		$they++ if $m =~ /^(They|Their)$/o;
		$i++ if $m =~ /^I$/ || $m =~ /^(me|my|myself)$/io;
	}

	# POS, Zeiten
	my $ft = $dirT."/".$filename;
	$verb = `cat $ft | grep -c '\\WV'`;
	next DIR if $verb == 0;
	
	$hasBeen = `cat $ft | grep -A 1 '^has' | grep -c '^been'`;
	$arePP = `cat $ft | grep -A 1 '^are' | grep -c 'V[VBH][ND]\$'`;
	
	$past = `cat $ft | grep -c 'V[VBH][ND]\$'`;
	$present3 = `cat $ft | grep -c 'V[VHB]Z\$'`;
	$present = `cat $ft | grep -c 'V[VHB]P\$'`;

	# Bigramme
	while ( $text =~ /\b(about us|who we are|our results|our notion|our estimate|our analysis|our application|our conclusion|our data|our definition|our evaluation|our finding|our heuristic|our measure|our method|our methodology|our model|our research|our sample|we chose|we assume|we expect|we compare|we conclude|we propose|we suggest|we note|we estimate|we omit|we discuss|we test|we studied|we examine|we analyze|we calculate|we compute|we define|we encountered|we exclude|we focus|we include)\b/gio ) {
		my $m = $1;
		if ($m =~ /(about us)|(who we are)/i) {
			$keywords++;
		} else {
			$bigrams++;
		}
	}
	
	# Normalize
	$past = sprintf "%.3f", $past/$verb;
	$present = sprintf "%.3f", $present/$verb;
	$present3 = sprintf "%.3f", $present3/$verb;
	
	$name = sprintf "%.3f", 1000*$name/$length;
	$quest = sprintf "%.3f", 1000*$quest/$length;
	$we = sprintf "%.3f", 1000*$we/$length;
	$he = sprintf "%.3f", 1000*$he/$length;
	$i = sprintf "%.3f", 1000*$i/$length;
	$they = sprintf "%.3f", 1000*$they/$length;
	$detDef = sprintf "%.3f", 1000*$detDef/$length;
	$detIndef = sprintf "%.3f", 1000*$detIndef/$length;
	$verb = sprintf "%.3f", 1000*$verb/$length;
	$konj = sprintf "%.3f", 1000*$konj/$length;
	$vague = sprintf "%.3f", 1000*$vague/$length;
	$adjPN = sprintf "%.3f", 1000*$adjPN/$length;
	$measure = sprintf "%.3f", 1000*$measure/$length;
	$ord = sprintf "%.3f", 1000*$ord/$length;
	$bigrams = sprintf "%.3f", 1000*$bigrams/$length;
	$thePast = sprintf "%.3f", 1000*$thePast/$length;
	$hasBeen = sprintf "%.3f", 1000*$hasBeen/$length;
	$arePP = sprintf "%.3f", 1000*$arePP/$length;
	
	# if & print
	$filename = substr($filename,0,10) if length($filename) > 10;
	
	if ( $length > 1000 && $length < 30000 && $quest < 0.8 && $measure < 0.01 && $time < 4 && $keywords > 0 && $vague < 0.7 &&
		$detDef > $detIndef && $verb > 8.3 && $present > 0 && $konj > 3 && $ord < 0.5 && $bigrams < 0.5 &&
		$he < 0.9 && $i < 1.5 && $they < 0.9 && # kein bericht über person, wenig verallgemeinerung (vs. 9), nicht "ich"
		($we > 0.5 || ($hasBeen > 0.3 || $arePP > 0.8) && $present3 > 0.1) && # we-form oder "has been", "are advised", "is good"
	1 ){
		print "$filename\t$name,\t$quest,\t$we,\t$i,\t$arePP,\t$hasBeen,\t$present3,\t$he,\t$they,\t$time,\t$detDef,\t$detIndef,\t$verb,\t$konj,\t$vague,\t$adjPN,\t$measure,\t$ord,\t$bigrams,\t$past,\t$thePast,\t$present,\t$keywords,\t$form,\t$headlines,\t$length\n";
	} elsif ($base == 31) {
		print "$filename\t$name,\t$quest,\t$we,\t$i,\t$arePP,\t$hasBeen,\t$present3,\t$he,\t$they,\t$time,\t$detDef,\t$detIndef,\t$verb,\t$konj,\t$vague,\t$adjPN,\t$measure,\t$ord,\t$bigrams,\t$past,\t$thePast,\t$present,\t$keywords,\t$form,\t$headlines,\t$length NO\n";
	}
}

close( DH );


# ================  FUN  ================ #

sub initNames {
	my $reg = "(?:";
	open( L1, "../Wortlisten/names.txt" ) || die $!;
	binmode(L1, ":utf8"); 
	while ( <L1> ) {
		chomp;
		$reg .= "$_|";
	}
	close( L1 );
	chop $reg;
	$reg .= ")";
	return $reg;
}

sub initCause {
	my $reg = "(?:";
	open( L1, "../Wortlisten/kausal.txt" ) || die $!;
	binmode(L1, ":utf8"); 
	while ( <L1> ) {
		chomp;
		$reg .= "$_|";
	}
	close( L1 );
	chop $reg;
	$reg .= ")";
	return $reg;
}

sub initAdj {
	my $adj = "(?:";

	open( NG, "../Wortlisten/adjectives_neg.txt" ) || die "can't find neg.txt";
	binmode(NG, ":utf8"); 
	while ( <NG> ) {
		chomp;
		$_ = lc($_);
		$adj .= "$_|";
	}
	close( NG );

	open( PS, "../Wortlisten/adjectives_pos.txt" ) || die "can't find pos.txt";
	binmode(PS, ":utf8"); 
	while ( <PS> ) {
		chomp;
		$_ = lc($_);
		$adj .= "$_|";
	}
	close( PS );

	chop ($adj);
	$adj .= ")";

	return $adj;
}

