#!/usr/bin/perl -w

# erkennt reportage

# ------------------------  HISTORY -----------------------
# 17.01.06	new			created from find_erklärung.pl, version 17.01.06

$VERSION = '17.01.06';

# --------------------------  TODO -------------------------
# Länder und Städtenamen

#------------------------------------------------------------------------------
# Standard pragmas & CPAN modules
#------------------------------------------------------------------------------

use strict;
use HTML::Entities;
use Encode;
use Encode::Byte;
require 5.004;
use utf8;

#use lib ('../helpers');
#use Astro;

binmode(STDOUT, ":utf8"); 

# --------- vars -------------
use constant DEBUG 		=> 0;			# 0: off, 1: on
use constant USE_CLEAN 	=> 1;			# 0: off, 1: on

# regex
my $ordExp = qr#second|third|fourth|fifth|sixth|seventh|eighth|ninth|\d*1st|\d*2nd|\d*3rd|\d+th|11th|12th|13th|\d{1,2}\.#oi; #first
my $measureExp = qr#tablespoons?|cups?|teaspoons?|oz|tbsp|tsp#io; ## incomplete
my $nameExp = initNames();
my $adjExp = initAdj();
my $adjNExp = initAdjN();
my $countryExp = initCountries();


my $day_e_ord = qr/([2-3]?(1st|2nd|3rd|[04-9]th)|1\dth)/;
my $month_e = qr/(?:January|February|March|April|May|June|July|August|September|October|November|December|Jan\.?|Feb\.?|Mar\.?|Apr\.?|May|Jun\.?|Jul\.?|Aug\.?|Sep\.?|Oct\.?|Nov\.?|Dec\.?|Sept\.?)/;
my $year = qr/((?:19|20)[0-9]{2}|'\d{2}|[901]\d)/;
my $sep_e = qr/[.\/-]/;
my $day_num = qr/([1-9]|0[1-9]|[12][0-9]|3[01])/;
my $month_num = qr/0[1-9]|10|11|12|[1-9]|Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|Sept/;
my $timeExp = qr#([01][0-9]|2[0-3])([.:-][0-5][0-9]){1,2}#;
my $day = qr#yesterday|today#i;
my $weekday = qr#(monday|tuesday|wednsday|thursday|friday|saturday|sunday)#i;

my $dateExp = qr/($timeExp|$day_e_ord $month_e|$month_e $day_e_ord|$month_e $day_num|$day_num$sep_e$month_num$sep_e$year|$month_num$sep_e$day_num$sep_e$year)/;


my $base = $ARGV[0] || "3";
my $dirBase = $ARGV[1] || "train";
my $oneFile = $ARGV[2] || undef;
my $dir = "../Korpus/".$dirBase."/".$base."/";
my $dirT = "../Korpus/tagged_".$dirBase."/".$base."/";

my $filename;
my $num_files = 50;

undef $/;

# open dir
opendir( DH, $dir ) || die "$dir not found!";
print $dir, "\n", "-" x 50, "\n";

# ------ process files -------
DIR: while ($filename = readdir( DH )) {
	next unless !defined($oneFile) || $filename eq $oneFile;
	next if $filename =~ /^\./;
	last if $num_files-- == 0;
	print STDERR ".";
	print $filename.": " if DEBUG;
	
	# ------- calculate features  ----------
	my $bibs = 0;
	
	# read content
	open( FH, $dir.$filename ) || die "$filename does not exist";
	binmode(FH, ":utf8"); 
	my $text = <FH>;
	close( FH );
	
	# decode
	if (!USE_CLEAN) {
		my $charset = guess_charset($dir.$filename, "e");
		print STDERR "(".$charset.") "			if DEBUG;
		$text = decode_utf8( $text ) 			if $charset eq "UTF8";
		$text = decode( 'ascii', $text )		if $charset eq "ASCII";
		$text = decode( 'iso-8859-1', $text )	if $charset eq "ISO-1";
		$text = decode( 'iso-8859-15', $text )	if $charset eq "ISO-15";
		$text = decode( 'cp1252', $text )		if $charset eq "WINDOWS";
		$text = decode( 'MacRoman', $text )		if $charset eq "MAC";
	}
	
	
	# remove several tags (usual nonsense + select,input-hidden)
	$text =~ s/\s+/ /gsio;
	$text =~ s/<input[^>]*(hidden|submit|reset|radio|checkbox)[^>]*>//gsio;
	$text =~ s/<(?!img)(\w+)[ :][^>]*>/<$1>/gio;
	$text =~s#(?:<(head|script|style|noframes|select)[^>]*>.*?<\/\1>)|(?:</?(?:i|u|em|div|center|span|strong|o|yle|st1|![^>]+)(?:[ :=][^>]+)?/?>)|(?:<!--.*?-->)##gsio;
	$text =~ s/&nbsp;/ /gsoi;
	$text =~ s/\s+/ /gsio;
	$text = decode_entities( $text );
	$text =~ s/[`´‘’˚']/'/gio;
	$text =~ s/[“”]/"/gio;
	
	# Features
	my $length = 0;
	my $name = 0;
	my $quest = 0;
	my $we = 0;
	my $i = 0;
	my $he = 0;
	my $time = 0;
	my $time_abs = 0;
	my $verb = 0;
	my $konj = 0;
	my $vague = 0;
	my $adjPN = 0;
	my $adj = 0;
	my $measure = 0;
	my $ord = 0;
	my $bigrams = 0;
	my $past = 0;
	my $thePast = 0; # century, decade, jahre vor 1950
	my $present = 0;
	my $form = 0;
	my $kontr = 0;
	my $argue = 0;
	my $common = 0;
	my $vocab = 0;
	my $living = 0;
	my $land = 0;
	my $portraitWords = 0;
	# form: exclude this (8,38)
	my $noLinks = $text;
	$noLinks =~ s#<a>(?!</a>).*?</a>##gio;
	
	$form = ($text =~ s/<input[^>]*>//gio) || 0;
	next DIR if $form > 10;
		
	# replace tags, length
	$text =~ s/<[^>]+>//go;
	$noLinks =~ s/<[^>]+>//go;
	$length = length( $text );
	next DIR if $length < 2500 || $length > 45000;
	my $textStart = substr($text,0,800);
		
	# Frage
	while ( $text =~ /\? [A-Z]/go ) {
		$quest++;
	}
	next DIR if 1000*$quest/$length > 3;
	
	# Time
	while ( $noLinks =~ /\s($dateExp)\s/go ) {
		$time++;
	}
	next DIR if 1000*$time/$length > 0.6;

	# Text
	while ( $text =~ /([a-z]+('[a-z]+)?|\d+[thrds.])/iog ) {
		my $m = $1;
		$we++ if $m =~ /^(we|us|our|ourselves)$/io;
		$konj++ if $m =~ /^(and|or)$/o;
		$thePast++ if $m =~ /^(century|decade|1[0-8]\d\d|19[0-7]\d)$/io; # century oder jahreszahl vor 1980
		$ord++ if $m =~ /^($ordExp)$/gio;
		$measure++ if $m =~ /^($measureExp)$/gio;
		$he++ if $m =~ /^(he|she|his|her|himself|herself|they|their|themselves)$/io; # changed here: 3 pl
		$name++ if $m =~ /^($nameExp)$/o;
		$adjPN++ if $m =~ /^($adjExp)$/io;
		$vague++ if $m =~ /^(something|somehow|however|perhaps|just|another)$/io;
		$i++ if $m =~ /^I$/ || $m =~ /^(me|my|myself)$/io;
		$adj++ if $m =~ /^$adjNExp$/io;
		$kontr++ if $m =~ /n't|'ll|'d|'re|'ve/o;
		$argue++ if $m =~ /^But$/o;
		$argue++ if $m =~ /^(finally|neveretheless|admit|actually|yes|despite|indeed|unless|either|question|aside|way|theory|furthermore)$/io;
		$common++ if $m =~ /^(they|their|always|never|every|worst|worse|good|best|better|bad)$/io;
		$vocab++ if $m =~ /^And$/o;
		$vocab++ if $m =~ /^(well|really|folks|great|only|this|dreamed|fear|feared|enough|guess)$/io;
		$land++ if $m =~ /^($countryExp)$/io;
		$portraitWords++ if $m =~ /^(born|died|saddened|biography|obituary|dies)$/io;
#		print "* $m\n" if $m =~ /^(born|died|saddened|biography|obituary|dies)$/;
		next DIR if 1000*$vocab/$length > 3 || 1000*$he/$length > 8 || 1000*$name/$length > 6.5 || 1000*$kontr/$length > 2.5;
		next DIR if 1000*$argue/$length > 1.3 || 1000*$common/$length > 3.8;
	}
	$adj += $adjPN;
#	next DIR if  1000*$konj/$length < 4;

	# POS, Zeiten
	my $ft = $dirT."/".$filename;
	$verb = `cat $ft | grep -c '\\WV'`;
	next DIR if  1000*$verb/$length < 12;
	
	$past = `cat $ft | grep -c 'V[VBH][ND]\$'`;
	$present = `cat $ft | grep -c 'V[VHB]P\$'`;
	$living = `cat $ft | grep -A 1 '^[tT]he' | grep -c 'N[NP]S\$'`;

	# Bigramme
	while ( $text =~ /\b(our results|our notion|our estimate|our analysis|our application|our conclusion|our data|our definition|our evaluation|our finding|our heuristic|our measure|our method|our methodology|our model|our research|our sample|we chose|we assume|we expect|we compare|we conclude|we propose|we suggest|we note|we estimate|we omit|we discuss|we test|we studied|we examine|we analyze|we calculate|we compute|we define|we encountered|we exclude|we focus|we include)\b/gio ) {
		$bigrams++;
	}
	next if $bigrams > 0.01;
	
	# Normalize
	$time_abs = $time;
	
	$past = sprintf "%.3f", $past/$verb;
	$present = sprintf "%.3f", $present/$verb;
	
	$name = sprintf "%.3f", 1000*$name/$length;
	$living = sprintf "%.3f", 1000*$living/$length;
	$quest = sprintf "%.3f", 1000*$quest/$length;
	$he = sprintf "%.3f", 1000*$he/$length;
	$we = sprintf "%.3f", 10*$we/$konj;
	$i = sprintf "%.3f", 10*$i/$konj;
	$verb = sprintf "%.3f", 1000*$verb/$length;
	$konj = sprintf "%.3f", 1000*$konj/$length;
	$vague = sprintf "%.3f", 1000*$vague/$length;
	$adjPN = sprintf "%.3f", 1000*$adjPN/$length;
	$adj = sprintf "%.3f", 1000*$adj/$length;
	$measure = sprintf "%.3f", 1000*$measure/$length;
	$ord = sprintf "%.3f", 1000*$ord/$length;
	$bigrams = sprintf "%.3f", 1000*$bigrams/$length;
	$thePast = sprintf "%.3f", 1000*$thePast/$length;
	$kontr = sprintf "%.3f", 1000*$kontr/$length;
	$argue = sprintf "%.3f", 1000*$argue/$length;
	$common = sprintf "%.3f", 1000*$common/$length;
	$time = sprintf "%.3f", 1000*$time/$length;
	$vocab = sprintf "%.3f", 1000*$vocab/$length;
	$land = sprintf "%.3f", 1000*$land/$length;
	
	# if & print
	$filename = substr($filename,0,10) if length($filename) > 10;
# he > 3 oder (name > 4 || N-Plural > 2: über menschen und dinge ) oder (land > 0.5 && past > 0.4: über reisen und abenteuer)

	if ($length > 2500 && $length < 45000 && $form < 10 && $name > 0.5 && $name < 6.5 && $verb > 18 && $konj > 2 &&
	$adj > 17 && $adjPN > 0.5 && $kontr < 2.5 && # gegen zu sachlich oder zu literarisch
	$argue < 1.3 && $common < 3.8 && $quest < 3 && # gegen kommentare und fragen (FAQ, Interview)
	$we+$i > 1.6 && $he < 8 && $adjPN < 4 && ($portraitWords < 1 || $name + $he < 7) && # eher erste Person; gegen Porträt
	( $past > $present && $past > 0.2 || $present > 0.2) && $thePast < 1 &&
	$bigrams < 0.01 && $time < 0.6 && $vocab < 3 && # gegen science, time, zu lockere sprache
	($he > 3 || $name > 4 || $living > 2 || ($land > 0.5 && $past > 0.4)) && # s.o.
	1) {
		print "$filename\t$name,\t$verb,\t$vocab,\t$common,\t$kontr,\t$quest\t$argue,\t$i,\t$we,\t$he,\t$time,\t$time_abs,\t$verb,\t$konj,\t$vague,\t$adj,\t$adjPN,\t$measure,\t$ord,\t$bigrams,\t$past,\t$thePast,\t$present\t$length\n";
	} elsif ($base == 3) {
		print "NO $filename\t$name,\t$verb,\t$vocab,\t$common,\t$kontr,\t$quest,\t$argue,\t$i,\t$we,\t$he,\t$time,\t$time_abs,\t$verb,\t$konj,\t$vague,\t$adj,\t$adjPN,\t$measure,\t$ord,\t$bigrams,\t$past,\t$thePast,\t$present\t$length\n";
	}
}

close( DH );


# ================  FUN  ================ #

sub initCountries {
	my $reg = "(?:";
	open( L1, "../Wortlisten/countries.txt" ) || die $!;
	binmode(L1, ":utf8"); 
	while ( <L1> ) {
		chomp;
		$reg .= "$_|";
	}
	open( L1, "../Wortlisten/cities.txt" ) || die $!;
	binmode(L1, ":utf8"); 
	while ( <L1> ) {
		chomp;
		$reg .= "$_|";
	}
	close( L1 );
		chop $reg;
	$reg .= ")";
	return $reg;
}

sub initArchaic {
	my $reg = "(?:";
	open( L1, "../Wortlisten/archaic.txt" ) || die $!;
	binmode(L1, ":utf8"); 
	while ( <L1> ) {
		chomp;
		$_ = lc($_);
		$reg .= "$_|";
	}
	chop $reg;
	$reg .= ")";
	close( L1 );
	return $reg;
}


sub initNames {
	my $reg = "(?:";
	open( L1, "../Wortlisten/names.txt" ) || die $!;
	while ( <L1> ) {
		chomp;
		$reg .= "$_|";
	}
	close( L1 );
	chop $reg;
	$reg .= ")";
	return $reg;
}

sub initCause {
	my $reg = "(?:";
	open( L1, "../Wortlisten/kausal.txt" ) || die $!;
	while ( <L1> ) {
		chomp;
		$reg .= "$_|";
	}
	close( L1 );
	chop $reg;
	$reg .= ")";
	return $reg;
}

sub initAdj {
	my $adj = "(?:";

	open( NG, "../Wortlisten/adjectives_neg.txt" ) || die "can't find neg.txt";
	while ( <NG> ) {
		chomp;
		$_ = lc($_);
		$adj .= "$_|";
	}
	close( NG );

	open( PS, "../Wortlisten/adjectives_pos.txt" ) || die "can't find pos.txt";
	while ( <PS> ) {
		chomp;
		$_ = lc($_);
		$adj .= "$_|";
	}
	close( PS );

	chop ($adj);
	$adj .= ")";

	return $adj;
}

sub initAdjN {
	my $adj = "(?:";

	open( NG, "../Wortlisten/adjectives_neut.txt" ) || die "can't find neut.txt";
	while ( <NG> ) {
		chomp;
		$_ = lc($_);
		$adj .= "$_|";
	}
	close( NG );

	chop ($adj);
	$adj .= ")";

	return $adj;
}