#!/usr/bin/perl -w

# erkennt nachrichten

# ------------------------  HISTRORY -----------------------
# 07.01.06	new			created from find_literatur.pl, version 07.01.06
# 24.01.06	debug

$VERSION = '24.01.06';


#------------------------------------------------------------------------------
# Standard pragmas & CPAN modules
#------------------------------------------------------------------------------

use strict;
use HTML::Entities;
use Encode;
use Encode::Byte;
require 5.004;
use utf8;

#use lib ('../helpers');
#use Astro;

binmode(STDOUT, ":utf8"); 

# --------- vars -------------
use constant DEBUG 		=> 0;			# 0: off, 1: on
use constant USE_CLEAN 	=> 1;			# 0: off, 1: on

my $base = $ARGV[0] || "24";
my $dirBase = $ARGV[1] || "train";
my $oneFile = $ARGV[2] || undef;
my $dir = "../Korpus/".$dirBase."/".$base."/";
my $dirT = "../Korpus/tagged_".$dirBase."/".$base."/";
my $filename;
my $num_files = 50;

# exp
my $ordExp = qr#first|second|third|fourth|fifth|sixth|seventh|eighth|ninth|\d*1st|\d*2nd|\d*3rd|\d+th|\d{1,2}\.|II+#oi;
my $pro1Exp = qr#I|my|me|myself|mine#io;
my $nameExp = initNames();
my $causalExp = initCause();
my ($posExp,$negExp,$neutExp) = initAdj();

my $day_e_ord = qr/([1-3]?(1st|2nd|3rd|[04-9]th)|11th|12th|13th)/;
my $month_e = qr/(?:January|February|March|April|May|June|July|August|September|October|November|December|Jan\.?|Feb\.?|Mar\.?|Apr\.?|May|Jun\.?|Jul\.?|Aug\.?|Sep\.?|Oct\.?|Nov\.?|Dec\.?|Sept\.?)/;
my $year = qr/((?:19|20)[0-9]{2}|'\d{2}|[901]\d)/;
my $sep_e = qr/[.\/-]/;
my $day_num = qr/([1-9]|0[1-9]|[12][0-9]|3[01])/;
my $month_num = qr/0[1-9]|10|11|12|[1-9]|Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|Sept/;
my $time = qr#([01][0-9]|2[0-3])[-.:][0-5][0-9]#;
my $day = qr#yesterday|today#i;

my $dateExp = qr/($day, ?$time|$day_e_ord $month_e( $year)?|$month_e $day_e_ord(, $year)?|$month_e $day_num(, $year)?|$day_num$sep_e$month_num$sep_e$year|$month_num$sep_e$day_num$sep_e$year)/;

undef $/;

# open dir
opendir( DH, $dir ) || die "$dir not found!";
print $dir, "\n", "-" x 50, "\n";

# ------ process files -------
DIR: while ($filename = readdir( DH )) {
	next unless !defined($oneFile) || $filename eq $oneFile;
	next if $filename =~ /^\./;
	last if $num_files-- == 0;
	print STDERR ".";
	print $filename.": " if DEBUG;
	
	# ------- calculate features  ----------
	my $bibs = 0;
	
	# read content
	open( FH, $dir.$filename ) || die "$filename does not exist";
	binmode(FH, ":utf8"); 
	my $text = <FH>;
	close( FH );
	
	# decode
	if (!USE_CLEAN) {
		my $charset = guess_charset($dir.$filename, "e");
		print STDERR "(".$charset.") "			if DEBUG;
		$text = decode_utf8( $text ) 			if $charset eq "UTF8";
		$text = decode( 'ascii', $text )		if $charset eq "ASCII";
		$text = decode( 'iso-8859-1', $text )	if $charset eq "ISO-1";
		$text = decode( 'iso-8859-15', $text )	if $charset eq "ISO-15";
		$text = decode( 'cp1252', $text )		if $charset eq "WINDOWS";
		$text = decode( 'MacRoman', $text )		if $charset eq "MAC";
	}
	
	# remove several tags (usual nonsense + select,input-hidden)
	$text =~ s/\s+/ /gsio;
	$text =~ s/<input[^>]*(hidden|submit|reset|radio|checkbox)[^>]*>//gsio;
	$text =~ s/<(\w+)[ :][^>]*>/<$1>/gio;
	$text =~s#(?:<(head|script|style|noframes|select)[^>]*>.*?<\/\1>)|(?:</?(?:font|b|i|u|em|div|center|span|strong|o|yle|st1|![^>]+)(?:[ :=][^>]+)?/?>)|(?:<!--.*?-->)##gsio;
	$text =~ s/&nbsp;/ /gsoi;
	$text =~ s/\s+/ /gsio;
	$text = decode_entities( $text );
	$text =~ s/[`´‘’˚'']/'/gio;
	$text =~ s/[“”"]/"/gio;
	
	
	# get Features
	my $length = 0;
	my $you = 0;
	my $he = 0;
	my $we = 0;
	my $ing = 0;
	my $ord = 0;
	my $name = 0;
	my $headline = 0;
	my $pro1 = 0;
	my $form = 0;
	my $verb = 0;
	my $past = 0;
	my $conj = 0;
	my $speech = 0;
	my $present = 0;
	my $causal = 0;
	my $time = 0;
	my $adj = 0;
	my $adjPN = 0;
	my $kontr = 0;
	my $quest = 0;
	my $sent = 0; # anzahl . am wortende
	
	# ordinalzahlen, form-elemente, links
	while ( $text =~ m#<ol>(.*?)</ol>#gio ) {
		my $m = $1;
		while ( $m =~ m#<li>#gio ) {
			$ord++;
		}
	}
	while ( $text =~ m#<input|<textarea#gio) {
		$form++;
	}
	next DIR if $form > 10;
	
	my $li = 0;
	while ( $text =~ m#<li#gio) {
		$li++;
	}
	next DIR if $li > 10;
	
	# headline
	while ( $text =~ /<h\d[^>]*>/gio ) {
		$headline++;
	}
	next DIR if $headline > 3;
	
	$text =~ s/<[^>]+>//go;
	$length = length( $text );
	my $norm = $length < 1000 ? 1000 : $length;
#	next DIR if $length > 6500 || $length < 200;
	
	# ?
	{
	my $k = 0;
	while ($k >= 0) {
		$k = index($text,'?',$k+1);
		$quest++ if $k > 0;
	}
	}
#	next DIR if 1000*$quest/$norm > 0.01;
	
	# words
	while ( $text =~ /([a-z0-9]+('[a-z]+|[.%\$](?!\d))?)/iog ) {
		my $m = $1;
		$you++ if $m =~ /^you($|')/io;
		$he++ if $m =~ /^he$|^she$/io;
		$we++ if $m =~ /^we$/io;
		$ing++ if $m =~ /ing$/io;
		$name++ if $m =~ /^$nameExp$/o;
		$pro1++ if $m =~ /^$pro1Exp$/io;
		$ord++ if $m =~ /^$ordExp$/io;
		$causal++ if $m =~ /^$causalExp$/io;
		$time++ if $m =~ /^(months?|years?|weeks?|now|yesterday|after|before|past|future|present)$/io;
		$adj++	if $m =~ /^$neutExp$/io;
		$adjPN++ if $m =~ /^($posExp|$negExp)$/io;
		$kontr++ if $m =~ /n't|'ll|'d|'re|'ve/o;
		$sent++ if $m =~ /[a-z%0-9\$]{3}\.$/o; # klein geschriebenes wort vor punkt -> satzende
#		next DIR if 1000*$you/$norm > 0.3 || 1000*($adj+$adjPN)/$norm > 20 || 1000*$adjPN/$norm > 0.4;
#		next DIR if 1000*$causal/$norm > 4 || 1000*$name/$norm > 15;
	}
	$adj += $adjPN;
	next DIR if $sent < 1;
	
	# time
	while ( $text =~ /$dateExp/g ) {
		$time++;
	}
	next DIR if $time == 0;
	
	# speech
	while ( $text =~ /(?:\W["'])|(?:["']\W)/g ) {
		$speech++;
	}
	
	# pos
	my $ft = $dirT."/".$filename;
	$verb = `cat $ft | grep -c '\\WV'`;
	$past = `cat $ft | grep -c 'V[VBH][ND]\$'`;
	$present = `cat $ft | grep -c 'VV[PZ]\$'`; # ohne have und be, wegen "has said" etc.
	$conj = `cat $ft | grep -c 'CC\$'`;
	
	next DIR if 1000*$verb/$norm < 3;
	
	my $you_rel = sprintf "%.3f", 1000*$you/$norm;
	$he = sprintf "%.3f", 1000*$he/$norm;
	my $ord_rel = sprintf "%.3f", 1000*$ord/$norm;
	$conj = sprintf "%.3f", 1000*$conj/$norm;
	$causal = sprintf "%.3f", 1000*$causal/$norm;
	$time = sprintf "%.3f", 1000*$time/$norm;
	$past = sprintf "%.3f", $past/$verb;
	$ing = sprintf "%.3f", $ing/$verb;
	$present = sprintf "%.3f", $present/$verb;
	$verb = sprintf "%.3f", 1000*$verb/$norm;
	$adjPN = sprintf "%.3f", $adjPN/$adj;
	$adj = sprintf "%.3f", 1000*$adj/$norm;
	$kontr = sprintf "%.3f", 1000*$kontr/$norm;
	$quest = sprintf "%.3f", 1000*$quest/$norm;
	
	if ( $length < 6500 && $length > 200 && $headline < 3 && $sent > 1
		 && ($you_rel < 0.3 || $you_rel < 0.9 && $speech > 6*$you) && ($we+$pro1-3*$speech <= 0 || $we+$pro1 < 3)
		 && $verb >= 5 && $past > 0.18 && ($past > $present || $ing > $present )
		 && $adj < 20 && $adjPN < 0.4
		 && $causal < 4 && $time > 0 && $name < 15 && $quest < 0.01 && ($ord_rel < 1.5 || $ord < 3)
		 && ($kontr < 0.4 || $speech > 0 && $kontr/$speech < 0.2) ) {
		print "$filename $sent ,$length, $kontr, $adj, $adjPN, $verb, $past, $present, $ing, $speech, $you_rel, $we, $pro1, $ord, $ord_rel, $name, $time IS\n";
	} elsif ($base == 24) {
		print "$filename $sent, $length, $kontr, $adj, $adjPN, $verb, $past, $present, $ing, $speech, $you_rel, $we, $pro1, $ord, $ord_rel, $name, $time NO\n";
	}
}

close( DH );

# ================  FUN  ================ #

sub initNames {
	my $reg = "(?:";
	open( L1, "../Wortlisten/names.txt" ) || die $!;
	binmode(L1, ":utf8"); 
	while ( <L1> ) {
		chomp;
		$reg .= "$_|";
	}
	close( L1 );
	chop $reg;
	$reg .= ")";
	return $reg;
}

sub initCause {
	my $reg = "(?:";
	open( L1, "../Wortlisten/kausal.txt" ) || die $!;
	binmode(L1, ":utf8"); 
	while ( <L1> ) {
		chomp;
		$reg .= "$_|";
	}
	close( L1 );
	chop $reg;
	$reg .= ")";
	return $reg;
}

sub initAdj {
	my $pos = "(?:";
	my $neg = "(?:";
	my $neut = "(?:";
	open( NG, "../Wortlisten/adjectives_neg.txt" ) || die "can't find neg.txt";
	binmode(NG, ":utf8"); 
	while ( <NG> ) {
		chomp;
		$_ = lc($_);
		$neg .= "$_|";
	}
	close( NG );

	open( PS, "../Wortlisten/adjectives_pos.txt" ) || die "can't find pos.txt";
	binmode(PS, ":utf8"); 
	while ( <PS> ) {
		chomp;
		$_ = lc($_);
		$pos .= "$_|";
	}
	close( PS );

	open( NT, "../Wortlisten/adjectives_neut.txt" ) || die "can't find neut.txt";
	binmode(NT, ":utf8"); 
	while ( <NT> ) {
		chomp;
		$_ = lc($_);
		$neut .= "$_|";
	}
	close( NT );
	chop ($pos,$neg,$neut);
	$pos .= ")";
	$neg .= ")";
	$neut .= ")";
	return ($pos,$neg,$neut);
}
