#!/usr/bin/perl -w

# erkennt personenlisten
# sucht sortierte Aufzählungen, vornamen, Mr., Firmennamen


# ------------------------  HISTORY -----------------------
# 15.12.05	new			from find_glossary
# 22.01.06	added		headline++ nur wenn häufigster name > 4
# 			changed		headline .= verbessert
#			added		pro1/speech < 2 (vs. 18)
# 22.01.06	changed: dir -> base

$VERSION = '22.01.06';

#--------------------------------------------------------------
# pragmas & modules

use strict;
use HTML::Entities;
use Encode;
require 5.004;
use utf8;

binmode(STDOUT, ":utf8"); 

#use lib ('../helpers');
#use Astro;

# --------- vars -------------
use constant DEBUG		=> 0;			# 0: off, 1: on
use constant USE_CLEAN 	=> 1;			# 0: off, 1: on

my $base = $ARGV[0] || "22";
my $dirBase = $ARGV[1] || "train";
my $oneFile = $ARGV[2] || undef;
my $dir = "../Korpus/".$dirBase."/".$base."/";


my $filename;
my $text = "";
my $num_files = 50;
my $nameExp = initNames();
my $gcExp = initGC();
my ($posExp,$negExp,$neutExp) = initAdj();

# ----------- main -----------
undef $/;

# open dir
opendir( DH, $dir ) || die "dir not found!";
print $dir, "\n", "-" x 50, "\n";
chdir( $dir );

# ------ process files -------
DIR: while ( $filename = readdir( DH ) ) {
	next unless !defined($oneFile) || $filename eq $oneFile;
	next if $filename =~ /^\./;
	last if $num_files-- == 0;
	
	print $filename."\n"	if DEBUG;
	print STDERR ".";
	
	# ------- calculate features  ----------
	my $text = "";
	my $he = 0;
	my $she = 0;
	my $name = 0;
	my $keywords = 0;
	my %names = ();
	my %lastnames = ();
	my $len = 0;
	my $adjPN = 0;
	my $adjPNA = 0;
	my $maxName;
	my $maxLast;
	my $headlineName = 0;
	my $headlineLast = 0;
	
	# read content
	open( FH, $filename ) || die "$filename does not exist";
	binmode(FH, ":utf8"); 
	$text = <FH>;
	close( FH );

	# unfug + select + links
	$text =~ s/(?:<(head|script|style|noframes|select|a)[^>]*>.*?<\/\1>)|(?:<\/?(?:font|i|u|em|div|big|blockquote|small|center|span)(?: [^>]+)?>)|(?:<!--.*?-->)//gsi;
	
	$text =~ s/&nbsp;/ /gio;
	$text =~ s/\s+/ /gio;
	
	my $headline = "";
	my $limit = length( $text )/5 < 1000 ? 1000 : length( $text )/5;
	my $startText = substr( $text, 0, 1000);
	my $hCounter = 0;
	while ( $text =~ m#<h\d>((?!</h).+?)</h|<b>(.+?)</b#gio ) {
		$hCounter++;
		last if $hCounter > 3;
		$_ = $1 || $2;
		$headline .= "$_ :: ";
	}
	#print STDERR "$headlineName\n" unless $headlineName eq "";
	
	$text =~ s/<([^>]+?)>//gs;
	$text = decode_entities( $text );
	my $speech = ($text =~ s/[‘“”"]/"/gio) || 1;
	
	# laenge
	$len = length( $text );

	# adjektive
	my $pos = 0;
	my $neg = 0;
	my $neut = 0;
	my $you = 0;
	my $pro1 = 0;
	
	while ($text =~ /([a-z]+)/gi) {
		$_ = lc($1);
		my $big = $1;
		$he++ 			if m/^he$/io;
		$she++ 			if m/^she$/io;
		$name++ 		if $big =~ m/^$nameExp$/o;
		$names{$_}++ 	if $big =~ m/^$nameExp$/o;
		$keywords++		if m/^(born|died|saddened|biography|obituary|dies)$/io;
		$pos++			if m/^$posExp$/io;
		$neg++			if m/^$negExp$/io;
		$neut++			if m/^$neutExp$/io;
		$lastnames{$_}++ if $big =~ m/^[A-Z]/ && !exists $names{$_} && $_ !~ /^$gcExp$/io;
		$pro1++ 		if m/^(I|my|me|myself|mine)$/io;
		$you++ 			if m/^(you|your|yourself|yourselves)$/io;
		next DIR if 1000*$you/$len > 2;
	}
	next DIR if $name == 0 || (keys %lastnames) == 0;
		
	# adjektive
	if ($neg > 0) {
		$adjPN = sprintf"%.4f", ($pos/$neg);
	} else {
		$adjPN = sprintf"%.4f", ($pos);
	}
	if ($pos+$neg+$neut > 0) {
		$adjPNA = sprintf"%.4f", ($pos+$neg)/($pos+$neg+$neut);
	} else {
		$adjPNA = 0;
	}
	
	# am haeufigsten genannter name (passt ziemlich gut!)
	my $n = (sort { $names{$a} <=> $names{$b} } (keys %names))[-1];
	$maxName = $names{$n};
#	print "$n: $maxName :: ",$maxName/$len,"\n";
	if ( $headline =~ /$n/i ) {
		$headlineName += 2 if ($maxName/$len > 0.001 || $maxName > 4);
	}
	$n = (sort { $lastnames{$a} <=> $lastnames{$b} } (keys %lastnames))[-1];
	$maxLast = $lastnames{$n};
	$headlineLast++ if $headline =~ /$n/i && ($maxLast/$len > 0.001 || $maxLast > 4);;
	
	($he,$she) = ($she,$he) if $she > $he;
	my $he_she = $he > 0 ? $she/$he : 100; # klein/gross -> kleiner ist besser
	my $heNameLast = 1000*($he+$maxName+$maxLast)/$len+$headlineLast+$headlineName;
	my $heName = 1000*($he+$maxName)/$len+$headlineName;
	
	# normalize, sprintf
	$pro1 = sprintf "%.3f", $pro1/$speech;
	$you = sprintf "%.3f", 1000*$you/$len;
	$he_she = sprintf "%.3f", $he_she;
	my $he_abs = $he;
	$he = sprintf "%.3f", 1000*$he/$len;
	$heName = sprintf "%.3f", $heName;
	$heNameLast = sprintf "%.3f", $heNameLast;
	$adjPN = sprintf "%.3f", $adjPN;
	$filename = substr($filename,0,10);
	
	# final output
	if ( $he_abs > 1 && $he > 0.5 && $heName > 1.5 && $adjPNA > 0.06 && $he_she < 0.25 && $len > 2000 && $you < 2) {
		if ( $heNameLast > 5 && $adjPN > 2.7 && $pro1 < 2 ) {
			print "$filename\t$he_she,\t$pro1,\t$you,\t$he,\t$heName,\t$heNameLast,\t$maxName,\t$maxLast,\t$adjPN [Y1]\n";
		} elsif ( $heNameLast > 7.6 && $adjPN > 1.2 ) {
			print "$filename\t$he_she,\t$pro1,\t$you,\t$he,\t$heName,\t$heNameLast,\t$maxName,\t$maxLast,\t$adjPN [Y2]\n";
		} elsif ( $base == 22 ) {
			print "$filename\t$he_she,\t$pro1,\t$you,\t$he,\t$heName,\t$heNameLast,\t$maxName,\t$maxLast,\t$adjPN [NO2]\n";
		}
	} elsif ( $base == 22 ) {
		print "$filename\t$he_she,\t$pro1,\t$you,\t$he,\t$heName,\t$heNameLast,\t$maxName,\t$maxLast,\t$adjPN [NO1]\n";
	}
}
closedir( DH );

sub initNames {
	my $reg = "(?:";
	open( L1, "../Wortlisten/names.txt" ) || die $!;
	binmode(L1, ":utf8"); 
	while ( <L1> ) {
		chomp;
		$reg .= "$_|";
	}
	chop $reg;
	$reg .= "|Herb)";
	close( L1 );
	return $reg;
}
sub initGC {
	my $reg = "(?:";
	open( L1, "../Wortlisten/general_english.txt" ) || die $!;
	binmode(L1, ":utf8"); 
	my $i = 0;
	while ( <L1> ) {
		last if $i++ > 400;
		chomp;
		$_ = lc($_);
		$reg .= "$_|";
	}
	chop $reg;
	$reg .= ")";
	close( L1 );
	return $reg;
}
sub initAdj {
	my $pos = "(?:";
	my $neg = "(?:";
	my $neut = "(?:";
	open( NG, "../Wortlisten/adjectives_neg.txt" ) || die "can't find neg.txt";
	binmode(NG, ":utf8"); 
	while ( <NG> ) {
		chomp;
		$_ = lc($_);
		$neg .= "$_|";
	}
	close( NG );

	open( PS, "../Wortlisten/adjectives_pos.txt" ) || die "can't find pos.txt";
	binmode(PS, ":utf8"); 
	while ( <PS> ) {
		chomp;
		$_ = lc($_);
		$pos .= "$_|";
	}
	close( PS );

	open( NT, "../Wortlisten/adjectives_neut.txt" ) || die "can't find neut.txt";
	binmode(NT, ":utf8"); 
	while ( <NT> ) {
		chomp;
		$_ = lc($_);
		$neut .= "$_|";
	}
	close( NT );
	chop ($pos,$neg,$neut);
	$pos .= ")";
	$neg .= ")";
	$neut .= ")";
	return ($pos,$neg,$neut);
}


sub as_quotemeta {
	my $t = shift;
	$t =~ s/\\/\\\\/gs;
	$t =~ s/\[/\\\[/gs;
	$t =~ s/\]/\\\]/gs;
	$t =~ s/\{/\\\{/gs;
	$t =~ s/\}/\\\}/gs;
	$t =~ s/\?/\\\?/gs;
	$t =~ s/\+/\\\+/gs;
	$t =~ s/\*/\\\*/gs;
	$t =~ s/\./\\\./gs;
	$t =~ s/\-/\\\-/gs;
	$t =~ s/\$/\\\$/gs;
	$t =~ s/\^/\\\^/gs;
	$t =~ s/\//\\\//gs;
	$t =~ s/\(/\\\(/gs;
	$t =~ s/\)/\\\)/gs;
	return $t;
}