#!/usr/bin/perl -w

# erkennt personenlisten
# sucht sortierte Aufzählungen, vornamen, Mr., Firmennamen


# ------------------------  HISTORY -----------------------
# 15.12.05	new			from find_glossary
# 21.12.05	added		Laendernamen, liste/length-Bedingung
# 22.01.06	changed		dir -> base

$VERSION = '22.01.06';


#--------------------------------------------------------------
# pragmas & modules

use strict;
use HTML::Entities;
use Encode;
require 5.004;

binmode(STDOUT, ":utf8"); 

#use lib ('../helpers');
#use Astro;

# --------- vars -------------
use constant DEBUG		=> 0;			# 0: off, 1: on
use constant USE_CLEAN 	=> 1;			# 0: off, 1: on

my $base = $ARGV[0] || "7";
my $dirBase = $ARGV[1] || "train";
my $oneFile = $ARGV[2] || undef;
my $dir = "../Korpus/".$dirBase."/".$base."/";
my $dirT = "../Korpus/tagged_".$dirBase."/".$base."/";

my $filename;
my $text = "";
my $num_files = 50;
my $titles = "Mr|Mrs|Ms|Dr|Prof|Rev|Bishop|Archbishop|Father|Lord|Earl|Duke|Count|Countess|Baron|Baroness";
my $comp = "Systems|Corporation|Broadcast|Productions?|United|Ltd|Designs?|Holdings?|Services?|Inc|Limited|University|Company|Centre|Academy|Department|Group|AG|Theatre|Magazine|Publications|Association|Associates|Co|Laboratories|Corp|Works|Engineering|Technology|Electronics";
my $nameExp = initNames();
$nameExp .= "|$titles|$comp";

# ----------- main -----------
undef $/;

# open dir
opendir( DH, $dir ) || die "dir not found!";
print $dir, "\n", "-" x 50, "\n";
chdir( $dir );

# ------ process files -------
DIR: while ( $filename = readdir( DH ) ) {
	next unless !defined($oneFile) || $filename eq $oneFile;
	next if $filename =~ /^\./;
	last if $num_files-- == 0;
	
	print $filename."\n"	if DEBUG;
	print STDERR ".";
	
	# ------- calculate features  ----------
	my $text = "";
	my $glossary = 0;
	my $az = 0;
	my $len = 0;
	my $number = 0;
	my @dt = ();	# fuer reihenfolge

	
	# read content
	open( FH, $filename ) || die "$filename does not exist";
	binmode(FH, ":utf8"); 
	$text = <FH>;
	close( FH );

	# unfug + select
	$text =~ s/(?:<(head|script|style|noframes|select)[^>]*>.*?<\/\1>)|(?:<\/?(?:a|font|i|u|em|div|big|blockquote|small|center|span)(?: [^>]+)?>)|(?:<!--.*?-->)//gsi;
	$text =~ s/&nbsp;/ /gs;
	$text =~ s#<strong[^>]*>#<b>#gio;
	$text =~ s#</strong>#</b>#gio;
	$text =~ s#<([^>]*?) [^>]*>#<$1>#gio;
	
	my @subs;
	my @orig;
	# process <pre>
	while ( $text =~ m#<pre[^>]*>(.*?)</pre>#gsi ) {
		my $sub = $1;
		my $org = $1;
		$sub =~ s#[\n\r]#<br>#gs;
		push(@subs, $sub);
		push(@orig, as_quotemeta($org));
	}
	for (my $k=0; $k<@subs; $k++) {
		$text =~ s/$orig[$k]/$subs[$k]/;
	}
	$text =~ s/\s+/ /gio;
	
	my $theWayThatIfoundYou = "dl";
	# falls <dl> gefunden: push <dt>... 
	if ( $text =~ /<dl/i ) {
		while ( $text =~ m#<dt[^>]*>(.*?)(?:</dt>|<dd)#ig ) {
			$_ = $1;
			s/<[^>]+>//g;
			s/(?<=\W)$titles(?=\W)/ /go;
			s/^\W+|\W+$//g;
			next unless m/(?:(?<=\W)|^)($nameExp)(?:(?=[^a-zA-Z'])|$)/o;
			print "* $_ -> $1\n" if DEBUG;
			push( @dt, $_);
		}
	}
	# wenn nichts gefunden, dann <ol> und <ul>
	if ( @dt < 15 && $text =~ /<ol|<ul/i ) {
		$theWayThatIfoundYou = "ul";
		@dt = ();
		while ( $text =~ m#<li[^>]*>(.*?)(?=</li>|<li>|</ul>|</ol>)#oig ) {
			$_ = $1;
			s/<[^>]+>//g;
			s/(?<=\W)$titles(?=\W)/ /g;
			s/^\W+|\W+$//g;
			next unless m/(?:(?<=\W)|^)($nameExp)(?:(?=[^a-zA-Z'])|$)/o;
			print "* $_ -> $1\n" if DEBUG;
			push( @dt, $_);
		}
	}
	# noch nichts? tabellen
	if ( @dt < 15 ) {
		$theWayThatIfoundYou = "td";
		@dt = ();
		while ( $text =~ m#<tr[^>]*> ?<td[^>]*>(.*?)</td> ?</tr>|<tr[^>]*> ?<td[^>]*>(.*?</td> ?<td[^>]*>.*?)</td>#gio ) {
			$_ = $1 if defined $1;
			$_ = $2 if defined $2;
			s#</td>#,#gi;
			s/<[^>]+>//g;
			s/(?<=\W)$titles(?=\W)/ /go;
			s/^\W+|\W+$//g;
			next unless m/(?:(?<=\W)|^)($nameExp)(?:(?=[^a-zA-Z'])|$)/o;
			print "* $_ -> $1\n" if DEBUG;
			push( @dt, $_);
		}
	}
	# noch nichts? <b>wort</b>, GROSS[.:-]
	if ( @dt < 15 ) {
		$theWayThatIfoundYou = "b";
		@dt = ();
		while ( $text =~ m#> ?<(?:b|B)>(.*?)</(?:b|B)>|> ?([A-Z][A-Z,; -]{1,30}[A-Z])#go ) {
			$_ = $1		if defined $1;
			$_ = $2		if defined $2;
			s/<[^>]+>//g;
			s/(?<=\W)$titles(?=\W)/ /go;
			s/^\W+|\W+$//g;
			next unless m/(?:(?<=\W)|^)($nameExp)(?:(?=[^a-zA-Z'])|$)/o;
			print "* $_ -> $1\n" if DEBUG;
			push( @dt, $_);
		}
	}
	# immer noch nichts... \nname
	if ( @dt < 10 ) {
		$theWayThatIfoundYou = "nl";
		$text =~ s#(<br>|<p>|<\/p>|<\/div>|<\/tr>|<tr>|<br ?\/>)#\n#gsio;
		$text =~ s#<[^>]*>##gsio;
		@dt = ();
		while ( $text =~ m#\n([^\n]{3,30})#g ) {
			$_ = $1		if defined $1;
			if (length == 30) {
				s/\w+\s*$//;
			}
			s/<[^>]+>//go;
			s/(?<=\W)$titles(?=\W)/ /go;
			s/^\W+|\W+$//go;
			next unless m/(?:(?<=\W)|^)($nameExp)(?:(?=[^a-zA-Z'])|$)/o;
			print "* $_ -> $1\n" if DEBUG;
			push( @dt, $_);
		}
	}
		
	$text =~ s/<[^>]+?>//gos;
	$text =~ s/\s+/ /gio; # del later
	
	while ( $text =~ /\d+/go ) {
		$number++;
	}
	
	$len = length( $text );
		
	# is dt sorted?
	my $sortDebug = 0;
	
	my $key1 = "a";
	my $key2 = "";
	my $key1_2 = "a";
	my $key2_2 = "";
	my $key1_3 = "a";
	my $key2_3 = "";
	my $errors = 0;
	my $errors2 = 0;
	my $errors3 = 0;
	my ($length,$length2,$length3) = (0,0,0);
	my %seen = ();
	foreach (@dt) {
		next if exists($seen{lc($_)});
		$seen{lc($_)}++;
		my ($first, $sec, $last) = (split (/[^a-zA-Z]+/, $_))[0,1,-1];
		# sort nach erstem namen
		$_ = $first || "";
		if ( m/^[A-Z]\w{2,}/ ) {
			$length++;
			$_ = lc($_);
			if ($_ lt $key1) {
				if ($key2 eq "") {
					print "1. \U$_\n"	if $sortDebug;
					$errors++;
					$key2 = $_;
				} elsif ($_ ge $key2) {
					$key1 = $key2;
					$key2 = "";
					print "1. \L$_ (new key)\n" if $sortDebug;
				} else {
					print "1. \U$_\n"	if $sortDebug;
					$key2 = $_;
					$errors++;
				}
			} else {
				print "1. \L$_\n"	if $sortDebug;
				$key1 = $_;
				$key2 = "";
			}
		}
		# sort nach zweitem namen
		$_ = $sec || "";
		if ( m/^[A-Z]\w{2,}/ ) {
			$length2++;
			$_ = lc($_);
			if ($_ lt $key1_2) {
				if ($key2_2 eq "") {
					print "2. \U$_\n"	if $sortDebug;
					$errors2++;
					$key2_2 = $_;
				} elsif ($_ ge $key2_2) {
					$key1_2 = $key2_2;
					$key2_2 = "";
					print "2. \L$_ (new key)\n" if $sortDebug;
				} else {
					print "2. \U$_\n"	if $sortDebug;
					$key2_2 = $_;
					$errors2++;
				}
			} else {
				print "2. \L$_\n"	if $sortDebug;
				$key1_2 = $_;
				$key2_2 = "";
			}
		}
		# sort nach letztem namen
		$_ = $last || "";
		if ( m/^[A-Z]\w{2,}/ ) {
			$length3++;
			$_ = lc($_);
			if ($_ lt $key1_3) {
				if ($key2_3 eq "") {
					print "3. \U$_\n"	if $sortDebug;
					$errors3++;
					$key2_3 = $_;
				} elsif ($_ ge $key2_3) {
					$key1_3 = $key2_3;
					$key2_3 = "";
					print "3. \L$_ (new key)\n" if $sortDebug;
				} else {
					print "3. \U$_\n"	if $sortDebug;
					$key2_3 = $_;
					$errors3++;
				}
			} else {
				print "3. \L$_\n"	if $sortDebug;
				$key1_3 = $_;
				$key2_3 = "";
			}
		}
	}
	if ( $length2 > 0.9*$length && $errors2 < $errors ) {
		$errors = $errors2;	
		$length = $length2;
	}
	if ( $length3 > 0.9*$length && $errors3 < $errors ) {
		$errors = $errors3;	
		$length = $length3;
	}
	
	# normalize, classify, output	
	if ( $length > 10 && $length/$len > 0.005 && ($errors < 2 || $length > 50 && $errors < 9 || $length > 100 && $errors < 18 || $length > 500 && $errors < 30 || $length > 1000 && $errors < 50)) {
		print "$filename   \t$length\t$errors\t$number  \t$theWayThatIfoundYou\tYES[1]\n";
	} elsif ( $length/$len > 0.0075 && $length > 10 || @dt > 100 && @dt/$len > 0.0075) { #neu: nach oder
		print "$filename   \t",$length/$len,"  \t$len\t$number  \t$theWayThatIfoundYou\ YES[2]\n";
	} elsif ( $base == 7 ) {
		print "$filename   \t",@dt+0,"  \t$len  \tNO PERSON LIST\n";
	}

	print "\n" if DEBUG;
}
closedir( DH );

sub initNames {
	my $reg = "(?:";
	open( L1, "../Wortlisten/names.txt" ) || die $!;
	binmode(L1, ":utf8"); 
	while ( <L1> ) {
		chomp;
		$reg .= "$_|";
	}
	open( L1, "../Wortlisten/countries.txt" ) || die $!;
	binmode(L1, ":utf8"); 
	while ( <L1> ) {
		chomp;
		$reg .= "$_|";
	}
	open( L1, "../Wortlisten/cities.txt" ) || die $!;
	binmode(L1, ":utf8"); 
	while ( <L1> ) {
		chomp;
		$reg .= "$_|";
	}
	close( L1 );
		chop $reg;
	$reg .= ")";
	return $reg;
}
sub as_quotemeta {
	my $t = shift;
	$t =~ s/\\/\\\\/gs;
	$t =~ s/\[/\\\[/gs;
	$t =~ s/\]/\\\]/gs;
	$t =~ s/\{/\\\{/gs;
	$t =~ s/\}/\\\}/gs;
	$t =~ s/\?/\\\?/gs;
	$t =~ s/\+/\\\+/gs;
	$t =~ s/\*/\\\*/gs;
	$t =~ s/\./\\\./gs;
	$t =~ s/\-/\\\-/gs;
	$t =~ s/\$/\\\$/gs;
	$t =~ s/\^/\\\^/gs;
	$t =~ s/\//\\\//gs;
	$t =~ s/\(/\\\(/gs;
	$t =~ s/\)/\\\)/gs;
	return $t;
}