#!/usr/bin/perl -w

# erkennt glossar
# sucht a-z-Listen, sortierte Aufzählungen, "glossary" etc.


# ------------------------  HISTORY -----------------------
# 13.12.05	new			
# 24.01.06	changed 	dir -> base

$VERSION = '24.01.06';

#--------------------------------------------------------------
# pragmas & modules

use strict;
use HTML::Entities;
use Encode;
require 5.004;
use utf8;

binmode(STDOUT, ":utf8"); 

#use lib ('../helpers');
#use Astro;

# --------- vars -------------
use constant DEBUG		=> 0;			# 0: off, 1: on
use constant USE_CLEAN 	=> 1;			# 0: off, 1: on

my $base = $ARGV[0] || "29";
my $dirBase = $ARGV[1] || "train";
my $oneFile = $ARGV[2] || undef;
my $dir = "../Korpus/".$dirBase."/".$base."/";

my $nameExp = initNames();

my $filename;
my $text = "";
my $num_files = 50;

# ----------- main -----------
undef $/;

# open dir
opendir( DH, $dir ) || die "dir not found!";
print $dir, "\n", "-" x 50, "\n";
chdir( $dir );

# ------ process files -------
DIR: while ( $filename = readdir( DH ) ) {
	next unless !defined($oneFile) || $filename eq $oneFile;
	next if $filename =~ /^\./;
	last if $num_files-- == 0;
	
	print $filename."\n"	if DEBUG;
	print STDERR ".";
	
	# ------- calculate features  ----------
	my $text = "";
	my $glossary = 0;
	my $az = 0;
	my $len = 0;
	my @dt = ();	# fuer reihenfolge
	my %dt = ();	# fuer anzahl

	
	# read content
	open( FH, $filename ) || die "$filename does not exist";
	binmode(FH, ":utf8"); 
	$text = <FH>;
	close( FH );

	# unfug + select
	$text =~ s/(?:<(head|script|style|noframes|select)[^>]*>.*?<\/\1>)|(?:<\/?(?:font|i|u|em|div|big|blockquote|small|center|span)(?: [^>]+)?>)|(?:<!--.*?-->)//gsi;
	$text =~ s/&nbsp;/ /gs;
	$text =~ s/\s+/ /gio;
	$text =~ s#<strong[^>]*>#<b>#gio;
	$text =~ s#</strong>#</b>#gio;
	
	# falls <dl> gefunden: push <dt>... 
	if ( $text =~ /<dl/i ) {
		while ( $text =~ m#<dt[^>]*>(.*?)(,.*?)?(?:</dt>|<dd)#ig ) {
			$_ = $1;
			s/<[^>]+>//g;
			s/^\s+|\s+$//g;	# trim
			next unless m/^[a-z]/i && length($_) <= 30;
			push( @dt, lc($_))	unless exists $dt{lc($_)};
		}
	} else {
		# ... sonst: push <b>wort</b>[.:-], <b>wort[.:-]</b>, GROSS[.:-]
		while ( $text =~ m#> ?<(?:b|B)[^>]*>([a-zA-Z ]{2,30})</(?:b|B)> ?[.:-]|(?:b|B)[^>]*>([a-zA-Z ]{2,30}) ?[.:-] ?</(?:b|B)>|> ?([A-Z -]{1,30}[A-Z]) ?[.:-]#g ) {
			$_ = lc($1)		if defined $1;
			$_ = lc($2)		if defined $2;
			$_ = lc($3)		if defined $3;
			s/^\s+|\s+$//g; # trim
			next unless m/^[a-z]{2,}/;
			s/<[^>]+>//g;
			push( @dt, lc($_))	unless exists $dt{lc($_)};
			$dt{$_}++;
		}
		# wenn keine <b>begriff</b> oder BEGRIFF-Liste gefunden, dann Tabellen
		if ( @dt < 5 ) {
			@dt = ();
			while ( $text =~ m#<tr> ?<td[^>]*> ?.??([a-zA-Z ]{2,30})</td> ?<td[^>]*>.*?</td> ?</tr>#g ) {
				$_ = lc($1);
				s/^\s+|\s+$//g;	# trim
				next unless m/^[a-z]{2,}/;
				s/<[^>]+>//g;
				push( @dt, lc($_))	unless exists $dt{lc($_)};
				$dt{$_}++;
			}
		}
		# wenn immer noch nichts gefunden, dann <ol> und <ul>
		if ( @dt < 5 && $text =~ /<ol|<ul/i ) {
			while ( $text =~ m#<li[^>]*> ?<b[^>]*> ?.??([a-z ]{2,30}) ?</b>#ig ) {
				$_ = $1;
				s/<[^>]+>//g;
				s/^\s+|\s+$//g; # trim
				next unless m/^[a-z]/i && length($_) <= 30;
				push( @dt, lc($_))	unless exists $dt{lc($_)};
			}
		}
	}
	
	$text = decode_entities( $text );
	$text =~ tr/´`·‐‑‒–—―⁃/''.-/;
	
	# "glossary" etc. (nicht in <a></a>)
	my $textNoA = $text;
	$textNoA =~ s#<a[^>]*>.*?</a>##gsio;
	$textNoA =~ s/<([^>]+?)>//gs;
	$glossary++	if  $textNoA =~ /glossary|definition|lexicon|reference|dictionary|(?<!usage )terms(?! of usage)/gis;
	
	$text =~ s/<([^>]+?)>//gs;
	$text =~ s/\s+/ /gio; # del later
	$len = length( $text );
	
	# expand a-e to abcde
	my $alphabet = "abcdefghijklmnopqrstuvwxyz";
	my @subs;
	my @orig;
	while ( $text =~ /((?<=\W)(\w) ?- ?(\w)(?=\W|$))/gis ) {
		next if lc($2) eq "a" && lc($3) eq "z";
		my $start = index( $alphabet, lc($2) );
		my $length = index( $alphabet, lc($3) ) - $start +1;
		next if $length <= 0;
		my $org = $1;
		$org =~ s/-/\-/;
		push(@subs, substr($alphabet,$start,$length));
		push(@orig, $org);
	}
	for (my $k=0; $k<@subs; $k++) {
		$text =~ s/$orig[$k]/$subs[$k]/;
	}
	
	$az++ if $text =~ /a\W*b\W*c\W*d\W*e\W*f\W*g\W*h\W*i\W*j\W*k\W*l\W*m\W*n\W*o\W*p\W*q\W*r\W*s\W*t\W*u\W*v\W*w\W*x\W*y\W*z/is;
	
	# frueher abbruch: a-z und "glossary" gefunden
	if ( $az > 0 && $glossary > 0 ) {
		print "$filename  \t$glossary\t$az\tIS GLOSSARY\n";
		next DIR;
	}
	
	
	# is dt sorted?
	my $sortDebug = 0;
	
	my $key1 = "a";
	my $key2 = "";
	my $errors = @dt/20;
	foreach (@dt) {
		last if $errors < -1;
		if ($_ lt $key1) {
			# "the", "to" wegwerfen
			if ($key1 =~ /^(?:the|to) (.*)/i) {
				if ($_ ge $1) {
					print "\L$_\n"	if $sortDebug;
					$key1 = $_;
					next;
				}
			}
			# ".", "'" wegwerfen
			my $key_tmp = $key1;
			s#[.'/:-]+##g;
			$key_tmp =~ s#[.'/:-]+##g;
			if ($_ ge $key_tmp) {
				print "\L$_\n"	if $sortDebug;
				$key1 = $_;
				next;
			}
			if ($key2 eq "") {
				print "\U$_\n"	if $sortDebug;
				$errors--;
				$key2 = $_;
			} elsif ($_ ge $key2) {
				$key1 = $key2;
				$key2 = "";
				print "\L$_ (new key)\n" if $sortDebug;
			} else {
				print "\U$_\n"	if $sortDebug;
				$errors--;
			}
		} else {
			print "\L$_\n"	if $sortDebug;
			$key1 = $_;
			$key2 = "";
		}
	}
	
	# normalize, classify, output	
	if ( $errors > 0 && @dt >= 10 ) {
		my $names = 0;
		while ( $text =~ /$nameExp/go ) {
			$names++;
		}
		$names = sprintf "%.3f", $names/$len;
		if ( $names < 0.025 ) {
			print "$filename  \t$glossary\t$az\t$errors\t$names\tIS GLOSSARY\n";
		} elsif ( $base == 29 ) {
			print "$filename  \t$glossary\t$az\t$errors\t$names\tNO GLOSSARY\n";
		}
	} elsif ( $base == 29 ) {
		print "$filename  \t$glossary\t$az\t$errors\tNO GLOSSARY\n";
	}

	print "\n" if DEBUG;
}
closedir( DH );

# ==============  SUB  ================ #


sub initNames {
	my $reg = "(?:";
	open( L1, "../Wortlisten/names.txt" ) || die $!;
	binmode(L1, ":utf8"); 
	while ( <L1> ) {
		chomp;
		$reg .= "$_|";
	}
	open( L1, "../Wortlisten/countries.txt" ) || die $!;
	binmode(L1, ":utf8"); 
	while ( <L1> ) {
		chomp;
		$reg .= "$_|";
	}
	open( L1, "../Wortlisten/cities.txt" ) || die $!;
	binmode(L1, ":utf8"); 
	while ( <L1> ) {
		chomp;
		$reg .= "$_|";
	}
	close( L1 );
		chop $reg;
	$reg .= ")";
	return $reg;
}

