#!/usr/bin/perl -w

# erkennt 11: Dates (Veranstaltungen, Timelines, ...)
# sucht sortierte Aufzählungen mit Zahlen

# ------------------------   TODO  ----------------------------
# auch datum mit - oder . oder / erkennen


# ------------------------  HISTORY ---------------------------
# 14.12.05	new			
# 15.12.05	changed min -> max (war fehler), length -> + 0 (auch)
#			changed	month-names toLower, added "sept"
#			changed in text-suche keine Jahreszahlen oder 22:30. zu ambig.
#			comment	keywords bringen nichts, sind aber hier nur auskommentiert.
# 22.01.06	changed: dir -> base

$VERSION = '22.01.06';

#--------------------------------------------------------------
# pragmas & modules

use strict;
use HTML::Entities;
use Encode;
require 5.004;

binmode(STDOUT, ":utf8"); 

#use lib ('../helpers');
#use Astro;

# --------- vars -------------
use constant DEBUG		=> 0;			# 0: off, 1: on

my $base = $ARGV[0] || "11";
my $dirBase = $ARGV[1] || "train";
my $oneFile = $ARGV[2] || undef;
my $dir = "../Korpus/".$dirBase."/".$base."/";

my $filename;
my $text = "";
my $num_files = 50;

# ----------- exp ------------
my $monthsE = qr/January|February|March|April|May|June|July|August|September|October|November|December/i;
my $monthsAbbE = qr/Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|Sept/i;
my %months = (january => "01",february => "02",march => "03",april => "04",may => "05",june => "06",
		july => "07",august => "08",september => "09",october => "10",november => "11",december => "12");
my %monthsAbb = (jan => "01",feb => "02",mar => "03",apr => "04",may => "05",jun => "06", jul => "07",
		aug => "08",sep => "09",sept => "09",oct => "10",nov => "11",dec => "12");
my $dayE = qr/0[1-9]|[12][0-9]|3[01]/;
my $monthNE = qr/0[1-9]|10|11|12/;
my $yearE = qr/1\d{3}|20\d{2}/;

# ----------- main -----------
undef $/;

# open dir
opendir( DH, $dir ) || die "dir not found!";
print $dir, "\n", "-" x 50, "\n";
chdir( $dir );

# ------ process files -------
DIR: while ( $filename = readdir( DH ) ) {
	next unless !defined($oneFile) || $filename eq $oneFile;
	next if $filename =~ /^\./;
	last if $num_files-- == 0;
	
	print $filename."\n"	if DEBUG;
	print STDERR ".";
	
	# ------- calculate features  ----------
	my $text = "";
	my $len = 0;
	my @datesYMD = ();
	my @datesMD = ();
	my @datesY = ();
	my @datesT = ();
	
	# read content
	open( FH, $filename ) || die "$filename does not exist";
	binmode(FH, ":utf8"); 
	$text = <FH>;
	close( FH );

	# unfug + select
	$text =~ s/(?:<(head|script|style|noframes|select)[^>]*>.*?<\/\1>)|(?:<\/?(?:font|i|b|strong|u|em|div|big|blockquote|small|center|span)(?: [^>]+)?>)|(?:<!--.*?-->)//gsi;
	$text =~ s/&nbsp;/ /gs;
	
	# <pre>
	my @subs;
	my @orig;
	while ( $text =~ m#<pre[^>]*>((?!</pre>).*?)</pre>#gsi ) {
		my $sub = $1;
		my $org = $1;
		$sub =~ s#[\n\r]#<br>#gs;
		push(@subs, $sub);
		push(@orig, as_quotemeta($org));
	}
	for (my $k=0; $k<@subs; $k++) {
		$text =~ s/$orig[$k]/$subs[$k]/;
	}
	$text =~ s/\s+/ /gio;
	
	
	# normalize dates (vgl. timeline_train.txt)
	$text =~ s/(?<=\W)(Time|Date|Mon|Tue|Wed|Wedn|Thu|Thur|Fri|Sat|Sun|Monday|Tuesday|Wednsday|Thursday|Friday|Saturday|Sunday)(?=\W)|,|\Wc\.|~//gio;
	$text =~ s/(?<![\d\$])([1-9]|[01][0-9]|2[0-3])\.([0-5][0-9])(?=\D)/$1:$2/g;	#Uhrzeit mit : statt .
	$text =~ s/(?<=\D)(\d)(?=\D)/0$1/g;	# Zahlen zweistellig
	$text =~s/(?<=\W)(dep|arr)(?=\W)//gi;	#für Fahrpläne
	$text =~ s/($monthsE|$monthsAbbE)/\L$1/gio;	# monatsnamen toLower
	# Angaben mit Jahrezahl -> 1963-01-22
	$text =~ s/((?:$monthsE|$monthsAbbE\.?) $dayE) ?- ?(?:$monthsE|$monthsAbbE\.?) $dayE/$1/gio;
	$text =~ s/((?:$monthsE|$monthsAbbE\.?) $dayE) ?- ?$dayE/$1/gio;
	$text =~ s/($dayE)[ -]($monthsE) ($yearE)/$3-$months{$2}-$1/gio;
	$text =~ s/($monthsE)[ -]($dayE) ($yearE)/$3-$months{$1}-$2/gio;
	$text =~ s/($yearE) ($monthsE)[ -]($dayE)(?=\D)/$1-$months{$2}-$3/gio;
	$text =~ s/($dayE)[ -]($monthsAbbE)\.? ($yearE)/$3-$monthsAbb{$2}-$1/gio;
	$text =~ s/($monthsAbbE)\.? ($dayE) ($yearE)/$3-$monthsAbb{$1}-$2/gio;
	$text =~ s/($yearE) ($monthsAbbE)\.? ($dayE)(?=\D)/$1-$monthsAbb{$2}-$3/gio;
	# Angaben ohne Jahr -> 01-22
	$text =~ s/($dayE)[ -]($monthsE)\W/$months{$2}-$1/gio;
	$text =~ s/($monthsE)[ -]($dayE)(?=\D)/$months{$1}-$2/gio;
	$text =~ s/($dayE)[ -]($monthsAbbE)\W/$monthsAbb{$2}-$1/gio;
	$text =~ s/($monthsAbbE)\.? ($dayE)(?=\D)/$monthsAbb{$1}-$2/gio;
	
	$text =~ s/\s+/ /gio;
	
	# search dates: erst in dl > ul > table > text
	# falls <dl> gefunden: push <dt>... 
	if ( $text =~ /<dl/i ) {
		while ( $text =~ m#<dt[^>]*>\D{0,2}?(?:($yearE-$monthNE-$dayE)|($monthNE-$dayE)|($yearE)|(\d{2}:\d{2}))(?=\D)#ig ) {
			push( @datesYMD, $1 ) if defined $1;
			push( @datesMD, $2 )  if defined $2;
			push( @datesY, $3 )   if defined $3;
			push( @datesT, $4 )   if defined $4;
		}
	}
	if ( getMax(@datesYMD+0,@datesMD+0,@datesY+0,@datesT+0) < 4 && $text =~ /<ul/i ) {
		# falls weniger als 5 gefunden und es <ul> gibt: push <li>
		@datesYMD = ();
		@datesMD = ();
		@datesY = ();
		@datesT = ();
		while ( $text =~ m#<li[^>]*>\D{0,2}?(?:($yearE-$monthNE-$dayE)|($monthNE-$dayE)|($yearE)|(\d{2}:\d{2}))(?=\D)#ig ) {
			push( @datesYMD, $1 ) if defined $1;
			push( @datesMD, $2 )  if defined $2;
			push( @datesY, $3 )   if defined $3;
			push( @datesT, $4 )   if defined $4;
		}
	}
	if ( getMax(@datesYMD+0,@datesMD+0,@datesY+0,@datesT+0) < 4 && $text =~ /<table/i ) {
		# ... noch nichts? dann tabellen
		@datesYMD = ();
		@datesMD = ();
		@datesY = ();
		@datesT = ();
		while ( $text =~ m#<t(?:d|h)[^>]*>\D{0,2}?(?:($yearE-$monthNE-$dayE)|($monthNE-$dayE)|($yearE)(?! BC)|(\d{2}:\d{2}))(?=\D)#ig ) {
			push( @datesYMD, $1 ) if defined $1;
			push( @datesMD, $2 )  if defined $2;
			push( @datesY, $3 )   if defined $3;
			push( @datesT, $4 )   if defined $4;
		}
	}
	if (getMax(@datesYMD+0,@datesMD+0,@datesY+0,@datesT+0) < 4) {
		# ... immer noch nichts? dann nach >
		@datesYMD = ();
		@datesMD = ();
		@datesY = ();
		@datesT = ();
		while ( $text =~ m#>\D{0,3}?(?:($yearE-$monthNE-$dayE)|($monthNE-$dayE)|($yearE)|(\d{2}:\d{2}))(?=\D)#ig ) {
			push( @datesYMD, $1 ) if defined $1;
			push( @datesMD, $2 )  if defined $2;
			push( @datesY, $3 )   if defined $3;
			push( @datesT, $4 )   if defined $4;
		}
	}
# 	if (getMax(@datesYMD+0,@datesMD+0,@datesY+0,@datesT+0) < 4) {
# 		# ... immer noch nichts? dann irgendwo. aber keine Y oder T, zu mehrdeutig.
# 		@datesYMD = ();
# 		@datesMD = ();
# 		@datesY = ();
# 		@datesT = ();
# 		while ( $text =~ m#(?<=\D)(?:($yearE-$monthNE-$dayE)|($monthNE-$dayE))(?=\D)#ig ) {
# 			push( @datesYMD, $1 ) if defined $1;
# 			push( @datesMD, $2 )  if defined $2;
# 		}
# 	}

	my $sortDebug = 0;

	# längste liste
	my $longestList = "YMD";
	my @sortMe = ();
	$longestList = "MD" if @datesMD > @datesYMD;
	$longestList = "Y" if @datesY > @datesMD && @datesY > @datesYMD;
	$longestList = "T" if @datesT > @datesMD && @datesT > @datesYMD && @datesT > @datesY;
	if ($longestList eq "YMD") {
		@sortMe = map {s/\D//g;$_} @datesYMD;
	} elsif ($longestList eq "MD") {
		@sortMe = map {s/\D//g;$_} @datesMD;
	} elsif ($longestList eq "Y") {
		@sortMe = map {s/\D//g;$_} @datesY;
	} elsif ($longestList eq "T") {
		@sortMe = map {s/\D//g;$_} @datesT;
	} 
	if ( @sortMe < 4) {
		print "$filename: NO LIST\n" if $dir eq "../../korpus/clean/trainkorpus/11/";
		next DIR;
	}	
	print "$longestList: "if $sortDebug;
	
	# determine sort-order (asc? desc?) # später mal: erst so, dann so, dann vergleichen
	my $so = 0;
	for (my $i=1; $i<20 && $i<@sortMe; $i++) {
		print $sortMe[$i]."\n" if $sortDebug;
		$so++ if $sortMe[$i-1] < $sortMe[$i];
		$so-- if $sortMe[$i-1] > $sortMe[$i];
	}
	if ($so < -getMin( 10,length(@sortMe) )/2 ) {
		$so = -1;
		print "sort desc " if $sortDebug;
	} elsif ($so > getMin( 10,length(@sortMe) )/2 ) {
		$so = 1;
		print "sort asc  " if $sortDebug;
	} else {
		print "$filename IS UNSORTED\n" if $dir eq "../../korpus/clean/trainkorpus/11/";
		next DIR;
	}
	
	my $key1 = 0;
	my $key2 = 0;
	my $errors = 0;
	my $errorsInRow = 0;
	my $switches = 0;
	foreach (@sortMe) {
		last if $switches == 3;
		if ($errorsInRow == 3) {
			$so *= -1;
			$errorsInRow = 0;
			$switches++;
			print "switch\n"	if $sortDebug;
		}
		if ($so*$_ < $so*$key1) {
			if ($key2 == 0) {
				print "-$_\n"	if $sortDebug;
				$errors++;
				$errorsInRow++;
				$key2 = $_;
			} elsif ($so*$_ >= $so*$key2) {
				$key1 = $key2;
				$key2 = 0;
				$errorsInRow = 0;
				print "+$_ (new key)\n" if $sortDebug;
			} else {
				print "-$_\n"	if $sortDebug;
				$errors++;
				$errorsInRow++;
			}
		} else {
			print "+$_\n"	if $sortDebug;
			$key1 = $_;
			$key2 = 0;
			$errorsInRow = 0;
		}
	}
	
	# keywords
# 	my $textNoA = $text;
# 	$textNoA =~ s#<a[^>]*>.*?</a>##gsio;
# 	$textNoA =~ s/<([^>]+?)>//gs;
# 	$textNoA = substr($textNoA, 0, length($textNoA)/4);
	
	$text =~ s/<([^>]+?)>//gs;
	$text =~ s/\s+/ /gio; # del later
	$len = length( $text );
	
	my $magic = (1-$errors*$errors/@sortMe/@sortMe)*(@sortMe/$len);
	if ( $magic > 0.001 && $errors/@sortMe < 0.33 && $errors < 30 && 
		($longestList eq "Y" && $switches < 2 || $longestList ne "Y" && $switches < 4)) {
		print "$filename: $longestList, $errors Fehler, $switches Switches, ",@sortMe+0," Zeilen, $len Zeichen | $magic\n";
	} elsif ($base == 11) {
		print "$filename NOPE ($longestList, $errors Fehler, $switches Switches, ",@sortMe+0," Zeilen, $len) | $magic\n";
	}
	
	# keywords pt. 2	
# 	my %keywordsTop = ();
# 	my %keywords = ();
# 	while ( $textNoA =~ /\W(timeline|calendar|chronology|chronological order|schedule|timetable|upcoming events|events)\W/gis ) {
# 		$keywordsTop{lc($1)}++;
# 	}
# 	while ( $text =~ /\W(timeline|calendar|chronology|chronological order|schedule|timetable|upcoming events|events)\W/gis ) {
# 		$keywords{lc($1)}++;
# 	}

}
closedir( DH );


sub getMin {
	my $min = shift;
	while (defined($_ = shift)) {
		$min = $_ if $_ < $min;
	}
	return $min;
}
sub getMax {
	my $max = shift;
	while (defined($_ = shift)) {
		$max = $_ if $_ > $max;
	}
	return $max;
}
sub as_quotemeta {
	my $t = shift;
	$t =~ s/\\/\\\\/gs;
	$t =~ s/\[/\\\[/gs;
	$t =~ s/\]/\\\]/gs;
	$t =~ s/\{/\\\{/gs;
	$t =~ s/\}/\\\}/gs;
	$t =~ s/\?/\\\?/gs;
	$t =~ s/\+/\\\+/gs;
	$t =~ s/\*/\\\*/gs;
	$t =~ s/\./\\\./gs;
	$t =~ s/\-/\\\-/gs;
	$t =~ s/\$/\\\$/gs;
	$t =~ s/\^/\\\^/gs;
	$t =~ s/\//\\\//gs;
	$t =~ s/\(/\\\(/gs;
	$t =~ s/\)/\\\)/gs;
	return $t;
}