#!/usr/bin/perl -w

# erkennt directories mit links am anfang/ende


# ------------------------  HISTORY -----------------------
# 08.12.05	new
# 05.01.06	changed		spezialisiert für linklisten: names, big = text
# 24.01.06	changed 	dir -> base

$VERSION = '24.01.06';

#--------------------------------------------------------------
# pragmas & modules

use strict;
use HTML::Entities;
use Encode;
require 5.004;
use utf8;

binmode(STDOUT, ":utf8"); 

#use lib ('../helpers');
#use Astro;

# --------- vars -------------
use constant DEBUG		=> 0;			# 0: off, 1: on
use constant USE_CLEAN 	=> 1;			# 0: off, 1: on

my $base = $ARGV[0] || "10";
my $dirBase = $ARGV[1] || "train";
my $oneFile = $ARGV[2] || undef;
my $dir = "../Korpus/".$dirBase."/".$base."/";

my $filename;
my $num_files = 50;
our %structs;
our %firstnames;
our ($a_ext, $a_int, $a_page, $a_file) = (0,0,0,0);
our $allTimeMax = 0;
our $allTimeSec = 0;
our $allTimeMaxTag = "";
our $allTimeSecTag = "";
our $isLinklist = 0;
our $linkLines = 0;
&initHashes();

# date
my $day_e_ord = qr/(?:[1-3]?(?:1st|2nd|3rd|[04-9]th)|11th|12th|13th)/;
my $month_e = qr/(?:January|February|March|April|May|June|July|August|September|October|November|December|Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|Sept)/;
my $year = qr/(?:(?:19|20)[0-9]{2}|'\d{2}|[901]\d)/;
my $sep_e = qr/[.\/-]/;
my $day_num = qr/(?:[1-9]|0[1-9]|[12][0-9]|3[01])/;
my $month_num = qr/0[1-9]|10|11|12|[1-9]|Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|Sept/;
my $time = qr#(?:[01][0-9]|2[0-3])[.-:][0-5][0-9]#;
my $day = qr#yesterday|today#i;

my $dateExp = qr/(?:$day, ?$time|$time|$day_e_ord $month_e(?: $year)?|$month_e $day_e_ord(?:, $year)?|$month_e $day_num(?:, $year)?|$day_num$sep_e$month_num$sep_e$year|$month_num$sep_e$day_num$sep_e$year)/;

# ----------- main -----------
undef $/;

# open dir
opendir( DH, $dir ) || die "dir not found!";
print $dir, "\n", "-" x 50, "\n";

# ------ process files -------
DIR: while ( $filename = readdir( DH ) ) {
	next if $filename =~ /^\./;
	next unless !defined($oneFile) || $filename eq $oneFile;
	last if $num_files-- == 0;

	print STDERR ".";
	print "\n$filename: "	if DEBUG;
	
	### main
	# vars
	$allTimeMax = 0;
	$allTimeSec = 0;
	$allTimeMaxTag = "";
	$allTimeSecTag = "";
	$isLinklist = 0;
	$linkLines = 0;
	my @lists;
	my @tables;
	my @paragraphs;
	my @divs;
	my $foundAList = 0; # something stupid (<li> als bild!)
	($a_ext, $a_int, $a_page, $a_file) = (0,0,0,0);
	
	open( FH, $dir.$filename );
	binmode(FH, ":utf8"); 
	$_ = <FH>;
	close( FH );
	
	s#</?tbody>##gsio;
	s/\s+/ /gsio;
	s/< /</gsio;
	s/ >/>/gsio;
	s#<br ?\/?>#<p>#gsio;
	s#<p[^>]*>#</p><p>#gsio;
	$_ = decode_entities( $_ );
	
	while ( m#<table[^>]*>(( ?<tr[^>]*>.*?</tr> ?)+)</table>#gsio ) {
		push( @tables, $1 );
	}
	while ( m#<ul[^>]*>(( ?<li[^>]*>.*?)+)</ul>#gsio ) {
		$foundAList = 1;
		push( @lists, "UL::$1" );
	}
	while ( m#<ol[^>]*>(( ?<li[^>]*>.*?)+)</ol>#gsio ) {
		$foundAList = 1;
		push( @lists, "OL::$1" );
	}
	while ( m#<dl[^>]*>(( ?<d(t|d)[^>]*>.*?)+)</dl>#gsio ) {
		$foundAList = 1;
		push( @lists, "DL::$1" );
	}
	while ( m#((<p[^>]*>.*?</p> ?){4,200})#gsio ) {
		push( @paragraphs, $1 );
	}
	while ( m#((<div[^>]*>.*?</div> ?){4,200})#gsio ) {
		push( @divs, $1 );
	}
	
	# struktur finden (ohne verschachtelung)
	%structs = ();
	TAB: foreach (@tables) {
		if ($_ =~ m#<table[^>]*>(.*)#gi) {	# nested: look at inner table
			push( @tables, $1 );
			next TAB;
		}
		my @lines = split /<\/tr> |<tr[^>]*>/i;
		next TAB if @lines < 5;		# min 5 zeilen
		my @structure = ();
		my $num_tds = 0;
		foreach (@lines) {
			next if $_ =~ /^\s*$/;
			#print "*T* $_\n";
			my @tds = split /<\/?td[^>]*>|<\/?th[^>]*>/i;
			$num_tds = @tds if $num_tds == 0;
			next TAB if $num_tds != @tds;	# colspan/rowsapn -> no dir
			my $struct = "";
			foreach (@tds) {
				next if $_ =~ /^\s*$|^<\/?t(d|h)[^>]*>$/gi;
				while ( m#<(/?\w+)([^>]*)>|($dateExp)|([^<\d]+)#gi ) {
					$struct .= &fillStruct($1,$2,$3,$4);
				}
				$struct .= " | ";
			}
#			print "*** $_ \n ->$struct\n" if $struct =~ /A_INT/;
			s#<[^>]*>##gi;
			if ($struct ne "") {
				#$struct .= length;
				push( @structure, $struct );
			}
		}
		if ( @structure > 3 ) {
			&compare(@structure) ;
		}
		#print "\n\nTABLE\n", join("\n", @structure) if @structure > 4;
	}
	print "TABLE: " if DEBUG;
	&printStruct();
	
	%structs = ();
	LI: foreach (@lists) {	# nur innerste liste betrachten dl-dt|dd
		if ($_ =~ m#^UL::.*?<ul[^>]*>(.*)|<ol|<dl#gi) {
			push( @lists, "UL::$1" ) if defined $1;
			next LI;
		} elsif ($_ =~ m#^OL::.*?<ol[^>]*>(.*)|<ul|<dl#gi) {
			push( @lists, "OL::$1" ) if defined $1;
			next LI;
		} elsif ($_ =~ m#^DL::.*?<dl[^>]*>(.*)|<ul|<ol#gi) {
			push( @lists, "DL::$1" ) if defined $1;
			next LI;
		}
	}
	if (@lists > 1) {
		# merge
		@lists = join ('\n', @lists);
	}
	foreach (@lists) {	# nur innerste liste betrachten dl-dt|dd
		s/DL::|OL::|UL:://;
		
		my @lines = split /<\/li> ?|<li[^>]*>|<dt[^>]*>|<\/dt> ?|<dd[^>]*>|<\/dd> ?/i;
		my @structure = ();
		foreach (@lines) {
			next if $_ =~ /^\s*$/;
			my $struct = "";
			while ( m#<(/?\w+)([^>]*)>|($dateExp)|([^<\d]+)#gi ) {
				$struct .= &fillStruct($1,$2,$3,$4);
			}
#			print "-> ** $_\n$struct\n" if $struct =~ /^A_INT|^A_EXT/;
			s#<[^>]*>##gi;
			if ($struct ne "") {
				#$struct .= length;
				push( @structure, $struct );
			}
		}
		if ( @structure > 3 ) {
			&compare(@structure) ;
		}
		#print "\n\nLIST\n", join("\n", @structure);
	}
	print "LIST: " if DEBUG;
	&printStruct();
	
	%structs = ();
	PAR: foreach (@paragraphs) {
		my @lines = split /<\/p> ?<p[^>]*>/i;
		my @structure = ();
		foreach (@lines) {
			next if $_ =~ /^\s*$/ || length > 1000;
			s/<\/?p>//gi;
			my $struct = "";
			while ( m#<(/?\w+)([^>]*)>|($dateExp)|([^<\d]+)#gi ) {
				next PAR if defined $1 && (($1 eq "ol" || $1 eq "ul" || $1 eq "dl" || $1 eq "li") && $foundAList == 1 || $1 eq "table"); # keine liste oder table in par
				$struct .= &fillStruct($1,$2,$3,$4);
			}
			#print "*P* ",substr($_,0,50),"\n" if $struct =~ /date/;
			s#<[^>]*>##gi;
			if ($struct ne "") {
				#$struct .= length;
				push( @structure, $struct );
			}
		}
		if ( @structure > 3 ) {
			&compare(@structure) ;
		}
		#print "\n\nPARAGRAPHS\n", join("\n", @structure) if @structure > 4;
	}
	print "PARAGRAPHS: " if DEBUG;
	&printStruct();
	
	%structs = ();
	DIV: foreach (@divs) {
		my @lines = split /<\/div> ?|<div[^>]*>/i;
		my @structure = ();
		foreach (@lines) {
			s/<\/?p>//gi;
			next if $_ =~ /^\s*$/ || length > 1000;
			#print "*D* $_\n";
			my $struct = "";
			while ( m#<(/?\w+)([^>]*)>|($dateExp)|([^<\d]+)#gi ) {
				next DIV if defined $1 && ($1 eq "ol" || $1 eq "ul" || $1 eq "dl" || $1 eq "li") && $foundAList == 1; # keine liste in div
				$struct .= &fillStruct($1,$2,$3,$4);
			}
			s#<[^>]*>##gi;
			if ($struct ne "") {
				#$struct .= length;
				push( @structure, $struct );
			}
#			print "*D* ",substr($_,0,50),"\n" if $struct =~ /^A_INT/;
		}
		if ( @structure > 3 ) {
			&compare(@structure) ;
		}
		#print "\n\nDIVS\n", join("\n", @structure) if @structure > 4;
	}
	print "DIVS: " if DEBUG;
	&printStruct();
	
	
	# IS LINKLIST?
	
	if ($isLinklist || ($allTimeMaxTag =~ /(^|\| |date\+)(A_INT|A_EXT|A_#)|(A_INT|A_EXT|A_#)$/ && $allTimeMaxTag !~ /post/ && $allTimeMax > 5 || $allTimeSec > 8 && $allTimeMax - $allTimeSec < $allTimeMax/2 && $allTimeSecTag =~ /(^|\| |date\+)(A_INT|A_EXT|A_#)|(A_INT|A_EXT|A_#)$/ && $allTimeSecTag !~ /post/)) {
		my @texts = split /<a [^>]+>.*?<\/a>|<select[^>]*>.*?<\/select>|<input[^>]*>/i, $_;
		my $longest = 0;
		foreach (@texts) {
			s/<[^>]+>//go;
			$longest = length($_) if length($_) > $longest;
		}
		my $input = 0;
		while (m/<input[^>]*text|<textarea/gi) {
			$input++;
		}
		s/<[^>]+>//go;
		my $ratio = sprintf "%.3f", 10000*$linkLines/length($_);
		if ( ($ratio > 70 || $linkLines > 200) && $longest < 2000 && $linkLines/$longest > 0.07 && $input < 5) {
			print "$filename IS LINKLIST ($ratio, $linkLines, $longest, ",$linkLines/$longest,")\n";
		} elsif ($base == 10) {
			my $allLinks = 0;
			while ( m#(<a [^>]*?href.*?</a>)#gio ) {
				$allLinks .= $1;
			}
			$allLinks =~ s/<[^>]*>//go;
			s/<[^>]*>//go;
			my $ratio = sprintf "%.3f", length($allLinks)/length($_);
			if ($ratio > 0.35 ) {
				print "$filename  IS LINKLIST ($ratio)\n";
			} elsif ($base == 10) {
				print "$filename  NO[1] LINKLIST ($ratio)\n";
			}
		}
	} else {
		my $allLinks = 0;
		while ( m#(<a [^>]*?href.*?</a>)#gio ) {
			$allLinks .= $1;
		}
		$allLinks =~ s/<[^>]*>//go;
		s/<[^>]*>//go;
		my $ratio = sprintf "%.3f", length($allLinks)/length($_);
		if ($ratio > 0.35 ) {
			print "$filename  IS LINKLIST ($ratio)\n";
		} elsif ($base == 10) {
			print "$filename  NO[2] LINKLIST ($ratio)\n";
		}
	}
	
}
closedir( DH );


# ===============================     =============================== #
# =============================== FUN =============================== #
# ===============================     =============================== #

sub fillStruct () {
	my ($tag, $att, $zahl, $text) = @_;
	my $struct = "";
	if (defined $tag && $tag !~ m#/#) {
		if ($tag =~ /^a/i && defined $att) {
			if ($att =~ /href ?= ?["']#(.)/i) {
				if ($1 !~ /["']/) {
					$struct .= "A_PAGE+";
				} else {
					$struct .= "A_#+";
				}
			} elsif ($att =~ /href ?= ?["']ftp:|href ?= ?["'][^"']*\.(pdf|zip|ps|ppt|gz|doc)/i) {
				$struct .= "A_FILE+";
			} elsif ($att =~ /href ?= ?["'](http):/i) {
				$struct .= "A_EXT+";
			} elsif ($att =~ /href/i && $att !~ /javascript/i) {
				$struct .= "A_INT+";
			}
		} elsif ($tag !~ /^font|^b$|^i$|^csobj|^center$/) {
			$struct .= uc( (split /\s+|:/, $tag )[0] )."+";
		}
	}
	$struct .= "date+" 	if defined $zahl;
	if ( defined $text && $text !~ /^\s*$/ ) {
		if ( 10*(sprintf "%d", length($text)/10) > 0 ) {
			# ganz kurze texte vernachlässigen,	textlänge in 500er-schritten
			my $tl = 500*(sprintf "%d", length($text)/500);
			$struct .= "post+" if $text =~ /posted by/i;
			$struct .= "text-".$tl."+";
		}
	}
	return $struct;
}

sub compare () {
	# sucht gleiche zeilen in @structures und füllt %structs
	foreach (@_) {
		# löschen von trailing br und p; leading li, div; font, b, i | und doppelten elementen
		s/(P\+)+$//g;
		s/(BR\+)+$//g;
		s/( \| )|(^ \| )//g;
		s/^(LI\+|DIV\+)+//g;
		s/(\w+\+)\1+/$1/g;
		s/(text-\d+\+)+/$1/g;
		if (m/^(A_EXT|A_INT)/) {
			$_ = "$1";
		}
		$structs{$_}++;
	}
}

sub printStruct () {
	my ($key, $val);
	my ($max, $sec) = (0, 0);
	my ($maxTag, $secTag) = ("","");
	#print "(";
	while ( ($key, $val) = each %structs ) {
		#print "$key => $val\n ";#	if $key =~ /A_EXT|A_INT/;			
		if ($val > $max) {
			$secTag = $maxTag;
			$maxTag = $key;
			$sec = $max;
			$max = $val;
			if ($max > $allTimeMax) {
				$allTimeMax = $max;
				$allTimeMaxTag = $maxTag;
				$allTimeSec = $sec;
				$allTimeSecTag = $secTag;
				$linkLines = $max if $maxTag =~ /(^|\| |date\+)(A_EXT|A_INT|A_#)|(A_EXT|A_INT|A_#)$/;
			}
		} elsif ($val > $sec) {
			$sec = $val;
			$secTag = $key;
			if ($sec > $allTimeSec) {
				$allTimeSec = $sec;
				$allTimeSecTag = $secTag;
				$linkLines = $sec if $secTag =~ /(^|\| |date\+)(A_EXT|A_INT|A_#)|(A_EXT|A_INT|A_#)$/
									&& $maxTag !~ /(^|\| |date\+)(A_EXT|A_INT|A_#)|(A_EXT|A_INT|A_#)$/;
			}
		}
	}
	#print ")";
	print "-> $max, $sec, $maxTag, $secTag\n"  if DEBUG;
	# enthält auch dann Linkliste, wenn AllTimeMax keine ist, aber dafür TempMax min 20 externe elemente hat
	if ($maxTag =~ /(^|\| |date\+)A_EXT|A_EXT$/ && $maxTag !~ /post/ && $max > 20 || $sec > 20 && $max - $sec < $max/2 && $secTag =~ /(^|\| |date\+)A_EXT|A_EXT$/ && $secTag !~ /post/) {
		$isLinklist = 1;
		my $tmpLinkLines = ($maxTag =~ /(^|\| |date\+)A_EXT|A_EXT$/ && $maxTag !~ /post/ && $max > 20) ? $max : $sec;
		$linkLines = $tmpLinkLines if $tmpLinkLines > $linkLines;
	}
}

sub initHashes {
	open( L1, "../../wortlisten/e_vornamen.txt" ) || die $!;
	binmode(L1, ":utf8"); 
	while ( <L1> ) {
		chomp;
		$firstnames{lc($_)} = 1;
	}
	close( L1 );
}
