#!/usr/bin/perl -w

# Detects statistics pages by counting \d+, \d+% and <td>\d</td> occurrences; numbered lists (ENUM) are excluded

# ------------------------  HISTORY -----------------------
# 28.11.05	new			created from find_literatur.pl, version 23.11.
# 02.12.05	changed		replace $bla = s///; with while (m//) $bla++;
# 22.12.05	changed		swapped $percent and $number
# 22.01.06	changed 	dir -> base
# 25.01.06	added		filter "zip,zipcode,road" etc. -> guards against address directories

# Script version (matches the date of the last HISTORY entry). Assigned before
# `use strict` takes effect, so the package variable needs no declaration here.
$VERSION = '25.01.06';

#------------------------------------------------------------------------------
# Standard pragmas & CPAN modules
#------------------------------------------------------------------------------

use strict;
use HTML::Entities;
use Encode;
use Encode::Byte;
require 5.004;
use utf8;

#use lib ('../helpers');
#use Astro;

# Results are printed to STDOUT encoded as UTF-8.
binmode(STDOUT, ":utf8");

# --------- vars -------------
use constant DEBUG 		=> 0;			# 0: off, 1: print the guessed charset of each file to STDERR
use constant USE_CLEAN 	=> 1;			# 0: guess charset and decode each file, 1: read files as UTF-8 as-is

# Command line: [base [dirBase]]
#   base    - sub-directory of the corpus to scan (default "28")
#   dirBase - corpus prefix, e.g. "train" -> "trainkorpus" (default "train")
# An explicit defined/length test is used instead of `||` so that an
# argument of "0" is accepted rather than replaced by the default.
my $base    = ( defined $ARGV[0] && length $ARGV[0] ) ? $ARGV[0] : "28";
my $dirBase = ( defined $ARGV[1] && length $ARGV[1] ) ? $ARGV[1] : "train";
my $dir = "../../korpus/clean/".$dirBase."korpus/".$base."/";


my $filename;			# name of the directory entry currently being processed
my $num_files = 50;		# upper limit of directory entries to process
undef $/;				# slurp mode: <FH> returns the whole file at once

# open dir; report the real OS error ($!) instead of always claiming "not found"
opendir( DH, $dir ) or die "cannot open directory $dir: $!";
print $dir, "\n", "-" x 50, "\n";

# ------ process files -------
# For every entry of the directory opened on DH (at most $num_files of them)
# the file is read, stripped of markup and classified. One line per file is
# printed to STDOUT:
#   IS STAT: XLS     - markup contains "class=xl"
#   IS STAT: TD-NUM  - more than 3 purely numeric table cells that do not form an enumeration
#   IS STAT: PER     - high density of percentages
#   IS STAT: NUM     - high density of numbers
#   NO STAT (...)    - only reported when $base is 28
# A "." is written to STDERR per processed entry as a progress indicator.

# One "nicely" formatted number: optional sign, digits, optional groups of
# three digits separated by space or comma, optional 1-2 decimals.
# Loop-invariant, therefore compiled once outside the loop.
my $niceNumber = qr#-?\d+(?:[ ,]\d{3})*(?:\.\d{1,2})?#;

# explicit defined(): an entry named "0" must not terminate the loop
DIR: while ( defined( $filename = readdir( DH ) ) ) {
#	next unless $filename eq "file11.html";# || $filename eq "file_271g1.html";
	next DIR if $filename =~ /^\./;
	last DIR if $num_files-- == 0;
	print STDERR  ".";

	# ------- calculate features  ----------

	# sub-directories and other non-files cannot be slurped
	my $path = $dir.$filename;
	next DIR unless -f $path;

	# read content; 3-arg open so that characters in the file name are never
	# interpreted as an open mode
	open( FH, "<", $path ) or die "cannot open $path: $!";
	# Only apply the UTF-8 layer to clean files; otherwise the raw bytes
	# must reach decode() below untouched.
	binmode(FH, ":utf8") if USE_CLEAN;
	my $text = do { local $/; <FH> };	# slurp independent of the global $/
	close( FH );
	next DIR unless defined $text;

	# decode
	if (!USE_CLEAN) {
		# NOTE(review): guess_charset is not defined in the visible part of
		# this file; presumably it came from the commented-out helper
		# module - confirm before setting USE_CLEAN to 0.
		my $charset = guess_charset($path, "e");
		print STDERR "(".$charset.") "			if DEBUG;
		$text = decode_utf8( $text ) 			if $charset eq "UTF8";
		$text = decode( 'ascii', $text )		if $charset eq "ASCII";
		$text = decode( 'iso-8859-1', $text )	if $charset eq "ISO-1";
		$text = decode( 'iso-8859-15', $text )	if $charset eq "ISO-15";
		$text = decode( 'cp1252', $text )		if $charset eq "WINDOWS";
		$text = decode( 'MacRoman', $text )		if $charset eq "MAC";
	}

	# remove several tags (usual nonsense + select,input,textarea,form + p):
	# whole head/script/style/... elements, comments, and inline formatting tags
	$text =~s#(?:<(head|script|style|noframes|select|textarea|form)[^>]*>.*?</\1>)|(?:<!--.*?-->)|(?:</?(?:big|blockquote|small|font|b|i|u|em|div|center|span|strong|a|input|o|yle|st1|p)(?:[ :=][^>]+)?/?>)##gsio;

	# address-like vocabulary near the top of the page: skip (address listings)
	next DIR if substr($text,0,2500) =~ /\b(phone|zip|zipcode|road|ave|drive)\b/io;

	if ($text =~ /class=xl/i) {
		print "$filename\t IS STAT: XLS\n";
		next DIR;
	}

	# collapse all whitespace runs to a single blank
	$text =~ s/\s+/ /gio;

	# count table cells that contain nothing but a number (optionally with %)
	my $numTD = 0;		# numeric cells seen so far
	my $contNums = 0;	# cells whose content equals their running index (1, 2, 3, ...)

	while ( $text =~ m#<td[^>]*> ?($niceNumber) ?%?</td>#gio ) {
		$numTD++;
		# string comparison: only a cell written exactly like the counter counts
		$contNums++ if $numTD eq $1;
	}

	if ( $numTD > 3 ) {
		# 4 or more consecutively numbered cells are treated as an
		# enumeration and reported as nothing
		print "$filename\t IS STAT: TD-NUM\n" if $contNums < 4;
		next DIR;
	}

#	$text =~ s#<ol((?!</ol).*)</ol>##gio;

	# strip all remaining tags; very short texts are not classified
	$text =~ s/<[^>]+>//go;
	my $textlength = length( $text ) || 1;
	next DIR if $textlength < 100;

	my $percent = 0;
	$percent++ while $text =~ /\d+ ?%/g;

	my $percent_ratio = $percent / $textlength;
	# with many % signs a lower ratio is sufficient; >= closes the gap the
	# former "< 20 ... > 20" test left at exactly 20
	if ( $percent_ratio > ( $percent >= 20 ? 0.001 : 0.005 ) ) {
		print "$filename\t IS STAT: PER\n";
		next DIR;
	}


	my $number = 0;
	$number++ while $text =~ /\d+/g;

	# in case the numbers are not inside tables
	my $number_ratio = $number / $textlength;
	# with many numbers a lower ratio is sufficient; >= closes the gap the
	# former "< 3000 ... > 3000" test left at exactly 3000
	if (   $number_ratio > ( $number >= 3000 ? 0.04 : 0.09 )
		|| $textlength < 1000 && $number_ratio > 0.03 ) {
		print "$filename\t IS STAT: NUM\n";
		next DIR;
	}
	if ($base == 28) {
		print "$filename\t NO STAT ($number, $number_ratio, $percent, $percent_ratio, $contNums, $textlength)\n";
	}
}
# DH is a directory handle: it must be released with closedir, not close
closedir( DH );
