#!/usr/bin/perl -w

# Detects "nothing" pages: documents with (almost) no real content, e.g. very short pages and error pages


# ------------------------  HISTORY -----------------------
# 13.12.05	new			
# 22.01.06	changed 	dir -> base
# 22.01.06	added 		google
# 25.01.06	added		num, cur && !error -> no error

# Script version: the date (DD.MM.YY) of the newest HISTORY entry.
# Declared with `our` so the package variable is explicit.
our $VERSION = '25.01.06';


#--------------------------------------------------------------
# pragmas & modules

use strict;
use HTML::Entities;
require 5.004;
use utf8;

# all report lines are written to STDOUT as UTF-8
binmode(STDOUT, ":utf8");

# --------- vars -------------
# Command line: [base [dirBase [oneFile]]]

# Returns $value when it is defined and non-empty, otherwise $default.
# Unlike a plain `||`, this keeps the legitimate argument "0"
# (e.g. a base directory or a file that is literally named 0).
sub _arg_or_default {
	my ( $value, $default ) = @_;
	return ( defined $value && length $value ) ? $value : $default;
}

# sub-directory below the corpus set; also decides whether negative
# results are printed (see the end of the file loop)
my $base    = _arg_or_default( $ARGV[0], "40" );
# corpus set directory below ../Korpus/
my $dirBase = _arg_or_default( $ARGV[1], "train" );
# optional: when given, only the directory entry with this name is processed
my $oneFile = _arg_or_default( $ARGV[2], undef );
my $dir     = "../Korpus/" . $dirBase . "/" . $base . "/";

# current directory entry (set by the file loop)
my $filename;
# upper bound for the number of files processed in one run
my $num_files = 50;

# ----------- main -----------
# slurp mode: a readline returns the whole remaining file in one go
undef $/;

# open dir
opendir( DH, $dir ) || die "dir not found! ($dir: $!)";
print $dir, "\n", "-" x 50, "\n";
# The files are opened by their bare readdir() names, so the working
# directory has to be $dir. A failed chdir must be fatal here instead of
# surfacing later as a misleading per-file "does not exist" error.
chdir( $dir ) || die "cannot chdir to $dir: $!";

# ------ process files -------
# For every regular file in the directory: strip the markup, count a few
# signals (form fields, numbers, currency/percent signs, error phrases) and
# report whether the page carries no real content ("IS NOTHING").
# Output columns (tab separated): filename, text length, error score,
# script-error flag, verdict.
DIR: while ( defined( $filename = readdir( DH ) ) ) {
	# restrict the run to the single requested file, if one was given
	next DIR if defined( $oneFile ) && $filename ne $oneFile;
	# skip ".", ".." and hidden entries
	next DIR if $filename =~ /^\./;
	# Sub-directories and other non-files cannot be read as a page; skip
	# them before they use up the file budget or get reported as "nothing".
	next DIR unless -f $filename;
	# stop once the file budget is used up
	last DIR if $num_files-- == 0;

	# progress indicator
	print STDERR ".";

	# ------- calculate features  ----------

	# read content
	# Three-argument open with an explicit read mode: the name comes straight
	# from readdir() and must never be interpreted as a mode or a pipe.
	open( my $fh, '<', $filename ) || die "$filename does not exist: $!";
	binmode( $fh, ":utf8" );
	# slurp the whole file, independent of the global record separator
	my $text = do { local $/; <$fh> };
	close( $fh );
	# a failed read yields undef; treat it like an empty page
	$text = "" unless defined $text;

	# junk: drop head/script/style/noframes elements and HTML comments,
	# then collapse every whitespace run into a single blank
	$text =~ s/(?:<(head|script|style|noframes)[^>]*>.*?<\/\1>)|(?:<!--.*?-->)//gsi;
	$text =~ s/\s+/ /g;

	# contains a form? then it is not "nothing" - unless there is an error message.
	# Counted are <input> tags that mention "text" or carry no type attribute.
	my $form = 0;
	while ( $text =~ m/<input ([^>]+)>/gi ) {
		my $attrs = $1;
		$form++ if $attrs =~ /text/i || $attrs !~ /type/i;
	}

	# remove all remaining tags, then resolve the HTML entities
	$text =~ s/<([^>]+?)>//gs;
	$text = decode_entities( $text );

	# pages with more than 6000 characters of plain text are never reported
	my $len = length( $text );
	next DIR if $len > 6000;

	# many numbers? then not "nothing"
	my $num = 0;
	$num++ while $text =~ /\d+/g;

	# percent signs, currencies?
	# (kept as a counting loop: the pattern has capture groups, so a
	# list-context match would not return one element per match)
	my $cur = 0;
	$cur++ while $text =~ /\$|£|€|(&#36;)|(&#163;)|(&#xA[3C];)|(&euro;)|(&pound;)|eur|%/gi;

	# typical error-page phrases
	my $error_text = 0;
	$error_text = 1 if $text =~ m#\W(error|file not found|index of |page not found|could not be found)\W#i;

	# error score: phrase found (+1), a stand-alone number 400-409 found (+1)
	my $error = $error_text;
	$error += 1 if $text =~ /\b40\d\b/;

	# messages of failing server-side scripts
	my $scriptErr = 0;
	$scriptErr = 1 if $text =~ /Stack Trace|Seeing this instead of the website you expected|web server .{0,200}?problem .{0,200}? request/i;

	# form/num/cur and no error message? then the page has content: no output
	next DIR if $form > 0 && $error == 0 && $scriptErr == 0;
	# NOTE(review): this test uses only the phrase flag, not the full error
	# score - presumably because a 40x code is itself a number; confirm.
	next DIR if $num  > 5 && $error_text == 0 && $scriptErr == 0;
	next DIR if $cur  > 2 && $error == 0 && $scriptErr == 0;

	# the length limit for "nothing" grows with the strength of the error signal
	if (   $len < 300
		|| ( $len < 1000 && $error )
		|| ( $len < 5000 && $scriptErr ) )
	{
		print "$filename  \t$len\t$error\t$scriptErr\tIS NOTHING\n";
	} elsif ( $base == 40 ) {
		# negative results are only listed for base 40
		print "$filename  \t$len\t$error\t$scriptErr\tNOT NOTHING\n";
	}

}
closedir( DH );
