#!/usr/bin/perl -w

# Detects pages that contain a form (cf. summary; the page only has to _contain_ a form!): >= 3 elements, at least 2 of them input fields


# ------------------------  HISTORY -----------------------
# 02.11.05	new			created from find_katalog.pl, version 02.11.
#			deleted		reg-exp for keywords (name, state, zip, phone...)
# 23.11.05	added		Astro::guess_charset, USE_CLEAN
# 02.12.05	changed		output
# 17.01.05	changed		google
# 24.01.06	changed 	dir -> base

# Script version: date of the latest HISTORY entry (DD.MM.YY).
our $VERSION = '24.01.06';

#------------------------------------------------------------------------------
# Standard pragmas & CPAN modules
#------------------------------------------------------------------------------

use strict;
use HTML::Entities;
use Encode;
use Encode::Byte;
require 5.004;
use utf8;

# Results are printed as UTF-8.
binmode STDOUT, ':utf8';

#use lib ('../helpers');
#use Astro;

# --------- vars -------------
# Compile-time switches (0: off, 1: on).
use constant {
	DEBUG     => 0,		# print the guessed charset of each file to STDERR
	USE_CLEAN => 1,		# skip the charset guessing / re-decoding step
};

# Command line: [base [dirBase [oneFile]]]; a missing or false argument
# falls back to its default.
my ( $base, $dirBase, $oneFile ) = @ARGV;
$base    ||= "38";
$dirBase ||= "train";
$oneFile ||= undef;		# undef: process every file in the directory

# Directory that is scanned, e.g. "../Korpus/train/38/" (trailing slash included).
my $dir = join( "/", "..", "Korpus", $dirBase, $base, "" );

my $filename;			# directory entry currently being processed
my $num_files = 20;		# upper bound on the number of files processed per run
$/ = undef;				# slurp mode: <FH> returns the whole file at once

# --------- regex -----------
# Modifiers are part of each qr// object: flags written on the match or
# substitution that later interpolates one of these patterns do NOT reach
# inside it.  Every pattern whose "." has to cross line breaks therefore
# carries /s here.

# One <input ...> or <textarea ...> tag.  If the tag has a type attribute,
# $2 holds its value ($1 spans from "type=" up to the closing ">");
# otherwise $2 is undefined.
my $input_exp = qr#(?:<input[^>]*(type=["']?([^"' >]+)["']?[^>]*>)|(?:<input[^>]*>)|(?:<textarea[^>]*>))#i;

# One <select> element; $1 is everything between the opening and closing tag.
my $select_exp = qr/<select[^>]*>(.*?)<\/select>/is;

# One <form> element; $1 is its content.  An unclosed form is taken to end at </body>.
my $form_exp = qr/<form[^>]*>(.*?)<\/(?:form|body)>/is;

# Markup that is deleted before any analysis:
#   1. <head>, <script>, <style>, <noframes> elements including their content.
#      Only the tag NAME is captured, so an opening tag with attributes
#      (<script type="...">) still pairs with its plain closing tag.
#   2. purely presentational tags (the tag itself only, content is kept)
#   3. HTML comments
my $garbage = qr{
	  (?: < (head|script|style|noframes) \b [^>]* > .*? </ \1 \s* > )
	| (?: </? (?: (?:font[^>]*?) | b | i | u | blink | (?:div[^>]*?) | (?:label[^>]*?) | center | strong | (?:span[^>]*?) ) > )
	| (?: <!-- .*? --> )
}isx;

# ------ open dir -------
opendir( my $dh, $dir ) or die "$dir not found! ($!)";
print $dir, "\n", "-" x 50, "\n";

# ------ process files -------
# For every regular file in $dir (at most $num_files of them, or only
# $oneFile if that is set) the HTML is normalised, its <form> elements are
# analysed and one result line is printed to STDOUT:
#   IS FORM          form share of the page text > 0.3 and a form with at
#                    least 2 text inputs and at least 3 elements
#                    (text inputs + selects)
#   IS FORM (small)  at least one text input on a page with < 500 characters of text
#   NO               everything else (only printed for base 38)
# A "." per processed file is written to STDERR as progress indicator.
DIR: while ( defined( $filename = readdir( $dh ) ) ) {	# defined(): a file named "0" must not end the loop
	next if defined($oneFile) && $filename ne $oneFile;
	next if $filename =~ /^\./;

	my $path = $dir.$filename;
	next unless -f $path;				# skip subdirectories and other non-files
	last if $num_files-- == 0;
	print STDERR ".";

	# ------- read content ----------
	# NOTE(review): ':utf8' does not validate the input; ':encoding(UTF-8)'
	# would, but might change results for malformed corpus files.
	open( my $fh, '<:utf8', $path ) or die "$filename does not exist: $!";
	my $text = do { local $/; <$fh> };	# slurp the whole file
	close( $fh );
	$text = "" unless defined $text;

	# decode
	# NOTE(review): guess_charset() is not defined in this file (the HISTORY
	# mentions Astro::guess_charset, but 'use Astro' is commented out), so
	# this branch dies at runtime as soon as USE_CLEAN is switched off.
	if (!USE_CLEAN) {
		my $charset = guess_charset($path, "e");
		print STDERR "(".$charset.") "			if DEBUG;
		$text = decode_utf8( $text ) 			if $charset eq "UTF8";
		$text = decode( 'ascii', $text )		if $charset eq "ASCII";
		$text = decode( 'iso-8859-1', $text )	if $charset eq "ISO-1";
		$text = decode( 'iso-8859-15', $text )	if $charset eq "ISO-15";
		$text = decode( 'cp1252', $text )		if $charset eq "WINDOWS";
		$text = decode( 'MacRoman', $text )		if $charset eq "MAC";
	}

	# ------- normalise ----------
	$text = decode_entities( $text );
	$text =~ s/[`´‘’˚']/'/g;			# unify apostrophe look-alikes
	$text =~ s/[“”]/"/g;				# unify typographic double quotes

	# remove text in <script>, <head>, <style>, remove <font>, <b>, <i>, comments
	# (modifiers written here would not apply inside the precompiled qr// pattern)
	$text =~ s/$garbage//g;
	$text =~ s/&nbsp;/ /gi;
	$text =~ s/\s+/ /g;

	# ------- calculate features  ----------
	my $input      = 0;		# largest number of text inputs found in a single form
	my @select     = ();	# option count of every <select> in that form
	my $formLength = 0;		# weighted size of all forms, later the form/page ratio
	my $isForm     = 0;		# 1 as soon as a form satisfied the element criteria

	# search forms
	while ( $text =~ /$form_exp/g ) {
		my $form = $1;

		# text-fields: type="text" is default, so a missing type counts as well
		my $input_tmp = 0;
		while ( $form =~ /$input_exp/g ) {
			my $type = $2;
			$input_tmp++	if !defined $type || $type =~ /^t|^pass/i;
		}

		# remember the form with the most text inputs
		if ($input_tmp > $input) {
			$input = $input_tmp;
			@select = ();
			# select with number of options
			while ( $form =~ /$select_exp/g ) {
				my $options = $1;
				my $count = () = $options =~ /<option[^>]*>/gi;
				push( @select, $count );
			}
		}

		# visible form text plus a bonus per input / select of the best form so far
		$form =~ s/<[^>]+>//g;
		$formLength += length( $form ) + 15*$input + 10*@select;
		if ( $input > 1 && $input + @select > 2 ) {
			$isForm = 1;
		}
	} # end form-loop

	# ------- classify ----------
	$text =~ s/<[^>]+>//g;
	my $length = length( $text );
	# A page without any text left must not cause a division by zero;
	# the denominator is clamped to 1 in that case.
	$formLength = sprintf( "%.2f", $formLength / ( $length || 1 ) );
	if ( $formLength > 0.3 && ($input > 1 && $input + @select > 2 || $isForm == 1) ) {
		print "$filename  \tIS FORM: $formLength\n" ;
		next DIR;
	}

	# google-special
	if ($input > 0 && $length < 500) {
		print "$filename  $input, ",$length,"\tIS FORM (small): $formLength\n" ;
	} elsif ($base =~ /\A\d+\z/ && $base == 38) {	# numeric test only for numeric directory names
		print "$filename  $input, ",@select+0,", $length\tNO : $formLength\n" ;
	}

}
closedir( $dh );

