#!/usr/bin/perl -w

# erkennt gesetze


# ------------------------  HISTORY -----------------------
# 06.01.06	new
# 24.01.06	changed 	dir -> base

$VERSION = '24.01.06';

#--------------------------------------------------------------
# pragmas & modules

use strict;
use HTML::Entities;
use Encode;
require 5.004;
use utf8;

binmode(STDOUT, ":utf8"); 

#use lib ('../helpers');
#use Astro;

# --------- vars -------------
use constant DEBUG		=> 0;			# 0: off, 1: on
use constant USE_CLEAN 	=> 1;			# 0: off, 1: on

my $base = $ARGV[0] || "25";
my $dirBase = $ARGV[1] || "train";
my $oneFile = $ARGV[2] || undef;
my $dir = "../Korpus/".$dirBase."/".$base."/";

my $filename;
my $text = "";
my $num_files = 50;

# exp
our %adjHash = ();
initAdj();
my $ordExp = qr#first|second|third|fourth|fifth|sixth|seventh|eighth|ninth|\d*1st|\d*2nd|\d*3rd|\d+th#oi;
my $keywordExp = qr#amendments?|constitution|act|proclamation|statutes|code|contract|bill|rules?#io;
my $nameExp = initNames();

# ----------- main -----------
undef $/;

# open dir
opendir( DH, $dir ) || die "dir not found!";
print $dir, "\n", "-" x 50, "\n";

# ------ process files -------
DIR: while ( $filename = readdir( DH ) ) {
	next if $filename =~ /^\./;
	next unless !defined($oneFile) || $filename eq $oneFile;
	last if $num_files-- == 0;

	print STDERR ".";
	print $dir."".$filename.": "	if DEBUG;
	
	# ------- calculate features  ----------
	my $text = "";
	my $length = 0;
	my $digits = 0;
	my $ord = 0;
	my $article = 0;
	my $keywords = 0;
	my $name = 0;
	my $adj = 0;
	my $you = 0;

	# read content
	open( FH, $dir.$filename ) || die "$filename does not exist";
	binmode(FH, ":utf8"); 
	$text = <FH>;
	close( FH );
	
	# decode
	if (!USE_CLEAN) {
		my $charset = guess_charset($dir.$filename, "e");
		print STDERR "(".$charset.") "			if DEBUG;
		$text = decode_utf8( $text ) 			if $charset eq "UTF8";
		$text = decode( 'ascii', $text )		if $charset eq "ASCII";
		$text = decode( 'iso-8859-1', $text )	if $charset eq "ISO-1";
		$text = decode( 'iso-8859-15', $text )	if $charset eq "ISO-15";
		$text = decode( 'cp1252', $text )		if $charset eq "WINDOWS";
		$text = decode( 'MacRoman', $text )		if $charset eq "MAC";
	}
	
	# unfug + select
	$text =~ s/(?:<(head|script|style|noframes|select)[^>]*>.*?<\/\1>)|(?:<\/?(?:font|i|u|em|div|big|blockquote|small|center|span)(?: [^>]+)?>)|(?:<!--.*?-->)//gsi;
	$text =~ tr/´`/''/;
	
	$text =~ s/<([^> ]+?) [^>]*>/<$1>/gs;
	
	my @subs;
	my @orig;
	# process <pre>
	while ( $text =~ m#<pre>(.*?)(</pre>|$)#gsi ) {
		my $sub = $1;
		my $org = $1;
		$sub =~ s#[\n\r]#<br>#gs;
		push(@subs, $sub);
		push(@orig, as_quotemeta($org));
	}
	for (my $k=0; $k<@subs; $k++) {
		$text =~ s/$orig[$k]/$subs[$k]/;
	}
	$text = decode_entities( $text );
	$text =~ s/\s+/ /gsi;
	
	my @lines = split /<br>|<p>|<\/p>|<\/div>|<\/tr>|<tr>|<br ?\/>/i, $text;
	
	# get Features
	while ( $text =~ m#<ol>(.*?)</ol>#gio ) {
		my $m = $1;
		while ( $m =~ m#<li>#gio ) {
			$ord++;
		}
	}
	
	$text =~ s/<[^>]+>//go;
	$length = length($text);
	next DIR if $length == 0;
	
	foreach ( @lines ) {
		s/<[^>]+>//go;
#		print "*** $_\n" if $_ =~ /\d/;
		while ( /\d+/g ) {
			$digits++;
		}
		while ( /^ *([IXV]{2,}|$ordExp|\d+\.|[a-z]\.|\(\d+\)|\([a-z]\)|\([IXV]+\))(?=\W)/gi ) {
			# ordinalzahlen, III, (a), (2)
			$ord++;
		}
		while ( / *(article|section|art\.|sec\.) ?(\d+|[a-z]|[IXV]{2,}|$ordExp)(?=\W)/gi ) {
			$article++;
		}
		while ( /([a-z]{2,})/gi ) {
			my $m = $1;
			$keywords++ if $m =~ /$keywordExp/io;
			$name++ 	if $m =~ /[A-Z]/o && $m =~ /$nameExp/o;
			$you++		if $m =~ /you|your|yourself|yourselves/io;
			$adj++		if exists $adjHash{lc($m)};
		}
	}
	$digits = sprintf "%.3f", 1000*$digits/$length;
	$ord = sprintf "%.3f", 1000*$ord/$length;
	$article = sprintf "%.3f", 1000*$article/$length;
	$keywords = sprintf "%.3f", 1000*$keywords/$length;
	$name = sprintf "%.3f", 1000*$name/$length;
	$adj = sprintf "%.3f", 1000*$adj/$length;
	$you = sprintf "%.3f", 1000*$you/$length;
	
	if ( $digits > 1 && $ord+2*$article > 2.5 && $keywords > 0.5 && $name < 9 && $adj < 25 && $you < 0.5) {
		print "$filename   \t$digits, $ord, $article, $keywords, $name, $adj, $you, $length IS LAW\n";
	} elsif ( $base == 25 ) {
		print "$filename   \t$digits, $ord, $article, $keywords, $name, $adj, $you, $length NO LAW\n";
	}
}

closedir( DH );

# ================  FUN  ================ #

sub initNames {
	my $reg = "(?:";
	open( L1, "../Wortlisten/names.txt" ) || die $!;
	binmode(L1, ":utf8"); 
	while ( <L1> ) {
		chomp;
		$reg .= "$_|";
	}
	close( L1 );
	chop $reg;
	$reg .= ")";
	return $reg;
}

sub initAdj {
	open( L1, "../Wortlisten/adjectives_neg.txt" ) || die $!;
	binmode(L1, ":utf8"); 
	while ( <L1> ) {
		chomp;
		$adjHash{lc($_)} = 1;
	}
	open( L1, "../Wortlisten/adjectives_pos.txt" ) || die $!;
	binmode(L1, ":utf8"); 
	while ( <L1> ) {
		chomp;
		$adjHash{lc($_)} = 1;
	}
 	open( L1, "../Wortlisten/adjectives_neut.txt" ) || die $!;
 	binmode(L1, ":utf8"); 
 	while ( <L1> ) {
 		chomp;
		$adjHash{lc($_)} = 1;
 	}
	close( L1 );
}

sub as_quotemeta {
	my $t = shift;
	$t =~ s/\\/\\\\/gs;
	$t =~ s/\[/\\\[/gs;
	$t =~ s/\]/\\\]/gs;
	$t =~ s/\{/\\\{/gs;
	$t =~ s/\}/\\\}/gs;
	$t =~ s/\?/\\\?/gs;
	$t =~ s/\+/\\\+/gs;
	$t =~ s/\*/\\\*/gs;
	$t =~ s/\./\\\./gs;
	$t =~ s/\-/\\\-/gs;
	$t =~ s/\$/\\\$/gs;
	$t =~ s/\^/\\\^/gs;
	$t =~ s/\//\\\//gs;
	$t =~ s/\(/\\\(/gs;
	$t =~ s/\)/\\\)/gs;
	return $t;
}