#!/usr/bin/perl -w

# erkennt drehbuch

# ------------------------  HISTORY -----------------------
# 15.12.05	new			from find_glossary
# 24.01.06	changed 	dir -> base

$VERSION = '24.01.06';

#--------------------------------------------------------------
# pragmas & modules

use strict;
use HTML::Entities;
use Encode;
require 5.004;
use utf8;

binmode(STDOUT, ":utf8"); 

#use lib ('../helpers');
#use Astro;

# --------- vars -------------
use constant DEBUG		=> 0;			# 0: off, 1: on
use constant USE_CLEAN 	=> 1;			# 0: off, 1: on

my $base = $ARGV[0] || "16";
my $dirBase = $ARGV[1] || "train";
my $oneFile = $ARGV[2] || undef;
my $dir = "../Korpus/".$dirBase."/".$base."/";
my $dirT = "../Korpus/tagged_".$dirBase."/".$base."/";

my $filename;
my $text = "";
my $num_files = 50;
my $archaicExp = initArch();
my $nameExp = initNames();
$nameExp .= "|HERMES|TRYGAEUS|LEADER|SERVANT|DAUGHTER|HIEROCLES|SON|CITIZEN";
my $timeExp = qr/(AFTERNOON|DUSK|DAWN|SUNNY|RAINY|SNOW|LATER|SIMULTANEOUSLY|EXT\.|INT\.)/;
my $contExp = qr/(CONT\.|CONT'D|VO|V\.O\.|CONTINUED)/;
my $regieExp = qr/fade|dissolve|blackout|black screen|pause/i;

# ----------- main -----------
undef $/;

# open dir
opendir( DH, $dir ) || die "dir not found!";
print $dir, "\n", "-" x 50, "\n";

# ------ process files -------
DIR: while ( $filename = readdir( DH ) ) {
	next unless !defined($oneFile) || $filename eq $oneFile;
	next if $filename =~ /^\./;
	last if $num_files-- == 0;
	
	print $filename."\n"	if DEBUG;
	print STDERR ".";
	
	# ------- calculate features  ----------
	my $text = "";
	my $time = 0;
	my $name = 0;
	my $cont = 0;
	my $pre = 0;
	my $len = 0;
	my $regie = 0;
	my $ing = 0;
	my $archaic = 0;
	my $colon = 0;
	my $script = 0;
	my $num = 0;
	my $numR = 0;
	my $colonR = 0;
	my $nameR = 0;
	my $verb = 0;
	
	# read content
	open( FH, $dir.$filename ) || die "$filename does not exist";
	binmode(FH, ":utf8"); 
	$text = <FH>;
	close( FH );
	my $filePrint = substr($filename, 0, 10);

	# unfug + select
	$text =~ s/(?:<(head|script|style|noframes|select)[^>]*>.*?<\/\1>)|(?:<\/?(?:font|i|u|em|div|big|blockquote|small|center|span)(?: [^>]+)?>)|(?:<!--.*?-->)//gsi;
	
 	while ($text =~ /<pre[^>]*>/go) {
 		$pre++;
 	}
	
	$text =~ s/<\s*([^>]+?) [^>]*>/<$1>/gs;
	$text = decode_entities( $text );
	$text =~ s/[`´‘’˚']/'/gio;
	$text =~ s/[“”]/"/gio;
	
	my @subs;
	my @orig;
	# process <pre>
	while ( $text =~ m#<pre[^>]*>(.*?)</pre>#gsi ) {
		my $sub = $1;
		my $org = $1;
		$sub =~ s#[\n\r]#<br>#gs;
		push(@subs, $sub);
		push(@orig, as_quotemeta($org));
	}
	for (my $k=0; $k<@subs; $k++) {
		$text =~ s/$orig[$k]/$subs[$k]/;
	}
	my @lines = split /<br>|<p>|<\/p>|<\/div>|<\/tr>|<tr>|<br ?\/>/i, $text;

	$text =~ s/\s+/ /gio;
	$text =~ s/<[^>]+?>//gs;
	
	# laenge
	$len = length( $text );
	
	if ($len < 2500) {
		print "$filePrint NO DREHBUCH $len\n" if $base == 16;
		next DIR;
	}
	
	# pos
	my $ft = $dirT."/".$filename;
	$verb = `cat $ft | grep -c '\\WV'`;
	$verb = sprintf "%.3f", 1000*$verb/$len;
	next DIR if $verb < 15;	
	
	# "script" im ersten viertel
	my $limit = $len < 1000 ? $len : 1000;
	my $text1 = substr($text, 0, $limit);
	while ($text1 =~ /([a-z]{3,})/gi) {
		$script++  if $1 =~ m/^(scene|act|cast|roles|characters|script|drama|theater|screenplay)$/io;
	}
	
	foreach ( @lines ) {
		# colon
		$colon++ if m/^[a-z ]{2,20}:/io;
		
		# name, time, cont in GROSS
		while ($_ =~ /\b([A-Z][A-Z.]{1,})\b/go) {
			my $m = $1;
			$name++	if $m =~ /^$nameExp$/;
			$time++	if $m =~ /^$timeExp$/;
			$cont++	if $m =~ /^$contExp$/;
		}
		
		# -ing, -ly und regie in klammern (eckig und rund)
		while ($_ =~ /\((.*?)\)/gio) {
			my $m = $1;
			next if length($m) > 500;
			$ing++ if $m =~ /ing[ .]?$|ly[ .]?$/o;
			$regie++ if $m =~ /(^|\W)$regieExp($|\W)/io;
		}
		while ($_ =~ /\[(.*?)\]/gi) {
			my $m = $1;
			next if length($m) > 500;
			$ing++ if $m =~ /ing[ .]?$|ly[ .]?$/;
			$regie++ if $m =~ /(^|\W)$regieExp($|\W)/io;
		}
		
 		# numbers
 		while ( $_ =~ /\d+/g ) {
 			$num++;
 		}

		# norm
		$numR = sprintf "%.3f", 1000*$num/$len;
		$colonR = sprintf  "%.3f", 1000*$colon/$len;
		$nameR = sprintf  "%.3f", 1000*$name/$len;
		
		next DIR if $numR > 15;
		
		if ( $cont > 1 || $regie > 20 || $regie > 1 && $ing > 10 && $nameR > 3.5 || $time > 0 && $ing > 3 && $nameR > 3.5 ){
			print "$filePrint IS DREHBUCH (1)\t$cont,\t$regie,\t$ing,\t$nameR,\t$time,\t$len\n";
			next DIR;
		} elsif ( $len < 9000 && $nameR > 3 && $script > 1 ) {
			print "$filePrint IS DREHBUCH (2)\t$script,\t$nameR,\t$len\n";
			next DIR;
		}
	} # end line-loop

	next DIR if $numR > 15;
	
	if ( $nameR > 3.5 && ($colonR < 0.5 || $script > 1) && ($ing > 3 || $regie > 0) ) { # war: ing > 5
		print "$filePrint IS DREHBUCH (3)\t$script,\t$colonR,\t$regie,\t$nameR,\t$ing,\t$len\n";
		next DIR;
	} elsif ($base == 16) {
		print "$filePrint NO DREHBUCH (3)\t$script,\t$pre,\t$verb,\t-----,\t$colonR,\t$regie,\t$nameR,\t$numR,\t$ing,\t$time,\t\t$cont,\t$len\n";
		next DIR;
	}	
	
#	# archaic (erst hier wegen performance) - unnötig
#	while ($text =~ /([a-z]{2,})/gio) {
#		my $m = lc($1);
#		$archaic++ if $m =~ m/^$archaicExp$/io;
#	}
	
}
closedir( DH );

# =================  SUB  ================ #

sub initNames {
	my $reg = "(?:";
	open( L1, "../Wortlisten/names.txt" ) || die $!;
	binmode(L1, ":utf8"); 
	while ( <L1> ) {
		chomp;
		$_ = uc($_);
		$reg .= "$_|";
	}
	chop $reg;
	$reg .= ")";
	close( L1 );
	return $reg;
}
sub initArch {
	my $reg = "(?:";
	open( L1, "../Wortlisten/archaic.txt" ) || die $!;
	binmode(L1, ":utf8"); 
	while ( <L1> ) {
		chomp;
		$reg .= "$_|";
	}
	chop $reg;
	$reg .= ")";
	close( L1 );
	return $reg;
}

sub as_quotemeta {
	my $t = shift;
	$t =~ s/\\/\\\\/gs;
	$t =~ s/\[/\\\[/gs;
	$t =~ s/\]/\\\]/gs;
	$t =~ s/\{/\\\{/gs;
	$t =~ s/\}/\\\}/gs;
	$t =~ s/\?/\\\?/gs;
	$t =~ s/\+/\\\+/gs;
	$t =~ s/\*/\\\*/gs;
	$t =~ s/\./\\\./gs;
	$t =~ s/\-/\\\-/gs;
	$t =~ s/\$/\\\$/gs;
	$t =~ s/\^/\\\^/gs;
	$t =~ s/\//\\\//gs;
	$t =~ s/\(/\\\(/gs;
	$t =~ s/\)/\\\)/gs;
	return $t;
}