#!/usr/bin/perl -w

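# Detects interviews: scans the HTML files of one corpus directory and prints,
# per file, the collected heuristics plus an IS INTERVIEW / NO INTERVIEW verdict.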


# ------------------------  HISTORY -----------------------
# 08.12.05	new			
# 24.01.06	changed 	dir -> base
# 27.01.06	added		use utf8

# Minimum perl version, checked at compile time before anything else is
# loaded. The Encode module and the ":utf8" I/O layers used by this script
# need perl 5.8; a run-time `require 5.004` placed after the `use` lines
# would only execute once those modules had already been compiled.
use 5.008;

# Script version (dd.mm.yy). Declared with `our` so the assignment stays
# valid regardless of where `use strict` is placed.
our $VERSION = '24.01.06';

#--------------------------------------------------------------
# pragmas & modules

use strict;
use HTML::Entities;		# decode_entities()
use Encode;				# decode(), decode_utf8()
use utf8;				# this source file contains UTF-8 literals (quote characters)

# Everything printed to STDOUT is emitted as UTF-8.
binmode(STDOUT, ":utf8"); 

#use lib ('../helpers');
#use Astro;

# --------- vars -------------
use constant DEBUG		=> 0;			# 0: off, 1: on (extra trace output)
use constant USE_CLEAN 	=> 1;			# 0: run the charset-guessing decode step, 1: skip it

# Command line: [base] [dirBase] [oneFile]
#   base     sub-directory below the corpus split   (default "19")
#   dirBase  corpus split                           (default "train")
#   oneFile  restrict the run to this single file   (default: all files)
# An argument counts as "given" when it is defined and non-empty, so a
# literal "0" is accepted as a value instead of silently becoming the default.
my $base    = ( defined $ARGV[0] && length $ARGV[0] ) ? $ARGV[0] : "19";
my $dirBase = ( defined $ARGV[1] && length $ARGV[1] ) ? $ARGV[1] : "train";
my $oneFile = ( defined $ARGV[2] && length $ARGV[2] ) ? $ARGV[2] : undef;
my $dir = "../Korpus/".$dirBase."/".$base."/";

# regexp
# Upper-case screenplay keywords (time/place markers and dialogue continuation
# markers) and lower-case stage directions; used to tell scripts from interviews.
my $timeExp = qr/(AFTERNOON|DUSK|DAWN|SUNNY|RAINY|SNOW|LATER|SIMULTANEOUSLY|EXT\.|INT\.)/;
my $contExp = qr/(CONT\.|CONT'D|VO|V\.O\.|CONTINUED)/;
my $regieExp = qr/fade|dissolve|blackout|black screen|pause/i;

my $filename;				# current directory entry
my $text = "";
my $num_files = 50;			# upper bound on the number of files handled per run

# --------- regex ------------

# ----------- main -----------
# Slurp mode for the whole script: a read from a file handle returns the
# complete file as one string.
undef $/;

# open dir
opendir( DH, $dir ) || die "dir not found! ($dir: $!)";
print $dir, "\n", "-" x 50, "\n";
# The file loop opens directory entries by their bare name, so the corpus
# directory has to become the current directory; stop if that fails.
chdir( $dir ) || die "cannot chdir to $dir: $!";

# ------ process files -------
# Walk the open directory handle DH and classify each file.
#
# Per file: slurp the content, reduce the HTML to plain text, count line-based
# features (question lines, "speaker:" lines, short answer words, ...) and
# print one tab-separated record:
#   filename, question, colon, main_int, len, okaywords, interview, faq,
#   [regie,] verdict
# Files judged to be interviews are always printed; all other files are
# printed only when $base == 19.
DIR: while ( defined( $filename = readdir( DH ) ) ) {
	next DIR unless !defined($oneFile) || $filename eq $oneFile;
	next DIR if $filename =~ /^\./;
	# Sub-directories cannot be slurped; skip them before they use up quota.
	next DIR if -d $filename;
	last DIR if $num_files-- == 0;
	
	print $filename."\n"	if DEBUG;
	print STDERR ".";
	
	# ------- calculate features  ----------
	my $question = 0;			# lines that look like a question
	my $colon = 0;				# lines of the form "name: utterance"
	my $colon_pronoun = 0;		# ... whose utterance contains a 1st/2nd person pronoun
	my %interviewer = ();		# lower-cased speaker label => number of lines
	my $okaywords = 0;			# occurrences of short answer words
	
	# read content
	# Three-argument open: characters in the file name can never be taken
	# for an open mode.
	open( my $fh, '<', $filename ) || die "$filename does not exist: $!";
	binmode( $fh, ":utf8" ); 
	my $text = do { local $/; <$fh> };
	close( $fh );
	$text = "" unless defined $text;
	
	# decode
	# NOTE(review): guess_charset() is not defined in the visible source
	# (presumably it comes from the commented-out Astro helper), and the
	# handle above already carries a :utf8 layer - confirm before setting
	# USE_CLEAN to 0.
	if (!USE_CLEAN) {
		my $charset = guess_charset($dir.$filename, "e");
		print STDERR "(".$charset.") "			if DEBUG;
		$text = decode_utf8( $text ) 			if $charset eq "UTF8";
		$text = decode( 'ascii', $text )		if $charset eq "ASCII";
		$text = decode( 'iso-8859-1', $text )	if $charset eq "ISO-1";
		$text = decode( 'iso-8859-15', $text )	if $charset eq "ISO-15";
		$text = decode( 'cp1252', $text )		if $charset eq "WINDOWS";
		$text = decode( 'MacRoman', $text )		if $charset eq "MAC";
	}
	
	# Remove whole junk elements (head, script, style, noframes, a, form, ol),
	# bare inline formatting tags and HTML comments.
	$text =~ s/(?:<(head|script|style|noframes|a|form|ol)[^>]*>.*?<\/\1>)|(?:<\/?(?:font|b|i|u|em|div|big|blockquote|small|strong|center|span)(?: [^>]+)?>)|(?:<!--.*?-->)//gsi;
	$text =~ s/&nbsp;/ /g;
	$text =~ s/\s+/ /g;
	
	# Experiment: put a <br> behind a short "label: ...? " span that directly
	# follows a tag, so the question ends up at the end of its own line.
	$text =~ s/(>\s*[a-z. ]{1,30}:[\w., ]+\? )/$1<br>/gsi;
	
	# Line-breaking tags become newlines (br2nl).
	$text =~ s#<br>|<br ?/>|<br/>|</p>|<p[^>]*>|</div>|</tr>#\n#gsi;
	
	# Resolve entities, drop all remaining tags, normalise typographic
	# quotes and the middle dot to their ASCII counterparts.
	$text = decode_entities( $text );
	$text =~ s/<([^>]+?)>//gs;
	$text =~ s/[`´‘’˚']/'/g;
	$text =~ s/[“”]/"/g;
	$text =~ s/·/./g;
	
	my $len = length( $text );
	next DIR if $len > 150000;
	
	# Document-level cues: interview vocabulary present / FAQ vocabulary present.
	my $interview = 0;
	$interview++ if $text =~ m#\b(interview|conversation|explained|told|discussed)\b#i;
	my $faq = 0;
	$faq++ if $text =~ m#\W(faq|q&a)\W#i;

	# "?" at the end of a line, ":" near its beginning
	my @lines = split /\n/, $text;
	my $num_lines = 0;
	foreach my $line (@lines) {
		$line =~ s/\s+/ /g;
		next if $line =~ m#^\s*$#;
		$num_lines++;
		$okaywords++
			while $line =~ m#\b(exactly|yes|indeed|no|yeah|not really|why not|OK|okay|don't think so|right)\b#gi;
		
		# Very long lines are ignored for the two line-shape tests below.
		next unless length($line) < 1000;
		
		# NOTE(review): for the "..." alternative $1 holds only the dots and
		# trailing whitespace, so the letter test below can never succeed -
		# lines ending in an ellipsis are therefore not counted. Confirm
		# whether that is intended.
		if ( $line =~ /(^.*?\?\s*$|\.{3}\s*$)/ ) {
			my $q = $1;
			if ( $q =~ /^\s*[.a-z_-].*?[a-z]/i ) {
				print "Q:$q\n" if DEBUG;
				$question++;
			}
		}
		
		# "name: utterance" with a digit-free name of at most 30 characters.
		if ( $line =~ /^(\D{1,30}?):(.*)/ ) {
			my ( $name, $first ) = ( $1, $2 );
			if ( $first =~ /^[\sa-z'",.;:!?-]+$/i || $first =~ /\?/ ) {
				print "C:$line\n" if DEBUG;
				$interviewer{ lc( $name ) }++;
				$colon_pronoun++ if $first =~ /\b(i|me|my|we|our|you|your)\b/i;
				$colon++;
			}
		}
	}
	# Guard the ratios below against division by zero.
	$num_lines = 1 if $num_lines == 0;
	$colon = 1 if $colon == 0;
	
	my $num_int = keys %interviewer;							# distinct speaker labels
	my $main_int = grep { $interviewer{$_} > 1 } keys %interviewer;	# labels seen more than once
	
	my $q_ratio = $question / $num_lines;
	my $dialogue_shape =
		   ( $q_ratio > 0.12 && $question > 1 )
		|| ( $question > 15 && $q_ratio > 0.05 )
		||   $question > 20
		|| ( $colon / $num_lines > 0.1 && $colon_pronoun / $colon > 0.4 && $main_int > 1 );
	
	if ( $dialogue_shape && $num_int < 10 && $okaywords > 1 && $interview == 1 && $len < 150000 && $faq == 0 ) {
		
		# Is it a screenplay?
		my $regie = 0;
		
		# Upper-case screenplay keywords as stand-alone tokens. The keywords
		# are matched directly: several of them end in "." or contain "'",
		# which a \b-delimited [A-Z.]+ token can never include.
		$regie++ while $text =~ /(?<!\w)(?:$timeExp|$contExp)(?!\w)/g;
		
		# Stage directions inside (...) or [...] on a single line.
		foreach my $aside ( $text =~ /\((.*?)\)/g, $text =~ /\[(.*?)\]/g ) {
			next if length($aside) > 500;
			$regie++ if $aside =~ /(?:^|\W)$regieExp(?:\W|$)/;
		}
		
		if ( $regie < 2 ) {
			print "$filename  \t$question\t$colon\t$main_int\t$len\t$okaywords\t$interview\t$faq\t$regie \tIS INTERVIEW\n";
		} elsif ( $base == 19 ) {
			print "$filename  \t$question\t$colon\t$main_int\t$len\t$okaywords\t$interview\t$faq\t$regie \tNO INTERVIEW\n";
		}
	} elsif ( $base == 19 ) {
		print "$filename  \t$question\t$colon\t$main_int\t$len\t$okaywords\t$interview\t$faq\tNO INTERVIEW\n";
	}
	print "\n" if DEBUG;
}
closedir( DH );
