#!/usr/bin/perl -w

# erkennt faq


# ------------------------  HISTORY -----------------------
# 10.12.05	new			
# 24.01.06	changed 	dir -> base

$VERSION = '24.01.06';

#--------------------------------------------------------------
# pragmas & modules

use strict;
use HTML::Entities;
use Encode;
require 5.004;
use utf8;

binmode(STDOUT, ":utf8"); 

#use lib ('../helpers');
#use Astro;

# --------- vars -------------
use constant DEBUG		=> 0;			# 0: off, 1: on
use constant USE_CLEAN 	=> 1;			# 0: off, 1: on

my $base = $ARGV[0] || "27";
my $dirBase = $ARGV[1] || "train";
my $oneFile = $ARGV[2] || undef;
my $dir = "../Korpus/".$dirBase."/".$base."/";

my $filename;
my $text = "";
my $num_files = 50;

# --------- regex ------------

# ----------- main -----------
undef $/;

# open dir
opendir( DH, $dir ) || die "dir not found!";
print $dir, "\n", "-" x 50, "\n";
chdir( $dir );

# ------ process files -------
DIR: while ( $filename = readdir( DH ) ) {
	next unless !defined($oneFile) || $filename eq $oneFile;
	next if $filename =~ /^\./;
	last if $num_files-- == 0;
	
	print $filename."\n"	if DEBUG;
	print STDERR ".";
	
	# ------- calculate features  ----------
	my $text = "";
	my $question = 0;
	my $faq = 0;
	my $faqNoLink = 0;
	my $len = 0;
	my $q_words = 0;
	my %interviewer = ();
	my $speakers = 0;
	
	# read content
	open( FH, $filename ) || die "$filename does not exist";
	binmode(FH, ":utf8"); 
	$text = <FH>;
	close( FH );
	
	# decode
	if (!USE_CLEAN) {
		my $charset = guess_charset($dir.$filename, "e");
		print STDERR "(".$charset.") "			if DEBUG;
		$text = decode_utf8( $text ) 			if $charset eq "UTF8";
		$text = decode( 'ascii', $text )		if $charset eq "ASCII";
		$text = decode( 'iso-8859-1', $text )	if $charset eq "ISO-1";
		$text = decode( 'iso-8859-15', $text )	if $charset eq "ISO-15";
		$text = decode( 'cp1252', $text )		if $charset eq "WINDOWS";
		$text = decode( 'MacRoman', $text )		if $charset eq "MAC";
	}
	
	# unfug + a
	$text =~ s/(?:<(head|script|style|noframes)[^>]*>.*?<\/\1>)|(?:<\/?(?:font|b|i|u|em|div|big|blockquote|small|strong|center|span)(?: [^>]+)?>)|(?:<!--.*?-->)//gsi; #|(?:<a[^>]*href[^>]*>.*?<\/a>)
	$text =~ s/&nbsp;/ /gs;
	
	# ergänzen von /bla:.*? /mit <br>
	$text =~ s/(>\s*[a-z. ]{1,30}:[\w., ]+\?\s)/$1<br>/gsi;
	
	my @subs;
	my @orig;
	# process <pre>
	while ( $text =~ m#<pre[^>]*>(.*?)</pre>#gsi ) {
		my $sub = $1;
		my $org = $1;
		$sub =~ s#[\n\r]#<br>#gs;
		push(@subs, $sub);
		push(@orig, as_quotemeta($org));
	}
	for (my $k=0; $k<@subs; $k++) {
		$text =~ s/$orig[$k]/<br>$subs[$k]<br>/;
	}
	$text =~ s/\s+/ /gio;
	
	$text =~ s#<br>|<br ?/>|<br/>|</p>|<p[^>]*>|</div>|</tr>|<li[^>]*>|</ul>|</ol>|</li>#\n#gsio;	#br2nl

	$text = decode_entities( $text );
	$text =~ s/[`´‘’˚']/'/gio;
	$text =~ s/[“”]/"/gio;

	my $testText = $text;
	$testText =~ s#<a[^>]*href[^>]*>.*?<\/a>##gsi;
	$testText =~ s/<([^>]+?)>//gs;
	$faqNoLink++ if $testText =~ m#\b(faqs?|q&a|Frequently Asked Questions)\b#iso;
	
	# form
	my $form = 0;
	while ( $text =~ m#<input ([^>]*)>#gio ) {
		$_ = $1;
		$form++ if m#text# || $_ !~ m#type#;
	}
	
	$text =~ s/<([^>]+?)>//gs;
	$len = length( $text );
	next DIR if $len > 500000 && $len < 2000;
	
	$faq++ if $text =~ m#\b(faqs?|q&a|Frequently Asked Questions)\b#iso;

	# ? am zeilenende, : am anfang
	my @lines = split /\n/, $text;

	my $num_lines = 0;
	foreach (@lines) {
		s/\s+/ /gs;
		next if m#^\s*$#;
		#print "*$_\n" if DEBUG;
		$num_lines++;
		#print "*$_ :: ",length($_),"\n" if DEBUG;
		if (length($_) < 200 && $_ =~ /(^.*?\?\s*$)/io) {
			my $q = $1;
			#if ($q =~ /^\s*[.a-z_-].*?[a-z]/i) {
				print "Q:$q\n" if DEBUG;
				$question++;
				if ($q =~ m#who|when|where|what|how|why|which|whom|can i|do i|may i|could i#iso) {
					print "--> $&\n" if DEBUG;
					$q_words++;
				}
			#}
		}
		if (length($_) < 1000 && $_ =~ /^([^:]{1,30}?):(.*)/o) {
			my $name = $1;
			if ( defined $name && ($name !~ /^\s*(q|a|question|answer|subject)\s*$|http$|from$|date$/io ) ) {
				print "C:$_\n" if DEBUG;
				$interviewer{lc( $name )}++;
			}
		}
	}
	
	# name vor ":" kommt min 2 mal vor
	while ( my($name, $num) = each %interviewer ) {
		$speakers++ if $num > 1;
	#	print " * $name * \n" if $num > 1;
	}
	
	# normalize, classify, output
	$num_lines = 1 if $num_lines == 0;
	
	my $isFAQ = 0;
	$isFAQ += 2*$question/$num_lines;
	my $pp	= "NAN";	#avoid div 0 in print
	if ($question > 0) {
		$pp = $q_words/$question;
		$isFAQ += $q_words/$question;
	}
	$isFAQ++	if $faq > 0;
	$isFAQ++	if $faqNoLink > 0;
	
	$form = sprintf "%.3f", $form/$num_lines;
	
	# filter
	if ( $isFAQ > 1.7 && $len < 500000 && $len > 2000 && $question > 0 && $speakers < 2 && $form < 0.1 ) { 
		print "$filename  \t",$pp,"\t$faq\t$faqNoLink\t",$question/$num_lines ,"\t$speakers,\t$len\t$form\tIS FAQ\n";
	} elsif ( $base == 27 ) {
		print "$filename  \t",$pp,"\t$faq\t$faqNoLink\t",$question/$num_lines ,"\t$speakers,\t$len\t$form\tNO FAQ\n";
	}

	print "\n" if DEBUG;
}
closedir( DH );


sub as_quotemeta {
	my $t = shift;
	$t =~ s/\\/\\\\/gs;
	$t =~ s/\[/\\\[/gs;
	$t =~ s/\]/\\\]/gs;
	$t =~ s/\{/\\\{/gs;
	$t =~ s/\}/\\\}/gs;
	$t =~ s/\?/\\\?/gs;
	$t =~ s/\+/\\\+/gs;
	$t =~ s/\*/\\\*/gs;
	$t =~ s/\./\\\./gs;
	$t =~ s/\-/\\\-/gs;
	$t =~ s/\$/\\\$/gs;
	$t =~ s/\^/\\\^/gs;
	$t =~ s/\(/\\\(/gs;
	$t =~ s/\)/\\\)/gs;
	$t =~ s#/#\/#gs;
	return $t;
}
