#!/usr/bin/perl -w

# detects instructional texts (how-tos, tutorials, recipes)

# ------------------------  HISTORY -----------------------
# 07.01.06	new			created from find_literatur.pl, version 07.01.06
# 24.01.06	debug

# Script version: date of the last change (matches the newest history entry).
# Declared with "our" so the package variable is a deliberate global and does
# not draw a "used only once" warning under -w.
our $VERSION = '24.01.06';

#------------------------------------------------------------------------------
# Standard pragmas & CPAN modules
#------------------------------------------------------------------------------

use strict;
use HTML::Entities;
use Encode;
use Encode::Byte;
require 5.004;
use utf8;

#use lib ('../helpers');
#use Astro;

# All regular output is written as UTF-8.
binmode(STDOUT, ":utf8"); 

# --------- settings -------------
use constant {
	DEBUG     => 0,		# 0: off, 1: on  (extra diagnostics while processing)
	USE_CLEAN => 1,		# 0: off, 1: on  (on: the charset-guessing decode step is skipped)
};

# Command line: [base [dirBase [oneFile]]]; empty or missing values fall back
# to the defaults below. Without oneFile every file of the directory is used.
my ($base, $dirBase, $oneFile) = @ARGV;
$base    ||= "26";
$dirBase ||= "train";
$oneFile ||= undef;

# Corpus directory and the directory holding the tagged copies of its files.
my $dir  = join( "/", "../Korpus", $dirBase, $base )."/";
my $dirT = join( "/", "../Korpus", "tagged_".$dirBase, $base )."/";

# Loop variable of the directory scan and the maximum number of files to process.
my ($filename, $num_files) = (undef, 50);

# Word-level patterns; they are anchored by the code that applies them.
my $ordExp = qr#first|second|third|fourth|fifth|sixth|seventh|eighth|ninth|\d*1st|\d*2nd|\d*3rd|\d+th|\d{1,2}\.#oi;
my $measureExp = qr#tablespoons?|cups?|teaspoons?|oz|tbsp|tsp#io; ## incomplete
my $headlineExp = qr#how to|tutorials?|guidelines?|using|recipes?#io;
my $pro1Exp = qr#I|my|me|myself|mine#io;

# The name list is read line by line, so it has to be loaded BEFORE the
# input record separator is cleared below.
my $nameExp = initNames();

# From here on every <HANDLE> read returns the whole file at once.
$/ = undef;

# open dir
opendir( DH, $dir ) || die "$dir not found: $!";
print $dir, "\n", "-" x 50, "\n";

# ------ process files -------
# Every corpus file is stripped of markup, a set of surface features is
# counted (pronouns, ordinals, measures, ...), three part-of-speech counts are
# taken from the tagged copy of the file, and one result line is printed:
# "IS" when the thresholds below are met, "NO" otherwise (see the end of the loop).
DIR: while ( defined( $filename = readdir( DH ) ) ) {
	next unless !defined($oneFile) || $filename eq $oneFile;
	next if $filename =~ /^\./;
	last if $num_files-- == 0;
	print STDERR ".";
	print $filename.": " if DEBUG;
	
	# ------- calculate features  ----------
	
	# read content
	# Three-argument open: the name comes from the file system and must never
	# be interpreted as an open mode or a pipe.
	my $path = $dir.$filename;
	open( my $in, '<:utf8', $path ) || die "cannot read $path: $!";
	my $text = do { local $/; <$in> };
	close( $in );
	$text = '' unless defined $text;	# nothing readable (e.g. a directory entry)
	
	# decode
	# NOTE(review): guess_charset() is not defined in this file (presumably it
	# lives in the commented-out Astro helper); this branch is inactive while
	# USE_CLEAN is 1.
	if (!USE_CLEAN) {
		my $charset = guess_charset($path, "e");
		print STDERR "(".$charset.") "			if DEBUG;
		$text = decode_utf8( $text ) 			if $charset eq "UTF8";
		$text = decode( 'ascii', $text )		if $charset eq "ASCII";
		$text = decode( 'iso-8859-1', $text )	if $charset eq "ISO-1";
		$text = decode( 'iso-8859-15', $text )	if $charset eq "ISO-15";
		$text = decode( 'cp1252', $text )		if $charset eq "WINDOWS";
		$text = decode( 'MacRoman', $text )		if $charset eq "MAC";
	}
	
	# remove several tags (usual nonsense + select,input-hidden)
	$text =~ s/\s+/ /gsio;
	$text =~ s/<input[^>]*(hidden|submit|reset|radio|checkbox)[^>]*>//gsio;
	# drop all attributes: <tag foo="bar"> becomes <tag>
	$text =~ s/<(\w+)[ :][^>]*>/<$1>/gio;
	# drop head/script/style/noframes/select including their content,
	# inline formatting tags and comments
	$text =~s#(?:<(head|script|style|noframes|select)[^>]*>.*?<\/\1>)|(?:</?(?:font|b|i|u|em|div|center|span|strong|a|o|yle|st1|![^>]+)(?:[ :=][^>]+)?/?>)|(?:<!--.*?-->)##gsio;
	$text =~ s/&nbsp;/ /gsoi;
	$text =~ s/\s+/ /gsio;
	$text = decode_entities( $text );
	# normalise typographic quotes
	$text =~ s/[`´‘’˚']/'/gio;
	$text =~ s/[“”]/"/gio;
	
	
	# get Features
	my $length = 0;
	my $you = 0;
	my $he = 0;
	my $we = 0;
	my $ing = 0;
	my $ord = 0;
	my $name = 0;
	my $measure = 0;
	my $headline = 0;
	my $pro1 = 0;
	my $form = 0;
	my $verb = 0;
	my $past = 0;
	my $conj = 0;
	
	# ordinal numbers, form elements
	# every <li> inside an ordered list counts as one ordinal
	while ( $text =~ m#<ol>(.*?)</ol>#gio ) {
		my $m = $1;
		while ( $m =~ m#<li>#gio ) {
			$ord++;
		}
	}
	while ( $text =~ m#<input|<textarea#gio) {
		$form++;
	}
	# pages with more than 8 remaining form fields are dropped without output
	next DIR if $form > 8;
	
	# the headline is looked for in the first 200 characters (markup included)
	my $startText = substr($text,0,200);
	$startText =~ s#<a[^>]>.*?</a>##gio;
	$startText =~ s/<[^>]+>//go;
	$text =~ s/<[^>]+>//go;
	
	# from here on $length is the number of characters of the tag-free text
	$length = length( $text );
	next DIR if $length > 90000 || $length < 1000;
	
	# headline
	while ( $startText =~ /(\W|^)$headlineExp($|\W)/gio ) {
		$headline++;
	}
	
	# words
	# The per-word tests are plain anchored matches. They deliberately carry
	# no /g: in scalar context /g would leave pos() set on $m after a hit and
	# make the following test start in the middle of the word.
	while ( $text =~ /([a-z0-9'.]+)/iog ) {
		my $m = $1;
		$you++ if $m =~ /^you$/io;
		$he++ if $m =~ /^he$|^she$/io;
		$we++ if $m =~ /^we$/io;
		$ing++ if $m =~ /ing$/io;
		$ord++ if $m =~ /^$ordExp$/io;
		$name++ if $m =~ /^$nameExp$/o;
		$measure++ if $m =~ /^$measureExp$/io;
		$pro1++ if $m =~ /^$pro1Exp$/io;
		# give up early (no output) once a rate per 1000 characters is too high
		next DIR if 1000*$pro1/$length > 4 || 1000*$name/$length > 12 || 1000*$you/$length > 8;
	}
	
	# pos
	# Counted in Perl instead of "cat | grep" pipelines: no shell sees the file
	# name, and the counts arrive without a trailing newline (which used to be
	# printed in the middle of the result line via $past).
	my $ft = $dirT."/".$filename;
	($verb, $past, $conj) = countTagFeatures( $ft );
	
	# turn the counts into rates per 1000 characters
	$you = sprintf "%.3f", 1000*$you/$length;
	$he = sprintf "%.3f", 1000*$he/$length;
	$we = sprintf "%.3f", 1000*$we/$length;
	$ing = sprintf "%.3f", 1000*$ing/$length;
	$verb = sprintf "%.3f", 1000*$verb/$length;
	$ord = sprintf "%.3f", 1000*$ord/$length;
	$pro1 = sprintf "%.3f", 1000*$pro1/$length;
	$conj = sprintf "%.3f", 1000*$conj/$length;
	my $past_rel = sprintf "%.3f", 1000*$past/$length;
	
	# decision: $name, $measure, $headline, $form and $past are absolute
	# counts here, everything else is a rate per 1000 characters
	if ( $length < 70000 && $length > 1000 && $pro1 < 4 
		&& ($ing > 3 && $ord > 0 || $ord > 1 || $measure > 0) 
		&& ($he < 0.25 && $verb > 0.1 || $he < 1.1 && $verb > 9) 
		&& ($past_rel < 0.4 || $past < 3) 
		&& ($pro1+$we > 0 || $you > 4 || $measure > 0) 
		&& ($verb+$you+$we > 2.5 && $headline > 0 || $verb+$you+$we > 3.5) 
		&& $you < 8 && $name < 12 && $form < 8 
		&& $conj > 2 && $conj < 11 ) {
		print "$filename $length, $you, $he, $we, $headline, $ing, $verb, $ord, $name, $measure, $pro1, $conj, $past, $past_rel IS\n";
	} elsif ($dir =~ /26/) {
		# rejected files are only reported when the directory path contains "26"
		print "$filename $length, $you, $he, $we, $headline, $ing, $verb, $ord, $name, $measure, $pro1, $conj, $past, $past_rel NO\n";
	}
}

# DH is a directory handle, so it has to be released with closedir
closedir( DH );

# Counts three part-of-speech features in a tagged file.
#
# Parameters:
#   $path - path of the tagged file; assumes one token per line with the tag
#           at the end of the line - TODO confirm against the tagger output
# Returns the list ($verb, $past, $conj):
#   $verb - lines that end in "SENT" or directly follow such a line and that
#           contain a non-word character followed by "V"
#           (same result as: grep -A 1 'SENT$' | grep -c '\WV')
#   $past - lines ending in "VBD"
#   $conj - lines ending in "CC"
# A file that cannot be opened yields a warning and (0, 0, 0).
sub countTagFeatures {
	my ($path) = @_;
	my ($verb, $past, $conj) = (0, 0, 0);
	
	my $fh;
	unless ( open( $fh, '<:utf8', $path ) ) {
		warn "cannot read $path: $!\n";
		return ($verb, $past, $conj);
	}
	
	# the script runs with $/ undefined; this sub needs real lines
	local $/ = "\n";
	my $afterSent = 0;
	while ( my $line = <$fh> ) {
		chomp $line;
		my $isSent = ( $line =~ /SENT$/ ) ? 1 : 0;
		$verb++ if ( $isSent || $afterSent ) && $line =~ /\WV/;
		$past++ if $line =~ /VBD$/;
		$conj++ if $line =~ /CC$/;
		$afterSent = $isSent;
	}
	close( $fh );
	
	return ($verb, $past, $conj);
}

# ================  FUN  ================ #

# Builds the alternation that recognises the names of the word list.
#
# Parameters:
#   $file - optional path of the list, one name per line
#           (default: "../Wortlisten/names.txt")
# Returns the pattern as a plain string "(?:name1|name2|...)" so that callers
# can interpolate and anchor it themselves. Names are matched literally
# (regex metacharacters are escaped), blank lines are ignored and an empty
# list yields "(?!)", a pattern that never matches.
# Dies when the list cannot be opened.
sub initNames {
	my ($file) = @_;
	$file = "../Wortlisten/names.txt" unless defined $file;
	
	open( my $list, '<:utf8', $file ) || die "$file: $!";
	
	# read line by line regardless of what the caller did to $/
	local $/ = "\n";
	my @names;
	while ( my $line = <$list> ) {
		$line =~ s/[\r\n]+$//;		# also strips the CR of CRLF files
		next if $line eq '';		# an empty alternative would match ""
		push @names, quotemeta( $line );
	}
	close( $list );
	
	return "(?!)" unless @names;
	return "(?:".join( "|", @names ).")";
}




