#!/usr/bin/perl -w

# erkennt gedicht

# ---------------------------  TODO ------------------------
# zeilenlänge: varianz der zeilenlängen, werte, die satrk vom median abweichen abschneiden
# "blockquote" löschen

# ------------------------  HISTRORY -----------------------
# 26.11.05	new			created from find_literatur.pl, version 23.11.
# 02.12.05	changed		as_quotemeta only for org, not sub
# 28.12.05	added		enthaelt nicht "Chapter", nicht zu viel " am anfang, nicht zu viele namen
# 29.12.05	added		Geos, split an <td>, löschen von <br>,<td> etc. vor tags zählen
# 22.01.06	changed: dir -> base

$VERSION = '22.01.06';

#------------------------------------------------------------------------------
# Standard pragmas & CPAN modules
#------------------------------------------------------------------------------

use strict;
use HTML::Entities;
use Encode;
use Encode::Byte;
use POSIX qw(ceil floor);
require 5.004;
use utf8;

#use lib ('../helpers');
#use Astro;

binmode(STDOUT, ":utf8"); 

# --------- vars -------------
use constant DEBUG 		=> 0;			# 0: off, 1: on
use constant USE_CLEAN 	=> 1;			# 0: off, 1: on

my $base = $ARGV[0] || "14";
my $dirBase = $ARGV[1] || "train";
my $oneFile = $ARGV[2] || undef;
my $dir = "../Korpus/".$dirBase."/".$base."/";


my $filename;
my $num_files = 50;
my $nameExp = initNames();
my $titles = "Mr|Mrs|Ms|Dr|Prof|Rev|Bishop|Archbishop|Father|Lord|Earl|Duke|Count|Countess|Baron|Baroness|Governor|President";
my $comp = "Division|Farm|Bank|Systems|Corporation|Broadcast|Productions?|United|Ltd|Designs?|Holdings?|Services?|Inc|Limited|University|Company|Centre|Academy|Department|Group|AG|Theatre|Magazine|Publications|Association|Associates|Co|Laboratories|Corp|Works|Engineering|Technology|Electronics";
my $geoExp = initGeos();
undef $/;

# open dir
opendir( DH, $dir ) or die "$dir not found!";
print $dir, "\n", "-" x 50, "\n";

# ------ process files -------
DIR: while ($filename = readdir( DH )) {
	next unless !defined($oneFile) || $filename eq $oneFile;
	next if $filename =~ /^\./;
	last if $num_files-- == 0;
	print STDERR  ".";
	
	# ------- calculate features  ----------
	
	# read content
	open( FH, $dir.$filename ) || die $!;
	binmode(FH, ":utf8"); 
	my $text = <FH>;
	close( FH );
	
	# decode
	if (!USE_CLEAN) {
		my $charset = guess_charset($dir.$filename, "e");
		print STDERR "(".$charset.") "			if DEBUG;
		$text = decode_utf8( $text ) 			if $charset eq "UTF8";
		$text = decode( 'ascii', $text )		if $charset eq "ASCII";
		$text = decode( 'iso-8859-1', $text )	if $charset eq "ISO-1";
		$text = decode( 'iso-8859-15', $text )	if $charset eq "ISO-15";
		$text = decode( 'cp1252', $text )		if $charset eq "WINDOWS";
		$text = decode( 'MacRoman', $text )		if $charset eq "MAC";
	}
	
	# remove several tags (_not_ usual nonsense)
	$text =~ s#(?:<(head|script|style|noframes|select|textarea|ul|ol|dl)[^>]*>.*?<\/\1>)|(?:<!--.*?-->)|(?:</?(?:big|small|font|b|i|u|em|strong)(?:[ :=][^>]+)?/?>)##gsio;
	$text =~ s/&nbsp;/ /goi;
	$text =~ s/<([a-z]+) [^>]*>/<$1>/gio;
	
	if ($text =~ /Chapter|CHAPTER/) {
		print "$filename\tNO POEM\n" if $dir =~ /14/;
		next DIR;
	}
	
	my @subs;
	my @orig;
	# process <pre>
	while ( $text =~ m#<pre[^>]*>(.*?)</pre>#gsi ) {
		my $sub = $1;
		my $org = $1;
		$sub =~ s#[\n\r]#<br>#gs;
		push(@subs, $sub);
		push(@orig, as_quotemeta($org));
	}
	for (my $k=0; $k<@subs; $k++) {
		$text =~ s/$orig[$k]/<br>__PRE__<br>$subs[$k]<br>__ENDPRE__<br>/;
	}
	$text =~ s/\s+/ /gio;
	
	# lines end with: <br>, <p>, </div>
	my @lines = split /<br>|<p>|<\/p>|<\/div>|<\/tr>|<tr>|<br\/>|<td>|<\/td>/i, $text;
	
	my $isPoem = 1;
	my $blocksize = @lines/200; # anzahl gedichtzeilen: min 1/100 des textes
	my @block;
	my $biggestblock = "";
	my $pre = 0;
	my $c = 0;
	
	my $lines = 0;
	foreach (@lines) {
		$c++;
		s/<br>|<p>|<\/p>|<\/div>|<\/tr>|<tr>|<br\/>|<td>|<\/td>//gio;
		
		my $link = s#<a>(?!</a>).*?</a>##gio;
		my $tags += s/<[^>]+>//gio;	# delete and count tags
		$_ = decode_entities( $_ );
		my $signs = s/([\\_\/#*=~$%|+():,;.\[\]])/$1/go;

		$isPoem = 0 if $link > 0 && m/^\s*$/;	# keine links mitten im gedicht
		next if $_ =~ /^\s*$/ && $c < @lines-1; # skip blank lines
		$lines++;

		if ($_ eq "__PRE__") {
			$pre = 1;
			$isPoem = 0;
		}
		if ($_ eq "__ENDPRE__") {
			$pre = 3;
		}
		
		if (length($_) < 70 && length($_) > 5 && !$isPoem && $tags < 2 && $signs < 5 && $pre != 3) {
			# erste gedichtzeile
			$isPoem = 1;
			@block = ();
		}
		if ( $c < @lines-1 && length($_) < 70 && length($_) > 5 && $isPoem && $tags < 2 && $signs < 5 && $pre != 3 ) {
			# weitere gedichtzeilen
			$isPoem = 1;
			push(@block, $_);
		} else {
			# falsche zeile (zu lang, zu viele satzzeichen: ende gedicht)
			if ($pre == 2) {
				print "$filename: \tNO [PRE]\n" if $dir =~/14/;
				next DIR;
			}
			if (@block > $blocksize) {
				my $tmp = join("\n", @block);
				my $names = 0;
				my $quot = 0;
				my $num = 0;
				my $quest = 0;
				my $klammer = 0;
				
				# nicht zu viele namen etc. 
				while ($tmp =~ m/\([^)]+\n/go) {
					$klammer++;
				}
				next if $klammer > 0.05*@block;	# öffnende ohne schließende klammer -> kein block
				
				while ( $tmp =~ m/(?:\W|^)($nameExp|$titles|$comp|$geoExp)(?:\W|$)/go ) { 
					$names++;
				}
				next if $names > 0.5*@block;
				
				while ( $tmp =~ m/(^|\n) ?["'`´‘“”’˚']/go ) {
					$quot++;
				}
				next if $quot > 0.5*@block;
				
				while ( $tmp =~ m/(^|\n) ?[\d(*-]/go ) {
					$num++;
				}
				next if $num > 0.15*@block;
				
				while ( $tmp =~ m/[?:] ?($|\n)/go ) {
					$quest++;
				}
				next if $quest > 0.6*@block;
				
				$blocksize = @block;
				$biggestblock = $tmp;
			}
			$isPoem = 0;
		}
		$pre = 2 if $pre == 1;
		$pre = 0 if $pre == 3;
	}

	$text =~ s#<a>(?!</a>).*?</a>##gio;
	$text =~ s/<[^>]+>//gio;
	my $length = length ($text);
	
	# nicht zu viele BIG, Punktuation, Zahlen
	my $bigWords = 0;
	while ($biggestblock =~ /([A-Z]{2,})/og) {
		$bigWords++;
	}
	my $punc = 0;
	while ($biggestblock =~ /[()_\/#*=~$%|+"\[\]]/og) {
		$punc++;
	}
	my $num = 0;
	while ($biggestblock =~ /\d+/og) {
		$num++;
	}
	my $colon = 0;
	while ($biggestblock =~ /: ?[\n\r]/go ) {
		$colon++;
	}
#	print "$biggestblock\n\n";
	
	# check, ob sätze: enthält pronomen, konjunktion oder hilfsverben
	# (ist so wegen opt.; man könnte auch mit tagger nach verben schauen)
	my $isSent = 0;
	while ( $biggestblock =~ /([a-z]+)/gio ) {
		$_ = $1;
		$isSent++ if m/^(or|and|was|is|not|who|why|where|what|will|can|could|would|has|have|been|be|may|I|he|her|hers|herself|him|himself|his|it|its|itself|me|mine|my|myself|our|ours|ourself|ourselves|she|their|theirs|them|themselves|they|us|we|you|your|yours|yourself|yourselves)$/i;
	}
	

	my $blockL_rel = sprintf "%.3f", $blocksize/$lines; # relative größe (zeilenzahl)
	my $block_rel = sprintf "%.3f", length($biggestblock)/$length; # relative größe (zeichenzahl)
	my $lineLength = sprintf "%.3f", length($biggestblock)/$blocksize; # durchschnittliche zeilenlänge im block
	$punc = sprintf "%.3f", $punc/$blocksize;
	$num = sprintf "%.3f", $num/$blocksize;
	
	if (($blocksize > 10 && $block_rel > 0.1 && $blockL_rel > 0.2 ||
		 $blocksize > 3 && $block_rel > 0.4 && $blockL_rel > 0.3 ) &&
		$lineLength >= 14 && $lineLength < 90 && $isSent > 5 && $colon/$blocksize < 0.3 &&
		$bigWords < 4 && $num < 0.16 && $punc < 0.8) {
		
		print "$filename\tIS POEM: $blocksize,\t$lines,\t$block_rel,\t$blockL_rel,\t$lineLength,\t$bigWords,\t$num,\t$punc\t$isSent\n";
	} elsif ($base == 14) {
		print "$filename\tNO POEM: $blocksize,\t$lines,\t$block_rel,\t$blockL_rel,\t$lineLength,\t$bigWords,\t$num,\t$punc\t$isSent\n";
	}
}
close( DH );

sub as_quotemeta {
	my $t = shift;
	$t =~ s/\\/\\\\/gs;
	$t =~ s/\[/\\\[/gs;
	$t =~ s/\]/\\\]/gs;
	$t =~ s/\{/\\\{/gs;
	$t =~ s/\}/\\\}/gs;
	$t =~ s/\?/\\\?/gs;
	$t =~ s/\+/\\\+/gs;
	$t =~ s/\*/\\\*/gs;
	$t =~ s/\./\\\./gs;
	$t =~ s/\-/\\\-/gs;
	$t =~ s/\$/\\\$/gs;
	$t =~ s/\^/\\\^/gs;
	$t =~ s/\//\\\//gs;
	$t =~ s/\(/\\\(/gs;
	$t =~ s/\)/\\\)/gs;
	return $t;
}

sub initNames {
	my $reg = "";
	open( L1, "../Wortlisten/names.txt" ) || die $!;
	binmode(L1, ":utf8"); 
	while ( <L1> ) {
		chomp;
		$reg .= "$_|";
	}
	chop $reg;
	close( L1 );
	return $reg;
}
sub initGeos {
	my $reg = "";
	open( L1, "../Wortlisten/cities.txt" ) || die $!;
	binmode(L1, ":utf8"); 
	while ( <L1> ) {
		chomp;
		$reg .= "$_|";
	}
	open( L1, "../Wortlisten/countries.txt" ) || die $!;
	binmode(L1, ":utf8"); 
	while ( <L1> ) {
		chomp;
		$reg .= "$_|";
	}
	chop $reg;
	close( L1 );
	return $reg;
}
