#!/usr/bin/perl -w

# erkennt literaturlisten aus struktur (vgl. summary)

# ---------------------------  TODO ------------------------
# zahlen in tags nicht erkennen! < ... 1234 ...>	OK
# das:  (July 1998 est.) soll nicht gehen!	OK
# zahlen < aktuelles jahr	OK
# text enthaelt kein Monday, Tuesday,...	OK

# ------------------------  HISTRORY -----------------------
# 05.11.05	new			created from find_linklist.pl, version 28.10.
#			added		erkennt name, jahr
# 22.11.05	added		erkennt bib-refs
# 23.11.05	changed		date = month? year, page = pages? \d+--?\d+
#			added		constant USE_CLEAN
# 24.11.05	changed		simplified regex, finds bibref-lists with min 5 items
# 07.01.06	changed		IS literatur, tagging
# 24.01.06	changed 	dir -> base

$VERSION = '24.01.06';

#------------------------------------------------------------------------------
# Standard pragmas & CPAN modules
#------------------------------------------------------------------------------

use strict;
use HTML::Entities;
use Encode;
use Encode::Byte;
require 5.004;
use utf8;

#use lib ('../helpers');
#use Astro;

binmode(STDOUT, ":utf8"); 

# --------- vars -------------
use constant DEBUG 		=> 0;			# 0: off, 1: on
use constant USE_CLEAN 	=> 1;			# 0: off, 1: on

my $base = $ARGV[0] || "10";
my $dirBase = $ARGV[1] || "train";
my $oneFile = $ARGV[2] || undef;
my $dir = "../Korpus/".$dirBase."/".$base."/";

my $filename;
my $num_files = 50;
undef $/;

#-> autor[.,]? (?jahr)?.? "?titel"?[[.,]? buch]
#-> buch := journal | (ort[.:] verlag[. seite]) | (verlag[, ort])
#-> journal := (zeitschrift[,? ausgabe][:,] seite)

# --------- regex ----------- 
#my $year_exp = qr#1[6789]\d{2}|200\d#;	# changed: 1600-2009
my $year_exp = qr#1[89]\d{2}|200\d#;
my $isbn_exp = qr#ISBN:? \d{10}|97[89]\d{10}#;
my $month_exp = qr#"January|February|March|April|May|June|July|August|September|October|November|December|Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec#;
my $date_exp = qr#(?:(?:\d{1,2} )?$month_exp )?$year_exp|$isbn_exp|$year_exp $month_exp(?: \d{1,2})?|$month_exp \d{1,2}, $year_exp#;
my $date_exp2 = qr# ?\($date_exp\) ?|[.,;:] ?$date_exp ?[.,;:)]#;
my $date_exp3 = qr#> ?$date_exp ?[.,;:]#;
my $bib_exp = qr#(?:>[^<]{20,200}$date_exp2|>[^<]{0,200}$date_exp2 ?[^<]{20,200}<|$date_exp3 ?[^<]{20,200}<)|(?:>[^<]{0,300}$isbn_exp[^<]{0,100}<)#;
#my $bib_exp = qr#>[^<]{20,200}[.,;:\(] ?$date_exp[.,;:\) <]|<[^>]+>[.,;:\(] ?$date_exp ?[.,;:\)] ?[^<]{20,200}<#; #added <[^>]+>

# open dir
opendir( DH, $dir ) || die "$dir not found!";
print $dir, "\n", "-" x 50, "\n";

# ------ process files -------
DIR: while ($filename = readdir( DH )) {
	next unless !defined($oneFile) || $filename eq $oneFile;
	next if $filename =~ /^\./;
	last if $num_files-- == 0;
	print STDERR ".";
	print $filename.": " if DEBUG;
	
	# ------- calculate features  ----------
	my $bibs = 0;
	
	# read content
	open( FH, $dir.$filename ) || die "$filename does not exist";
	binmode(FH, ":utf8"); 
	my $text = <FH>;
	close( FH );
	
	# decode
	if (!USE_CLEAN) {
		my $charset = guess_charset($dir.$filename, "e");
		print STDERR "(".$charset.") "			if DEBUG;
		$text = decode_utf8( $text ) 			if $charset eq "UTF8";
		$text = decode( 'ascii', $text )		if $charset eq "ASCII";
		$text = decode( 'iso-8859-1', $text )	if $charset eq "ISO-1";
		$text = decode( 'iso-8859-15', $text )	if $charset eq "ISO-15";
		$text = decode( 'cp1252', $text )		if $charset eq "WINDOWS";
		$text = decode( 'MacRoman', $text )		if $charset eq "MAC";
	}
	
	# remove several tags (usual nonsense + select,input,textarea)
	$text =~ s/\s+/ /gsio;
	$text =~ s/<(\w+)[ :][^>]*>/<$1>/gio;
	$text =~s#(?:<(head|script|style|noframes|select|textarea)[^>]*>.*?<\/\1>)|(?:</?(?:font|b|i|u|em|div|center|span|strong|a|input|o|yle|st1|![^>]+)(?:[ :=][^>]+)?/?>)|(?:<!--.*?-->)##gsio;
	$text =~ s/&nbsp;/ /gsoi;
	$text =~ s/\s+/ /gsio;
	$text = decode_entities( $text );
	$text =~ s/["'`´‘“”’˚']/'/gio;
	
	# --> this works: counts bib-refernces
	while ( $text =~ /($bib_exp)/gs ) {
		print "* ".$1."\n"	if DEBUG;
		$bibs++;
	}
	print " $bibs - " if DEBUG;
	
	my %pos = ();
	$pos{"N"} = 0;
	$pos{"V"} = 0;
	$pos{"ADJ"} = 0;
	
	my $bib = "";
	my $bibLength = 0;
	while ( $text =~ /((?:$bib_exp.{0,400}?){5,})/gso ) {
		$bib = $1;
		$bibLength += length( $bib );
		$bib =~ s/<[^>]*>//go;
		$bib =~ s/[.;:?!]/./go;
		
		open( TMP, ">__tmp".$$.".txt" );
		while ($bib =~ /([a-z]+('[a-z]+)?|,|\d+|\.)/gio) {
			print TMP "$1\n";
		}
	}
	next DIR if $bib eq "";
	
	# tag
	my $tagged = `cat __tmp$$.txt | tree-tagger ~/_magisterarbeit/code/tagger/lib/english.par`;
	while ( $tagged =~ /(\w+)/g) {
		# det, n, adj, v
		my $m = $1;
		if ($m =~ /^N/) {
			$pos{"N"}++;
		} elsif ($m =~ /^J|^RB/) {
			$pos{"ADJ"}++;
		} elsif ($m =~ /^V/) {
			$pos{"V"}++;
		}
	}
	
	
	my $allPos = $pos{"N"}+$pos{"V"}+$pos{"ADJ"} || 1;
	
	$text =~ s/<a[^>]*>.*?<\/a>//gio;
	$text =~ s/<[^>]*>//gsio;
	my $length_bib = sprintf "%.3f", $bibLength/length($text);
	
	if ($bib !~ /Monday|Tuesday|Wednsday|Thursday|Friday|Saturday|Sunday|\d{2}:\d{2}/iog && $pos{"N"}/$allPos > 0.8 && $pos{"V"}/$allPos < 0.13 && $pos{"N"} > 40 && $length_bib > 0.3) {
		print "$filename IS (".$pos{"N"}.", ".$pos{"V"}.", ".$pos{"ADJ"}.", $length_bib, $bibs)\n";
	} elsif ($base == 10) {
		print "$filename NOPE (".$pos{"N"}.", ".$pos{"V"}.", ".$pos{"ADJ"}.", $length_bib, $bibs)\n";
	}
}
system "rm __tmp$$.txt" if -X "rm __tmp$$.txt";
close( DH );







