#!/usr/bin/perl -w

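# Detects scientific texts: scans the HTML files of a corpus directory and
# reports, per file, whether it looks like a scientific paper.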

# ------------------------  HISTORY -----------------------
# 28.11.05	new			created from find_stat.pl, version 30.11.
# 22.01.06	changed: dir -> base

# Script version: date of the last change (DD.MM.YY), see HISTORY above.
# Declared with `our` so the package variable is explicit and stays valid
# regardless of where `use strict` sits relative to this line.
our $VERSION = '22.01.06';

#------------------------------------------------------------------------------
# Standard pragmas & CPAN modules
#------------------------------------------------------------------------------

use strict;
use HTML::Entities;
use Encode;
use Encode::Byte;
require 5.004;
use utf8;

#use lib ('../helpers');
#use Astro;

# Everything printed to STDOUT goes through the :utf8 output layer.
binmode STDOUT, ':utf8';

# --------- vars -------------
# DEBUG:     when on, the guessed charset of each file is echoed to STDERR.
# USE_CLEAN: when on, the per-file charset guessing/decoding step is skipped.
use constant DEBUG     => 0;    # 0: off, 1: on
use constant USE_CLEAN => 1;    # 0: off, 1: on

# Command line: <base> <dirBase> [<oneFile>]
#   base    - sub-directory below dirBase (default "1")
#   dirBase - sub-directory below ../Korpus (default "train")
#   oneFile - optional: restrict the run to this single file name
# Any false value (missing, "", "0") falls back to the default.
my ( $base, $dirBase, $oneFile ) = @ARGV;
$base    = "1"     unless $base;
$dirBase = "train" unless $dirBase;
$oneFile = undef   unless $oneFile;

# Directory that is scanned; the trailing slash lets file names be appended.
my $dir = "../Korpus/$dirBase/$base/";


my $filename;           # name of the directory entry currently processed
my $num_files = 50;     # upper bound on the number of entries processed
$/ = undef;             # slurp mode: <FH> returns the whole file at once

# open dir
# DH is the bareword directory handle read by the file loop further down.
# The OS error ($!) is included because the directory may exist but be
# unreadable, in which case "not found" alone is misleading.
opendir( DH, $dir ) or die "$dir not found! ($!)";
# Report header: the directory being scanned, followed by a 50-dash rule.
print $dir, "\n", "-" x 50, "\n";

# ------ process files -------
# Walks the entries of the open directory handle DH and classifies each
# regular, non-hidden file (only $oneFile when that is set; at most
# $num_files files in total). One line per file is printed to STDOUT:
#   "IS SCIENCE: H (main, norm)"  - enough section headlines were found
#   "IS SCIENCE: BI"              - enough first-person "science" bigrams
#   "NO SCIENCE: ..."             - neither rule fired (always printed for
#                                   base 1, otherwise only when at least one
#                                   main headline was seen)
# A progress dot per processed file goes to STDERR.

# Markup that may carry a section headline: <hN>, <b>, <center>, <font>.
my $headline_re = qr{(<h\d[^>]*>.*?</h\d>|<b(?: [^>]*)?>.*?</b>|<center(?: [^>]*)?>.*?</center>|<font(?: [^>]*)?>.*?</font>)}i;

# Headline consisting of exactly one of the main section titles.
my $main_headline_re = qr/^ ?(?:Abstract|Synopsis|Acknowledgments|References|Introduction|Discussion|Conclusion) ?$/i;

# Headline containing one of the secondary section titles.
# NOTE(review): "Ressources" (double s) does not match the English spelling
# "Resources" - confirm whether that is intended.
my $norm_headline_re = qr/(?:Results|Appendix|Summary|Evaluating|Evaluation|Bibliography|Biography|Footnotes|Synthesis|Description|Suggested Reading|Ressources|Previous Studies)/i;

# First-person phrases counted as bigram evidence.
my $bigram_re = qr/\b(?:our results|our notion|our estimate|our analysis|our application|our conclusion|our data|our definition|our evaluation|our finding|our heuristic|our measure|our method|our methodology|our model|our research|our sample|we chose|we assume|we expect|we compare|we conclude|we propose|we suggest|we note|we estimate|we omit|we discuss|we test|we studied|we examine|we analyze|we calculate|we compute|we define|we encountered|we exclude|we focus|we include)\b/i;

# Explicit defined(): an entry named "0" must not end the loop.
DIR: while ( defined( $filename = readdir( DH ) ) ) {
	next if defined($oneFile) && $filename ne $oneFile;
	next if $filename =~ /^\./;

	# Only plain files can be slurped; sub-directories would yield undef text.
	my $path = $dir.$filename;
	next unless -f $path;

	last if $num_files-- == 0;
	print STDERR  ".";

	# ------- calculate features  ----------

	# read content (whole file at once, through the :utf8 input layer)
	open( my $fh, '<:utf8', $path ) or die "cannot open $path: $!";
	my $text = do { local $/; <$fh> };
	close( $fh );
	$text = '' unless defined $text;	# read error: treat as empty

	# decode
	# NOTE(review): guess_charset() is not defined in the visible part of this
	# file (presumably it came from the commented-out helper module); this
	# branch is only reached when USE_CLEAN is switched off.
	if (!USE_CLEAN) {
		my $charset = guess_charset($path, "e");
		print STDERR "(".$charset.") "			if DEBUG;
		$text = decode_utf8( $text ) 			if $charset eq "UTF8";
		$text = decode( 'ascii', $text )		if $charset eq "ASCII";
		$text = decode( 'iso-8859-1', $text )	if $charset eq "ISO-1";
		$text = decode( 'iso-8859-15', $text )	if $charset eq "ISO-15";
		$text = decode( 'cp1252', $text )		if $charset eq "WINDOWS";
		$text = decode( 'MacRoman', $text )		if $charset eq "MAC";
	}

	# Drop whole head/script/style/noframes/select/textarea/form elements,
	# HTML comments, and the bare tags of common wrappers. <hN>, <b>,
	# <center> and <font> are kept because the headline scan needs them.
	$text =~ s#<(head|script|style|noframes|select|textarea|form)[^>]*>.*?</\1>|<!--.*?-->|</?(?:big|blockquote|small|i|u|em|span|a|input|o|yle|st1|p|div)(?:[ :=][^>]+)?/?>##gsi;

	# Normalise whitespace; afterwards the text contains no newlines, so the
	# headline pattern can work without /s.
	$text =~ s/&nbsp;/ /gi;
	$text =~ s/\s+/ /g;

	# ------- headline rule ----------
	my $headlines_main = 0;
	my $headlines_norm = 0;
	while ( $text =~ /$headline_re/g ) {
		my $headline = $1;
		$headline =~ s#<[^>]+>##g;		# keep only the text inside the markup
		if ( $headline =~ $main_headline_re ) {
			$headlines_main++;
		} elsif ( $headline =~ $norm_headline_re ) {
			$headlines_norm++;
		}
		# Two main headlines, or one main plus two secondary ones, suffice.
		if ($headlines_main > 1 || $headlines_main > 0 && $headlines_norm > 1) {
			print "$filename  \tIS SCIENCE: H ($headlines_main, $headlines_norm)\n";
			next DIR;
		}
	}

	# ------- bigram rule ----------
	$text =~ s/<[^>]+>//g;				# strip all remaining tags
	my $textlength = length( $text );
	# Count the matches in list context; yields a plain 0 when none match.
	my $bigrams = () = $text =~ /$bigram_re/g;
	# Empty or tag-only files have length 0: avoid dividing by zero.
	my $bigram_ratio = $textlength ? $bigrams/$textlength : 0;

	if ( $bigrams > 4 && $bigram_ratio > 0.0001 ) {
		print "$filename  \tIS SCIENCE: BI\n";
	} elsif ($base == 1) {
		print "$filename  \tNO SCIENCE: $bigrams, ",$bigram_ratio,", $headlines_main, $headlines_norm\n";
	} elsif ($headlines_main > 0) {
		print "$filename  \tNO SCIENCE: $headlines_main, $headlines_norm\n";
	}
}
# DH is a directory handle, so it has to be released with closedir().
closedir( DH );
