#!/usr/bin/perl -w

# erkennt chat aus date, emo, acro, emo_img, a_file, buchstaben-wiederholungen, <h>, "re:" und "posts:", ccr, length
# und: nicht wordpress etc.

# ------------------------  HISTORY -----------------------
# 04.01.06	new		from find_blog.pl
# 24.01.06	changed 	dir -> base
# 12.02.06	changed 	length 15.000 -> 100.000

$VERSION = '24.01.06';

#--------------------------------------------------------------
# pragmas & modules

use strict;
use HTML::Entities;
use Encode;
require 5.004;
use utf8;

binmode(STDOUT, ":utf8"); 

#use lib ('../helpers');
#use Astro;

# --------- vars -------------
use constant DEBUG		=> 0;			# 0: off, 1: on
use constant USE_CLEAN 	=> 1;			# 0: off, 1: on

my $base = $ARGV[0] || "35";
my $dirBase = $ARGV[1] || "train";
my $oneFile = $ARGV[2] || undef;
my $dir = "../Korpus/".$dirBase."/".$base."/";

my $filename;
my $text = "";
my $num_files = 50;
our %emoticons;
our %acronyms;
initHashes();

# date
my $day_e_ord = qr/([1-3]?(1st|2nd|3rd|[04-9]th)|11th|12th|13th)/;
my $month_e = qr/(?:January|February|March|April|May|June|July|August|September|October|November|December|Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|Sept)/;
my $year = qr/((?:19|20)[0-9]{2}|'\d{2}|[901]\d)/;
my $sep_e = qr/[.\/-]/;
my $day_num = qr/([1-9]|0[1-9]|[12][0-9]|3[01])/;
my $month_num = qr/0[1-9]|10|11|12|[1-9]|Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|Sept/;
my $time = qr#([01][0-9]|2[0-3])[-.:][0-5][0-9]#;
my $day = qr#yesterday|today#i;

my $dateExp = qr/($day, ?$time|$day_e_ord $month_e( $year)?|$month_e $day_e_ord(, $year)?|$month_e $day_num(, $year)?|$day_num$sep_e$month_num$sep_e$year|$month_num$sep_e$day_num$sep_e$year)/;

# links etc.
my $file_link_exp = qr#ftp://|\.(pdf|zip|ps|ppt|gz|doc)#;
my $notint_link_exp = qr/href\s*?=\s*?["'](#.*?|\s*?|javascript:.*?|http:.*?)["']/;
my $smiley_exp = qr/src\s*?=.*?(smilies|emoticons).*?gif['"]/;
my $poweredBy_exp = qr#(?:<a [^>]*href="http:\/\/wordpress\.org"[^>]*>WordPress[^<]*<\/a>)|(?:<a [^>]*href="http://www.movabletype.org"[^>]*>Movable Type[^<]*</a>|<a href=["']javascript:HaloScan)#i;
	
# ----------- main -----------
undef $/;

# open dir
opendir( DH, $dir ) || die "dir not found!";
print $dir, "\n", "-" x 50, "\n";

# ------ process files -------
DIR: while ( $filename = readdir( DH ) ) {
	next if $filename =~ /^\./;
	next unless !defined($oneFile) || $filename eq $oneFile;
	last if $num_files-- == 0;

	print STDERR ".";
	print $dir."".$filename.": "	if DEBUG;
	
	# ------- calculate features  ----------
	my $text = "";
	my $date = 0;
	my $emo = 0;
	my $acro = 0;
	my $emo_img = 0;
	my $a_file = 0;
	my $multiple = 0; # buchstaben-wiederholungen (soooo)
	my $h = 0;
	my $re = 0;
	my $ccr = 0;
	my $len = 0;
	
	# read content
	open( FH, $dir.$filename ) || die "$filename does not exist";
	binmode(FH, ":utf8"); 
	$text = <FH>;
	close( FH );
	
	# unfug
	$text =~ s/(?:<(head|script|style|noframes)[^>]*>.*?<\/\1>)|(?:<\/?(?:font|i|b|strong|u|em|div|big|blockquote|small|center|span)(?: [^>]+)?>)|(?:<!--.*?-->)//gsi;
	$text =~ s/&nbsp;/ /gs;

	$text = decode_entities( $text );
	$text =~ s/[`´‘’˚']/'/gio;
	$text =~ s/[“”]/"/gio;
	$text =~ s/\s+/ /gsi;
	
	if ($text =~ /$poweredBy_exp/io) { # is blog
		print "$filename poweredby \n" if $base == 35;
		next DIR;
	}
	
	# links, form, ccr
	my $code = 0;
	while ( $text =~ /(<([^>]+?)( [^>]*?)?>)/gs ) {
		$code += length( $1 );
		my $tag = "";
		my $atts ="";
		$tag = $2 if defined $2;
		$atts = $3 if defined $3;
		next if $tag =~ m#^/#;						# ignore closing tags
		if ( ( $tag eq "img" ) && ( $atts =~ /$smiley_exp/ ) ) {
			$emo_img++;
		} elsif ( $tag =~ /h\d/ ) {
			$h++;
		} elsif ( ( $tag eq "a" ) && ( $atts =~ m#$file_link_exp# )  ){
			$a_file++;
		}
	}
	
	$text =~ s/<[^>]+?>//gs;
	$text =~ s/\s+/ /gio;
	$len = length( $text );
	if ($len < 100 || $len > 100000) {
		print "$filename length ($len) \n" if $base == 35;
		next DIR;
	}
	
	$ccr = sprintf("%.3f", $code/$len/10); # eigentlich andersrum...
	$a_file = sprintf("%.3f", 10000*$a_file/$len);
	$emo_img = sprintf("%.3f", 10000*$emo_img/$len);
	
	# emos, acros, big & multiple
	while ( $text =~ /([\S]+)/gs ) {
		my $word = $1;
		my $stripped = $1;
		$stripped =~ s/\W+$|^\W+//g; # strip leading/trailing nonWord-chars
		$emo++	if defined $emoticons{lc($word)} || defined $emoticons{lc($stripped)};
		$acro++ if defined $acronyms{lc($word)} || defined $acronyms{lc($stripped)};
		$re++ 	if $word =~ /^re:?$|^posts:?$/i;
		while ($word =~ /([a-vxyzA-VXYZ!?])\1{2,}/gs) { # ohne www, IEEE, CCC
			$multiple++ unless $stripped eq "IEEE" || $stripped eq "CCC";
		}
	}

	# dates
	while ( $text =~ /$dateExp/gi ) {
		$date++;
	}
	$emo = sprintf("%.3f", 10000*$emo/$len);
	$acro = sprintf("%.3f", 10000*$acro/$len);
	$multiple = sprintf("%.3f", 10000*$multiple/$len);
	$date = sprintf("%.3f", 1000*$date/$len);
	$re = sprintf("%.3f", 10000*$re/$len);
	
	# output
	if ( $a_file < 0.005 && $len < 100000 && $h < 8 && $date > 0.5 && $date < 15 && $ccr > 0.05 && $re+$multiple+3*$emo+5*$emo_img+5*$acro > 4) {
		print "$filename: $ccr, $h, $re, $a_file, $emo, $emo_img, $acro, $multiple, $date, $len FORUM\n";
	} elsif ( $base == 35 ) {
		print "$filename: $ccr, $h, $re, $a_file, $emo, $emo_img, $acro, $multiple, $date, $len NO\n";
	}
}

closedir( DH );

sub initHashes {
	keys %emoticons = 200;
	keys %acronyms = 350;
	$/ = "\n";
	open( AK, "../Wortlisten/akronyme.txt" ) or die "can't find acronyms!\n";
	binmode(AK, ":utf8"); 
	while ( <AK> ) {
		chomp;
		$acronyms{lc($_)} = 1;
	}
	close( AK );
	open( EM, "../Wortlisten/emoticons.txt" ) or die "can't find emoticons!\n";
	binmode(EM, ":utf8"); 
	while ( <EM> ) {
		chomp;
		$emoticons{lc($_)} = 1;
	}
	close( EM );
	undef $/;
}
