#!/usr/bin/perl -w

# detects blogs by the presence of typical blog words


# ------------------------  HISTORY -----------------------
# 26.10.05	new
# 28.10.05	added	use utf8;
# 30.10.05	added 	charset-recognition
# 05.11.05	added	movable type
# 23.11.05	added	Astro::guess_charset, USE_CLEAN, 
# 04.12.05	changed	output
# 21.12.05	added	"> ?posted" from blog_posted (possibly remove again if 35 cannot be filtered)
# 24.01.06	changed 	dir -> base

# Script version (date of the last change, dd.mm.yy); stored in the
# package variable $main::VERSION.
our $VERSION = '24.01.06';

#--------------------------------------------------------------
# pragmas & modules

use strict;
use HTML::Entities;
use Encode;
require 5.004;
use utf8;

# Results are printed as character strings; emit them UTF-8 encoded.
binmode STDOUT, ':utf8';

#use lib ('../helpers');
#use Astro;

# --------- vars -------------
# Compile-time switches.
use constant DEBUG		=> 0;			# 0: off, 1: on  (extra diagnostics per file)
use constant USE_CLEAN 	=> 1;			# 0: off, 1: on  (1: skip charset guessing/decoding)

# Command line: [base [dirBase [oneFile]]]
#   base    - last path component of the corpus directory (default "36")
#   dirBase - corpus sub-directory above base (default "train")
#   oneFile - if given, only the file with exactly this name is processed
my $base = $ARGV[0] || "36";
my $dirBase = $ARGV[1] || "train";
my $oneFile = $ARGV[2] || undef;
my $dir = "../Korpus/".$dirBase."/".$base."/";

my $filename;				# current directory entry
my $num_files = 50;			# upper bound on the number of files processed per run

# --- link-text patterns (all case-insensitive, no capturing groups, so they
# --- can be embedded in a larger pattern without shifting its group numbers)

# "comment"/"comments", optionally preceded by "reader ", "blog "/"blogs' ",
# "no " or a count ("3 " / "(3) "), optionally followed by a count "(3)".
# The count prefix used to read \(\?d+\)? (transposed escape), which could only
# match a literal "(?d"; it is now \(?\d+\)? as evidently intended.
my $comment_exp = qr/(?:(?:reader )|(?:blog(?:s')? )|(?:no )|(?:\(?\d+\)? ))?comments?(?:\s*\(\d+\))?/i;
# "trackback"/"feedback", optionally preceded by "no " or "(3) ", optionally
# followed by a count "(3)".
my $trackback_exp = qr/(?:(?:no )|(?:\(\d+\) ?))?(?:(?:track)|(?:feed))back(?:\s*\(\d+\))?/i;
# Inline <script> element whose body starts with "postCount"
# (presumably the HaloScan comment counter -- see $poweredBy_exp).
my $halo_exp = qr/<script type="text\/javascript\">postCount.*?<\/script>/i;
# Any link text that counts as a generic "blog word".
my $link_exp = qr/$comment_exp|$trackback_exp|(?:(?:\w+-)?rss)|permalink|permanent link|$halo_exp|0 comments/i;
# Link texts "Entries (RSS)" / "Comments (RSS)".
my $wordpress_exp = qr/(?:(?:entries)|(?:comments)) \(rss\)/i;
# <link> element starting with three attributes drawn from rel="alternate",
# type="application/rss+xml" and title="RSS...".
my $rss_exp = qr#<link (?:(?:rel="alternate" ?)|(?:type="application/rss\+xml" ?)|(?:title="RSS.*?" ?)){3}.*?>#i;
# "Powered by" links to wordpress.org / movabletype.org, or a HaloScan
# javascript link. The dots of the Movable Type host name are escaped so that
# they only match literal dots.
my $poweredBy_exp = qr#(?:<a [^>]*href="http:\/\/wordpress\.org"[^>]*>WordPress[^<]*<\/a>)|(?:<a [^>]*href="http://www\.movabletype\.org"[^>]*>Movable Type[^<]*</a>|<a href=["']javascript:HaloScan)#i;

# ----------- main -----------
# Slurp mode: every <$fh> below returns the whole file as one string.
local $/;

# open dir
opendir( my $dh, $dir ) || die "cannot open directory $dir: $!";
print $dir, "\n", "-" x 50, "\n";

# Layer used for reading the corpus files. Clean files are read as UTF-8
# (as before). When charsets are guessed instead, the file has to be read as
# raw octets, because decode() below expects undecoded input.
my $read_mode = USE_CLEAN ? "<:utf8" : "<:raw";

# Combined blog-word pattern, compiled once instead of once per file.
# Capture groups:  1 = link whose text is a generic blog word
#                  2 = link whose text is a WordPress feed label
#                  3 = "powered by" / HaloScan link
#                  4 = RSS <link> element
my $blog_exp = qr/(?:<a [^>]*href[^>]*?>(?:($link_exp)|($wordpress_exp))<\/a>)|($poweredBy_exp)|($rss_exp)/i;

# ------ process files -------
DIR: while ( defined( $filename = readdir( $dh ) ) ) {
	next if $filename =~ /^\./;
	next unless !defined($oneFile) || $filename eq $oneFile;

	# Only regular files carry text; sub-directories would yield undef
	# content and must not use up the file quota.
	my $path = $dir.$filename;
	next unless -f $path;
	last if $num_files-- == 0;

	print STDERR ".";				# progress indicator
	print $path.": "	if DEBUG;

	# ------- calculate features  ----------
	my $blogword = 0;		# links whose text is a generic blog word
	my $wordpress = 0;		# WordPress feed links and "powered by" links
	my $rss = 0;			# RSS <link> elements

	# read content (three-arg open: the file name is never parsed as a mode)
	open( my $fh, $read_mode, $path ) || die "cannot read $path: $!";
	my $text = <$fh>;
	close( $fh );
	$text = "" unless defined $text;	# read error: treat as empty

	# decode
	# NOTE(review): guess_charset() is not defined in this file; it is
	# presumably provided by the Astro helper module whose "use" line is
	# commented out above. Re-enable it before setting USE_CLEAN to 0.
	if (!USE_CLEAN) {
		my $charset = guess_charset($path, "e");
		print STDERR "(".$charset.") "			if DEBUG;
		$text = decode_utf8( $text ) 			if $charset eq "UTF8";
		$text = decode( 'ascii', $text )		if $charset eq "ASCII";
		$text = decode( 'iso-8859-1', $text )	if $charset eq "ISO-1";
		$text = decode( 'iso-8859-15', $text )	if $charset eq "ISO-15";
		$text = decode( 'cp1252', $text )		if $charset eq "WINDOWS";
		$text = decode( 'MacRoman', $text )		if $charset eq "MAC";
	}

	# normalise typographic quotes and collapse whitespace so that the
	# patterns only have to deal with ' and " and single blanks
	$text =~ s/[`´‘’˚]/'/g;
	$text =~ s/[“”]/"/g;
	$text =~ s/\s+/ /g;

	# posted: more than five "posted: " markers directly after a tag
	# are taken as sufficient evidence on their own
	my $posted = () = $text =~ /> ?posted: /gi;
	if ($posted > 5) {
		print "$filename    \tIS BLOG (posted)\n";
		next DIR;
	}

	# blogwords: exactly one of the four groups is set per match
	while ( $text =~ /$blog_exp/g ) {
		if ( defined $4 ) {
			$rss++;
		} elsif ( defined $2 || defined $3 ) {
			$wordpress++;
		} else {
			$blogword++;
		}
	}

	# output: more than three hits in total mark the file as a blog;
	# negative results are only reported for base 36
	print "blogwords: ".$blogword.", WP: ".$wordpress.", RSS: ".$rss."\n" if DEBUG;
	if ($blogword + $wordpress + $rss > 3) {
		print "$filename    \tIS BLOG\n";
	} elsif ( $base == 36 ) {
		print "$filename    \tNO BLOG (blogwords: ".$blogword.", WP: ".$wordpress.", RSS: ".$rss.")\n";
	}
}

closedir( $dh );