﻿#!/usr/bin/perl -w

# erkennt waehrungen und klassifiziert als katalog, falls > 3
# filtert (kleiner trick vorerst) files mit "expenses && incomes? && balance" -> 41


# ------------------------  HISTORY -----------------------
# 19.10.05	new			created from find_code1.pl, version 14.10.
# 19.10.05	added		feature currency (without uncoded euro etc.)
# 20.10.05	added		feature shop-words: shop, cart, buy, price, offer, shopping
# 21.10.05	changed		added (wrong) EUR 1,00 to currency-regexp
#						added &#36; uncoded euro and pound to currency-regexp
#			deleted		da currency alleine prima reicht, del rest.
# 25.10.05	added		readdir
# 30.10.05	added 		charset-recognition
# 23.11.05	added		Astro::guess_charset, USE_CLEAN
# 02.12.05	added		search 1 input-text, min 3 elems, filter annual report
# 24.01.06	changed 	dir -> base
# 12.02.06	changed		"<a" erlaubt nach preis, auch ohne formular katalog

$VERSION = '24.01.06';

#--------------------------------------------------------------
# pragmas & modules

use strict;
use HTML::Entities;
use Encode;
require 5.004;

use utf8;
binmode(STDOUT, ":utf8"); 

#use lib ('../helpers');
#use Astro;

# --------- vars -------------
use constant DEBUG		=> 0;			# 0: off, 1: on
use constant USE_CLEAN 	=> 1;			# 0: off, 1: on

my $base = $ARGV[0] || "8";
my $dirBase = $ARGV[1] || "train";
my $oneFile = $ARGV[2] || undef;
my $dir = "../Korpus/".$dirBase."/".$base."/";

my $filename;
my $text = "";
my $num_files = 50;

# regex
# currencies
my $price_base = qr/(?:(\$|£|€|(&#36;)|(&#163;)|(&#xA[3C];)|(&euro;)|(&pound;)|(eur)) ?\d+(,\d{3})*([,\.]\d{2})?)|(?:\d+(,\d{3})*(\.\d{2})? eur)/;
my $price = qr/(?:$price_base ?(?:-|(?:to)) ?)?$price_base/;

# ----------- main -----------
undef $/;

# open dir
opendir( DH, $dir ) || die "dir not found!";
print $dir, "\n", "-" x 50, "\n";
chdir( $dir );

# ------ process files -------
while ( $filename = readdir( DH ) ) {
	next unless !defined($oneFile) || $filename eq $oneFile;
	next if $filename =~ /^\./;
	last if $num_files-- == 0;
	print STDERR ".";
	print $filename.": " if DEBUG;
	
	# ------- calculate features  ----------
	my $text = "";
	my $nofile = 0;
	my $currency = 0;
	my $textlength = 0;
	
	# read content
	open( FH, $filename ) || die " does not exist";
	binmode(FH, ":utf8"); 
	$text = <FH>;
	close( FH );
	
	# decode
	if (!USE_CLEAN) {
		my $charset = guess_charset($dir.$filename, "e");
		print STDERR "(".$charset.") "			if DEBUG;
		$text = decode_utf8( $text ) 			if $charset eq "UTF8";
		$text = decode( 'ascii', $text )		if $charset eq "ASCII";
		$text = decode( 'iso-8859-1', $text )	if $charset eq "ISO-1";
		$text = decode( 'iso-8859-15', $text )	if $charset eq "ISO-15";
		$text = decode( 'cp1252', $text )		if $charset eq "WINDOWS";
		$text = decode( 'MacRoman', $text )		if $charset eq "MAC";
	}
		
	# remove text in <script>, <head>, <style>
	$text =~ s/(?:<(head|script|style|noframes|select)[^>]*>.*?<\/\1>)|(?:<!--.*?-->)//gsi;

	# formulare
	my $form = 0;
	while ( $text =~ /<input[^>]*?text[^>]*?>|<input ((?!type)[^>])*>/gio ) {
		$form++;
	}

	$text = decode_entities( $text );
	
	# currency
	#while ( $text =~ /$price(?:\)|\s)*?(?:(?:<br)|(?:<\/\w+?>))/gsio ) {
	while ( $text =~ m#($price(\s|\))*?(<a|<br|</))#gsio ) {
		$currency++;
	}

	$text =~ s/<[^>]*?>//gs;
	$text =~ s/\s+/ /go;
	my $cur_rel = sprintf "%.3f", 1000*$currency/length( $text );


	print "currency-symbols: ".$currency.", "	if DEBUG;
	if ($currency > 3 && $form > 0 && $cur_rel > 1.5 || $cur_rel > 3 && $cur_rel < 20 && $currency > 5) {
		print "$filename  \t$currency, $form, $cur_rel\tIS KATALOG\n";
	} elsif ($base == 8) {
		print "$filename  \t$currency, $form, $cur_rel\tNO KATALOG\n";
	}
}
closedir( DH );
