#!/usr/bin/perl -w

# erkennt code aus <pre>, <code>, code.txt, identifiern, ; am zeilenende
# exclusive


# ------------------------  HISTORY -----------------------
# 09.01.06	new			created from find_code_opt.pl, version 23.11.
# 24.01.06	changed 	dir -> base

$VERSION = '24.01.06';

#--------------------------------------------------------------
# pragmas & modules

use strict;
use HTML::Entities;
use Encode;
require 5.004;

binmode(STDOUT, ":utf8"); 

#use lib ('../helpers');
#use Astro;

# --------- vars -------------
use constant DEBUG		=> 0;			# 0: off, 1: on
use constant USE_CLEAN 	=> 1;			# 0: off, 1: on

my $base = $ARGV[0] || "32";
my $dirBase = $ARGV[1] || "train";
my $oneFile = $ARGV[2] || undef;
my $dir = "../Korpus/".$dirBase."/".$base."/";

my $filename;
my $text = "";
my $num_files = 50;

# --------- regex ------------
# aus wortlisten/level1.txt
my $codewords_exp = qr/#include\b|\btypedef\b\bforeach\b|\bfor_each\b|\bstd::|\bSystem::|\.io\b|\bWriteLine\b|\bReadLine\b|\bstackalloc\b|\buint\b|\bulong\b|\bushort\b|\bgoto\b|\belsif\b|\$ARGV\b|\.out\b|\binstanceof\b|\bstrictfp\b|\benum\b|\bsizeof\b|<\?php\b|\brequire_once\b|\binclude_once\b|\bfopen\b|\bfclose\b|\barray_push\b|\barray_pop\b|\bmysql_connect\b|\bmysql_query\b|\bargs\b|<html|<script|<body|<xsl|<xml|<head|<style|<table|<div|\bstdio\b|\bstdlib\b|\btypeof\b|\bHTML::|\bCGI::|\bCPAN::|\bExtUtils::|\bFile::|\bIO::|\bGetopt::|\bIPC::|\bMath::|\bNet::|\bPod::|\bSys::|\bTerm::|\bTest::|\bText::|\bThread::|\bTie::|\bTime::|\bUser::|\bApache::|\breadline\b|\bprintf\b|\bsprintf\b|\bopendir\b|\bbinmode\b|\bdbmopen\b|\bfcntl\b|\bfileno\b|\bgetc\b|\bquotemeta\b|\bsubstr\b|\bwaitpid\b|\bdispinterface\b|\bdownto\b|\bresourcestring\b|\bthreadvar\b|\.PI\b|\.random\b|\.cosW|\.sinW|\.tanW|\.ceilW|\.floorW|\.roundW|\.poWW|\.toLoWerCaseW|\.toUpperCaseW|\.splitW|\.sliceW|\.spliceW|\.charAtW|\.textW|\.lengthW|\.alertW|\.focusW|setIntervalW|\.lineToW|\.moveToW|\.heightW|\.WidthW|\._xW|\._yW|\._WidthW|\._heightW|\.appendW|raW_inputW|\.loWerW|\.upperW|divmodW|\.indexW|\.removeW|\.keysW|\.valuesW|gotoAndStopW|gotoAndPlayW|\.toStringW|\.popW|\.pushW|\.shift\b|\bsubroutine\b|\bMath\.PI|\._rotation|\._x|\._y|\bcreateEmptyMovieClip\b|\bstdin\b|\bstdout\b|\bexec\b|\bconcat\b/i;

# typische identifier: mit_underscore, $i, camelCase, var2
my $ident_exp = qr/[^a-zA-Z_](?:[A-Z]?[a-z]+(?:(?:[A-Z0-9]+|_)[a-z_]*)+)[^a-zA-Z_0-9]|(?:\$[a-zA-Z_][a-zA-Z0-9_]*)[^a-zA-Z0-9_]/;

# ----------- main -----------
undef $/;

# open dir
opendir( DH, $dir ) || die "dir not found!";
print $dir, "\n", "-" x 50, "\n";
chdir( $dir );

# ------ process files -------
DIR: while ( $filename = readdir( DH ) ) {
	next unless !defined($oneFile) || $filename eq $oneFile;
	next if $filename =~ /^\./;
	last if $num_files-- == 0;

	print STDERR ".";
	
	# ------- calculate features  ----------
	my $text = "";
	my $codetags = 0;
	my $lev1 = 0;
	my $semicolon = 0;
	my $ident = 0;
	my $code_features = 0;
	
	# read content
	open( FH, $filename ) || die "$filename does not exist";
	binmode(FH, ":utf8"); 
	$text = <FH>;
	close( FH );
	
	# decode
	if (!USE_CLEAN) {
		my $charset = guess_charset($dir.$filename, "e");
		print STDERR "(".$charset.") "			if DEBUG;
		$text = decode_utf8( $text ) 			if $charset eq "UTF8";
		$text = decode( 'ascii', $text )		if $charset eq "ASCII";
		$text = decode( 'iso-8859-1', $text )	if $charset eq "ISO-1";
		$text = decode( 'iso-8859-15', $text )	if $charset eq "ISO-15";
		$text = decode( 'cp1252', $text )		if $charset eq "WINDOWS";
		$text = decode( 'MacRoman', $text )		if $charset eq "MAC";
	}
	
	# DON'T remove text in <script>, <head>, <style> !!
	# $text =~ s/<((?:head)|(?:script)|(?:style)|(?:noframes))>.*?<\/\1>//gsoi;

	while ( $text =~ /<pre\W|<xmp\W|<code\W|<samp\W|<font\W[^>]*?courier|<[^>]+\s+class\W[^>]*?(?:preformat|pre|code(?!base))/gsio ) {
		$codetags++;
	}

	$code_features++			if $codetags > 0;
	print  "tags - "			if $codetags > 0 && DEBUG;
	
	# \n -> <br> in <pre>, <xmp>; delete \s+
	my @subs;
	my @orig;
	while ( $text =~ /<(pre|xmp)>(.*?)<\/\1>/gsoi ) {
		my $sub = $2;
		my $org = $2;
		$sub =~ s#[\n\r]#::BREAK::#gso;
		$sub =~ s#<#&lt;#gsio 	if defined $1 && $1 eq "xmp";
		$sub =~ s#>#&gt;#gsio 	if defined $1 && $1 eq "xmp";
		push(@subs, $sub);
		push(@orig, as_quotemeta($org));
	}
	for (my $k=0; $k<@subs; $k++) {
		$text =~ s/$orig[$k]/$subs[$k]/;
	}

	$text =~ s/\s+/ /gs;
	
	$text =~ s#<br>|<br ?/>|<br/>|</p>|</div>|::BREAK::#\n#gsio;	#br2nl
	$text =~ s/<([^>]+?)>//gs;
	
	$text = decode_entities( $text );			# NOW: decode entities
	$text =~ s/[`´‘’˚']/'/gio;
	$text =~ s/[“”]/"/gio;
	my $length = length( $text );

	# identifier
	while ( $text =~ /$ident_exp/gos ) {
		$ident++;
	}
	$code_features++			if $ident > 5;
	print  "identifier - "		if $ident > 5 && DEBUG;
	
	# code.txt
	while ( $text =~ /$codewords_exp/igos ) {
		$lev1++;
	}
	$code_features++			if $lev1 > 0;
	print  "level 1 - "			if $lev1 > 0 && DEBUG;
	
	# ; am zeilenende
	my @lines = split /\n/, $text;

	my $isCode = 1;
	my $blocksize = 0;
	my @block;
	my $biggestblock;
	
	foreach (@lines) {
		next if $_ =~ m@^\s*$|^\s*#|^\s*/\*|^\s*//@; # skip blank lines && comments
		if (length($_) < 100 && !$isCode && $_ =~ /[;{}>]\s*$/ && $_ !~ /(.*?;){3,}/) {
			$isCode = 1;
			@block = ();
		}
		if (length($_) < 100 && $isCode && $_ =~ /[;{}>]\s*$/) {
			$isCode = 1;
			push(@block, $_);
		} else {
			if (@block > $blocksize) {
				$blocksize = @block;
				$biggestblock = join("\n", @block);
			}
			$isCode = 0;
		}
	}
	if (@block > $blocksize) {
		$blocksize = @block;
		$biggestblock = join("\n", @block);
	}

	$code_features++						if $blocksize > 3;
	print  "semicolon: ".$blocksize." - "	if $blocksize > 3 && DEBUG;
	
	$blocksize = sprintf "%.3f", 10000*$blocksize/$length;
	$lev1 = sprintf "%.3f", 10000*$lev1/$length;
	$ident = sprintf "%.3f", 10000*$ident/$length;
	
	if ( $code_features > 2 && $blocksize+$lev1+$ident > 100 ) {
		print "$filename  ($blocksize, $lev1, $ident, $codetags) \tIS CODE\n";
	} elsif ( $dir =~ /32/ ) {
		print "$filename  ($blocksize, $lev1, $ident, $codetags) \tNO CODE\n";
	}

	print "\n" if DEBUG;
}
closedir( DH );


# ------- init hashes, as_quotemeta --------

sub as_quotemeta {
	my $t = shift;
	$t =~ s/\\/\\\\/gs;
	$t =~ s/\[/\\\[/gs;
	$t =~ s/\]/\\\]/gs;
	$t =~ s/\{/\\\{/gs;
	$t =~ s/\}/\\\}/gs;
	$t =~ s/\?/\\\?/gs;
	$t =~ s/\+/\\\+/gs;
	$t =~ s/\*/\\\*/gs;
	$t =~ s/\./\\\./gs;
	$t =~ s/\-/\\\-/gs;
	$t =~ s/\$/\\\$/gs;
	$t =~ s/\^/\\\^/gs;
	$t =~ s/\(/\\\(/gs;
	$t =~ s/\)/\\\)/gs;
	$t =~ s#/#\/#gs;
	return $t;
}