#!/usr/bin/perl -w

# erkennt code aus <pre>, <code>, code.txt, identifiern, ; am zeilenende


# ------------------------  HISTORY -----------------------
# 09.01.06	new			created from find_code_opt.pl, version 23.11.
# 24.01.06	changed 	dir -> base

$VERSION = '24.01.06';

#--------------------------------------------------------------
# pragmas & modules

use strict;
use HTML::Entities;
use Encode;
require 5.004;

binmode(STDOUT, ":utf8"); 

#use lib ('../helpers');
#use Astro;

# --------- vars -------------
use constant DEBUG		=> 0;			# 0: off, 1: on
use constant USE_CLEAN 	=> 1;			# 0: off, 1: on

my $base = $ARGV[0] || "32";
my $dirBase = $ARGV[1] || "train";
my $oneFile = $ARGV[2] || undef;
my $dir = "../../korpus/clean/".$dirBase."korpus/".$base."/";

my $filename;
my $text = "";
my $num_files = 50;

# --------- regex ------------
# aus wortlisten/level1.txt
my $codewords_exp = qr/#include\W|\Wtypedef\W\Wforeach\W|\Wfor_each\W|\Wstd::|\WSystem::|\.io\W|\WWriteLine\W|\WReadLine\W|\Wstackalloc\W|\Wuint\W|\Wulong\W|\Wushort\W|\Wgoto\W|\Welsif\W|\$ARGV\W|\.out\W|\Winstanceof\W|\Wstrictfp\W|\Wenum\W|\Wsizeof\W|<\?php\W|\Wrequire_once\W|\Winclude_once\W|\Wfopen\W|\Wfclose\W|\Warray_push\W|\Warray_pop\W|\Wmysql_connect\W|\Wmysql_query\W|\Wargs\W|<html|<script|<body|<head|<style|<table|<div|\Wstdio\W|\Wstdlib\W|\Wtypeof\W|\WHTML::|\WCGI::|\WCPAN::|\WExtUtils::|\WFile::|\WIO::|\WGetopt::|\WIPC::|\WMath::|\WNet::|\WPod::|\WSys::|\WTerm::|\WTest::|\WText::|\WThread::|\WTie::|\WTime::|\WUser::|\WApache::|\Wreadline\W|\Wprintf\W|\Wsprintf\W|\Wopendir\W|\Wbinmode\W|\Wdbmopen\W|\Wfcntl\W|\Wfileno\W|\Wgetc\W|\Wquotemeta\W|\Wsubstr\W|\Wwaitpid\W|\Wdispinterface\W|\Wdownto\W|\Wresourcestring\W|\Wthreadvar\W|\.PI\WW|\.randomW|\.cosW|\.sinW|\.tanW|\.ceilW|\.floorW|\.roundW|\.powW|\.toLowerCaseW|\.toUpperCaseW|\.splitW|\.sliceW|\.spliceW|\.charAtW|\.textW|\.lengthW|\.alertW|\.focusW|setIntervalW|\.lineToW|\.moveToW|\.heightW|\.widthW|\._xW|\._yW|\._widthW|\._heightW|\.appendW|raw_inputW|\.lowerW|\.upperW|divmodW|\.indexW|\.removeW|\.keysW|\.valuesW|gotoAndStopW|gotoAndPlayW|\.toStringW|\.popW|\.pushW|\.shift\W|\Wsubroutine\W|\WMath\.PI|\._rotation|\._x|\._y|\WcreateEmptyMovieClip\W/i;

# typische identifier: mit_underscore, $i, camelCase, var2
my $ident_exp = qr/[^a-zA-Z_](?:[A-Z]?[a-z]+(?:(?:[A-Z0-9]+|_)[a-z_]*)+)[^a-zA-Z_0-9]|(?:\$[a-zA-Z_][a-zA-Z0-9_]*)[^a-zA-Z0-9_]/;

# ----------- main -----------
undef $/;

# open dir
opendir( DH, $dir ) || die "dir not found!";
print $dir, "\n", "-" x 50, "\n";
chdir( $dir );

# ------ process files -------
DIR: while ( $filename = readdir( DH ) ) {
	next unless !defined($oneFile) || $filename eq $oneFile;
	next if $filename =~ /^\./;
	last if $num_files-- == 0;

	print STDERR ".";
	
	# ------- calculate features  ----------
	my $text = "";
	my $codetags = 0;
	my $lev1 = 0;
	my $semicolon = 0;
	my $ident = 0;
	my $code_features = 0;
	
	# read content
	open( FH, $filename ) || die "$filename does not exist";
	binmode(FH, ":utf8"); 
	$text = <FH>;
	close( FH );
	
	# decode
	if (!USE_CLEAN) {
		my $charset = guess_charset($dir.$filename, "e");
		print STDERR "(".$charset.") "			if DEBUG;
		$text = decode_utf8( $text ) 			if $charset eq "UTF8";
		$text = decode( 'ascii', $text )		if $charset eq "ASCII";
		$text = decode( 'iso-8859-1', $text )	if $charset eq "ISO-1";
		$text = decode( 'iso-8859-15', $text )	if $charset eq "ISO-15";
		$text = decode( 'cp1252', $text )		if $charset eq "WINDOWS";
		$text = decode( 'MacRoman', $text )		if $charset eq "MAC";
	}
	
	# DON'T remove text in <script>, <head>, <style> !!
	# $text =~ s/<((?:head)|(?:script)|(?:style)|(?:noframes))>.*?<\/\1>//gsoi;

	while ( $text =~ /<pre\W|<xmp\W|<code\W|<samp\W|<font\W[^>]*?courier|<[^>]+\s+class\W[^>]*?(?:preformat|pre|code(?!base))/gsio ) {
		$codetags++;
	}

	$code_features++			if $codetags > 0;
	print  "tags - "			if $codetags > 0 && DEBUG;
	
	# \n -> <br> in <pre>, <xmp>; delete \s+
	my @subs;
	my @orig;
	while ( $text =~ /<(pre|xmp)>(.*?)<\/\1>/gsoi ) {
		my $sub = $2;
		my $org = $2;
		$sub =~ s#[\n\r]#::BREAK::#gso;
		$sub =~ s#<#&lt;#gsio 	if defined $1 && $1 eq "xmp";
		$sub =~ s#>#&gt;#gsio 	if defined $1 && $1 eq "xmp";
		push(@subs, $sub);
		push(@orig, as_quotemeta($org));
	}
	for (my $k=0; $k<@subs; $k++) {
		$text =~ s/$orig[$k]/$subs[$k]/;
	}

	$text =~ s/\s+/ /gs;
	
	$text =~ s#<br>|<br ?/>|<br/>|</p>|</div>|::BREAK::#\n#gsio;	#br2nl
	$text =~ s/<([^>]+?)>//gs;
	
	$text = decode_entities( $text );			# NOW: decode entities
	$text =~ s/[`´‘’˚']/'/gio;
	$text =~ s/[“”]/"/gio;

	# identifier
	while ( $text =~ /$ident_exp/gos ) {
		$ident++;
	}
	$code_features++			if $ident > 5;
	print  "identifier - "		if $ident > 5 && DEBUG;
	
	if ( $code_features > 2 ) {
		print "$filename  \tIS CODE\n";
		next DIR;
	}
	
	# code.txt
	while ( $text =~ /$codewords_exp/igos ) {
		$lev1++;
	}
	$code_features++			if $lev1 > 0;
	print  "level 1 - "			if $lev1 > 0 && DEBUG;
	
	if ( $code_features > 2 ) {
		print "$filename  \tIS CODE\n";
		next DIR;
	}

	# ; am zeilenende
	my @lines = split /\n/, $text;

	my $isCode = 1;
	my $blocksize = 0;
	my @block;
	my $biggestblock;
	
	foreach (@lines) {
		next if $_ =~ m@^\s*$|^\s*#|^\s*/\*|^\s*//@; # skip blank lines && comments
		if (length($_) < 100 && !$isCode && $_ =~ /[;{}>]\s*$/ && $_ !~ /(.*?;){3,}/) {
			$isCode = 1;
			@block = ();
		}
		if (length($_) < 100 && $isCode && $_ =~ /[;{}>]\s*$/) {
			$isCode = 1;
			push(@block, $_);
		} else {
			if (@block > $blocksize) {
				$blocksize = @block;
				$biggestblock = join("\n", @block);
			}
			$isCode = 0;
		}
	}
	if (@block > $blocksize) {
		$blocksize = @block;
		$biggestblock = join("\n", @block);
	}

	$code_features++						if $blocksize > 3;
	print  "semicolon: ".$blocksize." - "	if $blocksize > 3 && DEBUG;
	
	if ( $code_features > 2 ) {
		print "$filename  \tIS CODE\n";
		next DIR;
	}

	print "\n" if DEBUG;
}
closedir( DH );


# ------- init hashes, as_quotemeta --------

sub as_quotemeta {
	my $t = shift;
	$t =~ s/\\/\\\\/gs;
	$t =~ s/\[/\\\[/gs;
	$t =~ s/\]/\\\]/gs;
	$t =~ s/\{/\\\{/gs;
	$t =~ s/\}/\\\}/gs;
	$t =~ s/\?/\\\?/gs;
	$t =~ s/\+/\\\+/gs;
	$t =~ s/\*/\\\*/gs;
	$t =~ s/\./\\\./gs;
	$t =~ s/\-/\\\-/gs;
	$t =~ s/\$/\\\$/gs;
	$t =~ s/\^/\\\^/gs;
	$t =~ s/\(/\\\(/gs;
	$t =~ s/\)/\\\)/gs;
	$t =~ s#/#\/#gs;
	return $t;
}