#!/usr/bin/perl -w
# A script to turn Penn like trees to dot graphs which can be rendered.

use strict;
$|++; # Making sure we see all the error messages.

# Reading a complete tree
# It can read trees depending on matching brackets and possibly broken across lines.
# TODO: Support for Tags of words
# TODO: Detecting scores

my ($line, $tree, $brackets) = ("", "", 0);

do {
	$line = <>;
	chomp $line;
	$line =~ s/^\s*//g;
	$line =~ s/\s*$//g;
	$line .= " ";
	$brackets += () = $line =~ m/\(/g;
	$brackets -= () = $line =~ m/\)/g;
	$tree .= $line;
} while($brackets>0);

die "Error in parsing." if $brackets<0;
print "##  Parsing the following tree:\n# $tree \n\n";


########################
# The global node counter.
my $number_nodes = 0;

# On every call, it returns a unique root number.
sub get_new_node_number {
	$number_nodes++;
	return $number_nodes;
}

my (@leaf_nodes, @preterminal_nodes) = ( (), () ); 
print "digraph Parse_tree {\n";
f_node($number_nodes, $tree);

print "\t{rank=same; ".join(" ",@leaf_nodes)." };\n";
print "\t{rank=same; ".join(" ",@preterminal_nodes)." };\n";
print "}\n";

# This deals with non-terminal nodes.
# Assumed to be of the form: (name (<rest_of_the_tree>))
sub f_node {
	my $root_number = shift;
	my $rest = shift;
	my ($new_root, $children, $left);
	if (($new_root, $children) = $rest =~ m/\((\S+)\s+(\(.+\)+)\)/) {
		# Found a new node.
		my $new_root_number = &get_new_node_number;
		print "\tN$new_root_number [label=\"$new_root\"];\n";
		# Avoiding the unnecessary first node above the ROOT node.
		print "\tN$root_number -> N$new_root_number;\n" if $root_number > 0;
		while( ($children, $left) = get_children($children) ) {
			print "\t// Working with $children.\n\n";
			f_node($new_root_number, $children);
			$children = $left;
		}
	} else {
		print "\t// Found leaf:$rest\n\n";
		f_leaf($root_number, $rest);
	}
}

# This finds a proper child, balancing parenthesis and returning
# the leftover string.
sub get_children {
	my $str = shift;
	my $original = $str;

	# Fail if there is no starting parenthesis.
	return if $original !~ m/\s*\(/;
	$str =~ s/^\s*//;

	my ($child, $left, $counter, $change) = ( "", "", 0, 0);
	do {
		$change = 0;
		# Encountered an opening bracket.
		if ($str =~ m/^[^\)]*?\(/) {
			$change = 1;
			$child .= $&;
			$str = $';
			++$counter;
		}
		# Encountered a closing bracket.
		if ($str =~ m/^[^\(]*?\)/) {
			$change = 1;
			$child .= $&; 
			$str = $';
			--$counter;
		}
	} while($change && $counter!=0);

	die "Ill formed expression:$original\n" if $counter != 0;

	return $child, $str;
}

# This deals with the leaf nodes
# Assumes (name value), or the occasional (value) form.
sub f_leaf {
	my $root_number = shift;
	my $rest = shift;
	my ($name, $value);
	if( ($name, $value) = $rest =~ m/\((.+)\s+(.+)\)/ ) {
		my $new_root_number = &get_new_node_number;
		print "\t// $name -> $value\n\n";
		print "\tN$new_root_number [label=\"$name\" style=filled fillcolor=violet];\n";
		print "\tN$root_number -> N$new_root_number;\n";
		my $new_value_number = &get_new_node_number;
		print "\tN$new_value_number [label=\"$value\" style=filled fillcolor=lightblue];\n";
		print "\tN$new_root_number -> N$new_value_number;\n";
		push (@leaf_nodes, "N$new_value_number");
		push (@preterminal_nodes, "N$new_root_number");
	} else {
		($value) = $rest =~ m/\((.*)\)/;
		my $new_value_number = &get_new_node_number;
		print "\t// $value was stray\n\n";
		print "\tN$new_value_number [label=\"$value\" style=filled fillcolor=red];\n";
		print "\tN$root_number -> N$new_value_number;\n";
		push (@leaf_nodes, "N$new_value_number");
		push (@preterminal_nodes, "N$root_number");
	}
}


# This is to sanitize the output so that dot can read it without fear.
sub sanitize {
	my $str = shift;
	$str =~ s/\\/\\\\/g;
	$str =~ s/"/\\"/g;
	return $str;
}
