#! /usr/bin/perl

# To run:
#
#   siptrace-merge [selection-options] file.xml ... >merged.xml
#
# Takes several siptrace XML files and merges them into one.
# Based on timestamps and branch-ids, detects when a packet is logged in 
# more than one file and combines them.  Works only when the clocks are
# tightly synchronized, but usually all the logs were taken from servers
# on one host, so this isn't a problem.  Output copies of packets are
# timestamped with the earliest mention in any of the input files, which
# can cause time-warp effects if not all components of the commserver
# are used as input to siptrace-merge.
#
# "selection-options" allow you to exclude packets you aren't interested in.
#
# --after=timestamp
# excludes all packets whose timestamps are lexically less than 'timestamp'.
#
# --before=timestamp
# excludes all packets whose timestamps are lexically greater than 'timestamp'.
#
# --exclude-method=method (*)
# excludes all packets that are requests or responses with the
# specified method (in the CSeq field).
#
# --include-method=method (*)
# includes only packets that are requests or responses with the
# specified method (in the CSeq field).
#
# --containing=string (*)
# excludes packets that do not contain matches to the specified string
#
# --offset=file:t1->t2
# apply an offset to all the packets in 'file' so that a packet with time
# t1 is merged as if it specified time t2
#
# (*) These values are actually patterns, not strings.  You can use any of
# the Perl regular expression matching features.  Beware that you not only
# have to quote the usual regexp special characters, but also '/', '$', and
# especially '@'

# Amount of clock skew and transit time allowed.
$time_slack = 0.75;

# Set up the clock offset table.
# $clock_offset{$file} is the amount to add to the reported times in $file.
%clock_offset = ();

# Get the Perl library for converting dates from components to
# integral time values.
use Time::Local;

# Load translation table.
$xml_escape{'<'} = '&lt;';
$xml_escape{'>'} = '&gt;';
$xml_escape{'&'} = '&amp;';

# Initialize the debugging level.
$debug = 0;

# Read the options and construct the selection subroutine.
$subroutine = '1 ';
while ($ARGV[0] =~ m/^-/) {
    $option = shift(@ARGV);
    if ($option eq '-d') {
	$debug++;
    } elsif ($option =~ m/^--after=(.*)$/) {
	$subroutine .= ' && ($time ge "' . quotemeta($1) . '") ';
    } elsif ($option =~ m/^--before=(.*)$/) {
	$subroutine .= ' && ($time le "' . quotemeta($1) . '") ';
    } elsif ($option =~ m/^--exclude-method=(.*)$/) {
	$subroutine .= ' && ($entry !~ m/\nCSeq:\s*\d+\s*(' . $1 . ')\b/i) ';
    } elsif ($option =~ m/^--include-method=(.*)$/) {
	$subroutine .= ' && ($entry =~ m/\nCSeq:\s*\d+\s*(' . $1 . ')\b/i) ';
    } elsif ($option =~ m/^--containing=(.*)$/) {
	$subroutine .= ' && ($entry =~ m/' . $1 . '/) ';
    } elsif ($option =~ m/^--offset=(.*)$/) {
	$o = $1;
	die "Invalid --offset option '$o'.\n"
	    unless ($f, $a, $b) = ($o =~ m/^(.*?):(.*)->(.*)$/);
	print STDERR "\$f = $f \$a = $a \$b = $b\n";
	die "Invalid --offset time '$a'.\n"
	    unless $a =~ m%^\d\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\d(\.\d+)?Z?$%;
	die "Invalid --offset time '$b'.\n"
	    unless $b =~ m%^\d\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\d(\.\d+)?Z?$%;
	$clock_offset{$f} = &numeric_time($b) - &numeric_time($a);
    } else {
	die "Unparsable option: '$option'\n";
    }
}
$subroutine =
    "sub select {\nmy(\$entry, \$identifier, \$time) = \@_;\nreturn " .
    $subroutine . ";\n}\n";
print STDERR $subroutine, "\n" if $debug;
eval $subroutine;
die "Constructing 'sub select' failed: $@\n" if $@;

# Read the input file(s), extracting the <branchNode> elements.
%messages = ();
# Assemble the list of input files.
@files = @ARGV;
@files = ('-') if $#files == -1;
# Loop through the files.
foreach $file (@files) {
    if (!open(F, $file)) {
	print STDERR "Error opening file '$file' for input: $!\n";
	print STDERR "Skipping '$file'.\n";
	next;
    }
    # Get the name of the file into XML form.
    $file_xml = $file;
    $file_xml =~ s/(<>&)/$xml_escape{$1}/eg;
    # Get the clock offset for the file.
    $clock_offset = $clock_offset{$file} + 0;
    print STDERR "File $file offset $clock_offset\n" if $debug;
    # Process all the packets in the file.
    while (<F>) {
	next if m%^<\?xml %;
	next if m%^</?sipTrace>%;
	if (m%<branchNode>%) {
	    # Accumulate input until </branchNode> is seen.
	    $element = $_;
	    $identifier = '';
	    $time = '';
	    do {
		$_ = <F>;
		$element .= $_;
		if ($_ =~ m%<branchId>([^<]*)</branchId>%) {
		    $identifier .= $1 . $;;
		} elsif ($_ =~ m%<method>([^<]*)</method>% ||
			 $_ =~ m%<responseCode>([^<]*)</responseCode>%) {
		    $identifier .= $1;
		} elsif ($_ =~ m%<time>(.*)Z</time>%) {
		    $time = $1;
		}
	    } until ($_ =~ m%<\/branchNode>%);
	    # $element is the entire element, $identifier is a string of
	    # the branchId elements and the method or responseCode.
	    # It is a fine-grained identifier of the message, and all
	    # duplicates of this message will have the same identifier.

	    # Check whether this element is to be included.
	    next unless &select($element, $identifier, $time);

	    # Insert the file name into the frameId.
	    $element =~
		s%<frameId>(.*)</frameId>%<frameId>$file_xml:$1</frameId>%;

	    # Calculate the time of the message.
	    $time_value = &numeric_time($time) + $clock_offset;

	    # Find the list of messages with this identifier.
	    $list = $messages{$identifier};
	    if (! defined($list))
	    {
		$list = [];
		$messages{$identifier} = $list;
	    }

	    # Insert this message into the list.
	    &insert_message_into_list($time_value, $element, $list);
	}
    }
    close F;
}

# Read out all the messages, re-indexing them by their time.
%time_list = ();
foreach $key (keys %messages) {
    $list = $messages{$key};
    while (@{$list}) {
        $m = pop(@{$list});
        ($time_value, $element) = @{$m};
        $time_list{$time_value} = $element;
        #print STDERR "b", $time_value, "\n";
        #print STDERR $time_value, "\n";
    }
    #print STDERR "\n";
}

# Write out the messages in time order.
print "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>\n";
print "<sipTrace>\n";
$frame_number = 0;
foreach $key (sort keys %time_list) {
    $t = $time_list{$key};
    $frame_number++;
    $t =~ s%<frameId>%<frameId>$frame_number %;
    print $t;
}
print "</sipTrace>\n";

exit 0;

sub insert_message_into_list {
    my($time_value, $element, $list_ref) = @_;
    my($i);
    
    # Find any matching message in the list.
    for ($i = 0; $i <= $#{$list_ref}; $i++) {
        # Get reference to component.
        my($entry) = ${$list_ref}[$i];
        my($old_time) = ${$entry}[0];
        # Check whether the time matches within $time_slack second.
        #print $time_value - $old_time, "\t", $time_value, "\t", $old_time, "\n";
        if (abs($time_value - $old_time) < $time_slack) {
            # These messages are the same.
            ${$entry}[1] = &combine_elements(${$entry}[1], $element);
            ${$entry}[0] = ($old_time < $time_value) ? $old_time : $time_value;
            return;
        }
    }
    # No match found, append this message to the list.
    push(@{$list}, [$time_value, $element]);
}

sub combine_elements {
    my($message1, $message2) = @_;
    my($t1, $t2);

    #print STDERR $message1, $message2;

    # Get the minimum time.
    ($t1) = $message1 =~ m%<time>(.*)</time>%;
    ($t2) = $message2 =~ m%<time>(.*)</time>%;
    if ($t1 gt $t2) {
        $message1 =~ s%<time>.*</time>%<time>$t2</time>%;
    }

    # Get a source that contains a hyphen.
    ($t2) = $message2 =~ m%<source>(.*)</source>%;
    if ($t2 =~ m%-%) {
        $message1 =~ s%<source>.*</source>%<source>$t2</source>%;
    }

    # Get a destination that contains a hyphen.
    ($t2) = $message2 =~ m%<destination>(.*)</destination>%;
    if ($t2 =~ m%-%) {
        $message1 =~ s%<destination>.*</destination>%<destination>$t2</destination>%;
    }

    # Get a sourceAddress that contains a hyphen.
    ($t2) = $message2 =~ m%<sourceAddress>(.*)</sourceAddress>%;
    if ($t2 =~ m%-%) {
        $message1 =~ s%<sourceAddress>.*</sourceAddress>%<sourceAddress>$t2</sourceAddress>%;
    }

    # Get a destinationAddress that contains a hyphen.
    ($t2) = $message2 =~ m%<destinationAddress>(.*)</destinationAddress>%;
    if ($t2 =~ m%-%) {
        $message1 =~ s%<destinationAddress>.*</destinationAddress>%<destinationAddress>$t2</destinationAddress>%;
    }

    # Combine the frameId's.
    ($t2) = $message2 =~ m%<frameId>(.*)</frameId>%;
    $message1 =~ s%<frameId>(.*)</frameId>%<frameId>$1 $t2</frameId>%;

    return $message1;
}

# Turn text time into numeric value.
sub numeric_time {
    my($t) = @_;
    my($year, $month, $mday, $hour, $minute, $second, $fraction);

    ($year, $month, $mday, $hour, $minute, $second, $fraction) =
	$t =~ m%^(\d\d\d\d)-(\d\d)-(\d\d)T(\d\d):(\d\d):(\d\d)(\.\d+)?%;
    $time_value = timegm($second, $minute, $hour, $mday, $month-1,
			 $year-1900) +
			     $fraction;
    return $time_value;
}
