#!/usr/bin/perl -w

use strict;
#use Time::Local;
use Benchmark;

# All input records in input order; each element is a reference to the array
# of whitespace-separated fields of one input line.
my (@allrecords);


# Group log records into visitor sessions.
#
# Parameters:
#   $records - reference to an array of records. Each record is an array
#              reference; the fields read here are [0] hostname, [1] UserID
#              cookie ("0" when absent), [3] timestamp in seconds and
#              [10] browser. The scan stops at the first record that lies
#              beyond the timeout, so the records are assumed to be sorted
#              by ascending timestamp.
#   $timeout - largest gap in seconds between two consecutive records of the
#              same session.
#
# Returns the list of sessions; each session is a reference to an array of
# the records that belong to it, in input order. The caller's array is not
# modified.
sub extract_sessions {
    my ($records, $timeout) = (@_);

    # Work on a copy. Records that have been assigned to a session are
    # replaced by undef instead of being spliced out.
    my (@todo) = (@$records);
    my (@sessions) = ();

    while (scalar(@todo)) {
        # The next record that is still unassigned opens a new session.
        my $lastrecord;
        do {
            $lastrecord = shift(@todo);
        } while (!$lastrecord && scalar(@todo));
        if (!$lastrecord) {last};

        my (@session) = ($lastrecord);

        # Start with the UserID cookie of the opening record, if it has one.
        # Otherwise a session opened by one user could absorb records of a
        # different user coming from the same host and browser.
        my $userid = defined($lastrecord->[1]) ? $lastrecord->[1] : "0";

        for my $i (0 .. $#todo) {
            my $candidate = $todo[$i];

            # Already assigned to an earlier session.
            next if !$candidate;

            # The timeout is measured from the most recent record of the
            # session, so it moves forward whenever a record is added.
            last if $candidate->[3] > $lastrecord->[3] + $timeout;

            # If we have UserID cookies on both sides, it's easy: the cookie
            # alone decides, whatever host or browser the record came from.
            if ($candidate->[1] ne "0" and $userid ne "0") {
                if ($candidate->[1] eq $userid) {
                    $lastrecord = $candidate;
                    push(@session, $candidate);
                    $todo[$i] = undef;
                }
                next;
            }

            # Ignore AuthUID. A user can pass through several realms (and
            # hence, different uids) in one session.

            # Otherwise use a heuristic: same hostname and same browser.
            # This is somewhat too optimistic. If the host is a multiuser
            # machine, it is quite probable that most users will use the
            # same browser. If the host is a proxy, the probability is
            # smaller but not zero. Let's hope that our site doesn't attract
            # too many users from the same site at once. The referrer could
            # serve as an additional check, but even that helps only if the
            # users are surfing distinct page sets.
            #
            # On the other hand, with hierarchical proxies it is possible
            # for one session to be routed through different proxies.
            if ($candidate->[0] eq $lastrecord->[0] &&
                $candidate->[10] eq $lastrecord->[10]
            ) {
                $lastrecord = $candidate;
                push(@session, $candidate);
                $todo[$i] = undef;

                # Pick up a cookie that first shows up in mid-session.
                if ($userid eq "0" && $candidate->[1] ne "0") {
                    $userid = $candidate->[1];
                }
            }
        }
        push(@sessions, \@session);
    }
    return @sessions;
}

# Format a timestamp (seconds, as accepted by localtime) as an ISO-8601
# style string "YYYY-MM-DDThh:mm:ss" in the local time zone.
sub isodate {
    my $epoch = shift;

    # Pick year, month, day, hour, minute, second out of localtime's list.
    my @parts = (localtime($epoch))[5, 4, 3, 2, 1, 0];
    $parts[0] += 1900;    # localtime counts years from 1900
    $parts[1] += 1;       # ... and months from 0

    return sprintf("%04d-%02d-%02dT%02d:%02d:%02d", @parts);
}

# Print a one-line summary of a record to the currently selected output
# handle: formatted timestamp, hostname, UserID, requested file, referrer and
# browser, separated by single spaces.
sub dump_record_short {
    my $rec = shift;

    my $stamp    = isodate($rec->[3]);
    my $host     = $rec->[0];
    my $userid   = $rec->[1];
    my $file     = $rec->[5];
    my $referrer = $rec->[9];
    my $browser  = $rec->[10];

    print $stamp, " ", $host, " ", $userid, " ", $file, " ",
          $referrer, " ", $browser, "\n";
}
    
# Print every session of the list referenced by $sessions: a numbered header
# line per session followed by one numbered summary line per record.
sub dump_sessions {
    my $sessions = shift;

    my $session_no = 0;
    for my $session (@$sessions) {
        printf "%4d: \n", $session_no++;

        my $record_no = 0;
        for my $record (@$session) {
            printf "    %4d: ", $record_no++;
            dump_record_short($record);
        }
    }
}

# Read every input line (files named on the command line, or STDIN) and keep
# its whitespace-separated fields as one record.
my $t0 = Benchmark->new;
while (my $line = <>) {
    push(@allrecords, [split(' ', $line)]);
}

# Report the number of records, not the index of the last one.
print scalar(@allrecords), " records total\n";

my $t1 = Benchmark->new;
my $td = timediff($t1, $t0);
print STDERR "read: ", timestr($td), "\n";
$t0 = $t1;


# A session ends after ten minutes without a further request.
my @sessions = extract_sessions (\@allrecords, 10 * 60);
print scalar(@sessions), " sessions\n";

$t1 = Benchmark->new;
$td = timediff($t1, $t0);
print STDERR "extract_sessions: ", timestr($td), "\n";

#dump_sessions(\@sessions);
# Scratch loop variable, declared at file scope so that later top-level loops
# can reuse it.
my $i;

my @sesslength;       # $sesslength[n]: number of sessions with n records
my @realsessions;     # sessions with more than one record
my %finalpages;       # file => times it was the last record of a session
my %nonfinalpages;    # file => times it was followed by another record
my %sessnext;         # file => [ total, { following file => count } ]
my %sessprev;         # file => [ total, { preceding file => count } ]
my %refprev;          # file => [ total, { referrer => count } ]

for my $session (@sessions) {
    my $length = scalar(@$session);
    $sesslength[$length]++;
    push(@realsessions, $session) if $length > 1;

    # Count each pair of consecutive records in both directions.
    my $last = $#$session;
    for my $pos (0 .. $last - 1) {
        my $file = $session->[$pos][5];
        my $next = $session->[$pos + 1][5];

        $nonfinalpages{$file}++;

        $sessnext{$file} ||= [ 0, {} ];
        $sessnext{$file}->[0]++;
        $sessnext{$file}->[1]->{$next}++;

        $sessprev{$next} ||= [ 0, {} ];
        $sessprev{$next}->[0]++;
        $sessprev{$next}->[1]->{$file}++;
    }

    # The last record of the session has no successor.
    $finalpages{$session->[$last][5]}++;

    # Count the referrer of every record, independent of session order.
    for my $hit (@$session) {
        my $file     = $hit->[5];
        my $referrer = $hit->[9];

        $refprev{$file} ||= [ 0, {} ];
        $refprev{$file}->[0]++;
        $refprev{$file}->[1]->{$referrer}++;
    }
}
# Histogram: number of sessions for each session length that occurred.
print "Session Length histogram:\n";
for my $length (0 .. $#sesslength) {
    next if !$sesslength[$length];
    printf("%3d %3d\n", $length, $sesslength[$length]);
}

# Print a flat "key: count" table, highest count first.
my $print_counts = sub {
    my ($counts) = @_;
    for my $key (sort { $counts->{$b} <=> $counts->{$a} } (keys %$counts)) {
        print "  ", $key, ": ", $counts->{$key}, "\n";
    }
};

# Print a two-level table whose values are [ total, { detail => count } ]:
# entries ordered by descending total, each followed by its details ordered
# by descending count.
my $print_breakdown = sub {
    my ($table) = @_;
    for my $key (sort { $table->{$b}->[0] <=> $table->{$a}->[0] } (keys %$table)) {
        print "  ", $key, ": ", $table->{$key}->[0], "\n";
        my $detail = $table->{$key}->[1];
        for my $item (sort { $detail->{$b} <=> $detail->{$a} } (keys %$detail)) {
            print "    ", $item, ": ", $detail->{$item}, "\n";
        }
    }
};

print "Final pages:\n";
$print_counts->(\%finalpages);

print "Nonfinal pages:\n";
$print_counts->(\%nonfinalpages);

print "Previous in Session:\n";
$print_breakdown->(\%sessprev);

print "Next in Session:\n";
$print_breakdown->(\%sessnext);

print "Referrer:\n";
$print_breakdown->(\%refprev);



dump_sessions(\@realsessions);

# vim:tw=132
