#!/usr/bin/perl -w

use strict;

use DBI;
use Net::FTP;

use Data::Dumper;

my $DEBUG = 0;

$|++;
print "started @{[ scalar localtime ]}\n";

my $db = 'seqdb_demo';
my $user = 'seqdb_user';     # db user name here
my $password = 'seqdb_pass';  # db password for this user here
my $host = '';  # add hostname if db running on different machine than
                # update script

# where's our working directory?
my $taxdir = shift(@ARGV) || "/default/dir/taxonomy";

# remove trailing directory separator, if necessary:
$taxdir =~ s!/$!!;

# go get the files we need:
unless ($DEBUG) {
    my $ftp = Net::FTP->new('ftp.ncbi.nlm.nih.gov');
    $ftp->login('anonymous', 'my@address.com');
    $ftp->cwd('/pub/taxonomy');
    $ftp->get('taxdump.tar.gz', "$taxdir/taxdump.tar.gz");
    $ftp->get('gi_taxid_prot.dmp.gz', "$taxdir/gi_taxid_prot.dmp.gz");
    $ftp->quit();

    # unpack them; overwrite previous files, if necessary
    system("gunzip -f $taxdir/taxdump.tar.gz");
    system("cd $taxdir; tar -xf taxdump.tar ; rm -f taxdump.tar");
    system("gunzip -f $taxdir/gi_taxid_prot.dmp.gz");
}

my $connect = "dbi:mysql";
$connect .= "(AutoCommit=>1):database=$db";

if ($host) {
    $connect .= ";host=$host";
}

my $dbh = DBI->connect($connect,
		       $user,
		       $password,
		       { RaiseError => 1,
			 AutoCommit => 1,
			 PrintError => 1
		       }
		      ) or die $DBI::errstr;

my %sth = (
	   add_gc => q{
INSERT INTO taxon_gc (gc_id, name, cde, starts) VALUES (?, ?, ?, ?)
},
	   upd_gc => q{
UPDATE taxon_gc SET name = ?, cde = ?, starts = ? WHERE gc_id = ?
},
	   del_gc => q{
DELETE FROM taxon_gc WHERE gc_id = ?
},

	   add_tax => q{
INSERT INTO taxon (taxon_id, parent_id, rank, gc_id, mgc_id) VALUES (?, ?, ?, ?, ?)
},
           upd_tax => q{
UPDATE taxon SET parent_id = ?, rank = ?, gc_id = ?, mgc_id = ? WHERE taxon_id = ?
},
           del_tax => q{
DELETE FROM taxon WHERE taxon_id = ?
},

	   add_taxname => q{
INSERT INTO taxon_name (taxon_id, name, class) VALUES (?, ?, ?)
},
           upd_taxname => q{
UPDATE taxon_name SET taxon_id, name = ?, class = ? WHERE id = ?
},
           del_taxname => q{
DELETE FROM taxon_name WHERE id = ?
},

	   add_gitax => q{
UPDATE annot SET taxon_id = ? WHERE gi = ?
},
           upd_gitax => q{
UPDATE annot SET taxon_id = ? WHERE gi = ?
},
           del_gitax => q{
UPDATE annot SET taxon_id = 0 WHERE gi = ?
},

	   get_children => q{
SELECT taxon_id FROM taxon WHERE parent_id = ?
},
	   set_left => q{
UPDATE taxon SET left_id = ? WHERE taxon_id = ?
},
	   set_right => q{
UPDATE taxon SET right_id = ? WHERE taxon_id = ?
},
	  );

# prepare all our statements
@sth{keys %sth} = map { $dbh->prepare($_) } values %sth;

my @locktables = qw(taxon_gc taxon taxon_name annot);

# lock all the tables we'll need:
$dbh->do('LOCK TABLES ' . join(", ", map { $_ .= ' WRITE' } @locktables));

our @new;
our @old;
my ($ins, $upd, $del, $nas);

###### enter the genetic codes:

@old = @{
    $dbh->selectall_arrayref(q{SELECT gc_id, name, cde, starts FROM taxon_gc ORDER BY gc_id}) || []
};

open(GC, "<$taxdir/gencode.dmp") or die $!;
while (<GC>) {
    push @new, [ (split(/\s*\|\s*/o, $_))[0, 2..4] ];
}
close(GC);

($ins, $upd, $del, $nas) =
    handle_diffs(\@old,
		 \@new,
		 sub { return $sth{add_gc}->execute(@_) },
		 sub { return $sth{upd_gc}->execute(@_[1..3,0]) },
		 sub { return $sth{del_gc}->execute(@_[0..0]) }
		);

print "    Genetic codes: $ins inserted, $upd updated, $del deleted, $nas untouched\n";
@new = @old = ();

##### enter the taxonomy nodes:

@old = @{
    $dbh->selectall_arrayref(q{SELECT taxon_id, parent_id, rank, gc_id, mgc_id FROM taxon ORDER BY taxon_id}) || []
};

open(TAX, "<$taxdir/nodes.dmp") or die $!;
while (<TAX>) {
    push @new, [ (split(/\s*\|\s*/o, $_))[0..2, 6, 8] ];
}
close(TAX);

($ins, $upd, $del, $nas) =
    handle_diffs(\@old,
		 \@new,
		 sub { return $sth{add_tax}->execute(@_) },
		 sub { return $sth{upd_tax}->execute(@_[1..4,0]) },
		 sub { return $sth{del_tax}->execute(@_[0..0]) }
		);

print "    Taxonomy nodes: $ins inserted, $upd updated, $del deleted, $nas untouched\n";
@new = @old = ();

##### enter the taxonomy names:

open(NAMES, "<$taxdir/names.dmp") or die $!;

# remove what's already there - we'll replace it
$dbh->do(q{DELETE FROM taxon_name});

while (<NAMES>) {
    my @data = split(/\s*\|\s*/o, $_);
    $sth{add_taxname}->execute(@data[0, 1, 3]);
}
close(NAMES);

##### assign the taxon_id's to gi's:

@old = @{
    $dbh->selectall_arrayref(q{SELECT gi, taxon_id FROM annot WHERE taxon_id <> 0 ORDER BY gi}) || []
};

open(GITAXID, "<$taxdir/gi_taxid_prot.dmp") or die $!;
while (<GITAXID>) {
    m/(\d+)\D*(\d+)/o;
    push @new, [ $1, $2 ];
}
close(GITAXID);

($ins, $upd, $del, $nas) =
    handle_diffs(\@old,
		 \@new,
		 sub { return $sth{add_gitax}->execute(@_[1,0]) },
		 sub { return $sth{upd_gitax}->execute(@_[1,0]) },
		 sub { return 1; }
		);

print "    Taxonomy annotations: $ins inserted, $upd updated, $del deleted, $nas untouched\n";
# @new = @old = ();

# all finished, unlock the database for use:
$dbh->do('UNLOCK TABLES');

print "    rebuilding nested set and taxon_nr @{[ scalar localtime ]}\n";

##### rebuild the nested set left/right id':

$dbh->do('LOCK TABLES taxon WRITE');

my $nodectr = 0;
handle_subtree(1);

$dbh->do('UNLOCK TABLES');

# clean up statement/database handles:
for my $sth (values %sth) {
    $sth->finish()
	if (ref($sth) && $sth->{Active});
}

print "ended @{[scalar localtime]}\n";

$dbh->disconnect();

sub handle_subtree {

    my $id = shift;

    $sth{set_left}->execute(++$nodectr, $id);

    $sth{get_children}->execute($id);
    for my $child ( @{$sth{get_children}->fetchall_arrayref()} ) {
	handle_subtree($child->[0]) unless $child->[0] == $id;
    }

    $sth{set_right}->execute(++$nodectr, $id);

}

sub handle_diffs {

    my ($old, $new, $insert, $update, $delete) = @_;

    my ($is, $ds, $us, $na) = (0, 0, 0, 0);

    # we assume $old is already sorted (came from database).

    # we also assume that $old and $new are both arrays of array
    # references, the first elements of which are the unique id's
    
    for my $i (0 .. $#new) {
	warn $i, Dumper($new->[$i]) unless defined $new->[$i]->[0];
    }

    # we sort $new by id:
    @$new = sort { $a->[0] <=> $b->[0] } @$new;

    my ($o, $n) = (0, 0);
    my ($odone, $ndone) = (0, 0);
    $odone++ unless @{$old || []};
    while ($o < @{$old || [] } || $n < @{$new || []}) {
	if ($odone) {
	    # only new's left to add
	    $insert->(@{$new->[$n]}); $is++;
	    $n++;
	} elsif ($ndone) {
	    # only old's left to remove
	    $delete->(@{$old->[$o]}); $ds++;
	    $o++;
	} else {
	    # both $o and $n are still valid
	    my ($oldentry, $newentry) = ($old->[$o], $new->[$n]);
	    if ($oldentry->[0] == $newentry->[0]) {
		# same id; make sure entry data are identical, otherwise update:
		my $ok = 1;
		CHECK : for my $i (1 .. @$oldentry) {
		    unless ( (defined($oldentry->[$i]) && defined($newentry->[$i]) && $oldentry->[$i] eq $newentry->[$i]) ||
			     (!defined($oldentry->[$i]) && !defined($newentry->[$i]))
			   ) {
			$ok = 0; last CHECK;
		    }
		}
		unless ($ok) {
		    $update->(@{$newentry}); $us++;
		} else {
		    $na++;
		}
		$o++; $n++;
	    } elsif ($oldentry->[0] < $newentry->[0]) {
		# old entry to be removed
		$delete->(@{$oldentry}); $ds++;
		$o++;
	    } else {
		# new entry to be added
		$insert->(@{$newentry}); $is++;
		$n++;
	    }
	}

	if ($o == @$old) {
	    $odone++;
	}

	if ($n == @$new) {
	    $ndone++;
	}
    }
    return ($is, $us, $ds, $na);
}
