#!/usr/bin/perl -w

# read the current nr.lseg file and add any new sequence data to the
# database, marking those entries that are current

use strict;

my $db = 'seqdb_demo';
my $user = 'seqdb_user';     # db user name here
my $password = 'seqdb_pass';  # db password for this user here
my $host = '';  # add hostname if db running on different machine than
                # update script
my $dbd = shift @ARGV;

if (not defined $dbd) {
    warn "No DBD type provided.\n";
    warn "Defaulting to DBD::mysql ...\n";
    $dbd = 'mysql';
} elsif ($dbd ne 'mysql' && $dbd ne 'Pg') {
    warn "DBD type unknown; assuming you supplied a filename instead.\n";
    warn "Defaulting to DBD::mysql ...\n";
    unshift @ARGV, $dbd;
    $dbd = 'mysql';
}

use Inline C => './pIcalc.c';
use DBI;
use File::Temp qw(tempfile);

# filename could be specified at command line
my $nr = shift(@ARGV) or die "Usage: $0 dbdtype filename [limit]\n";

my $num = shift @ARGV;

$|++; # turn on autoflush for STDOUT
print " [ @{[scalar localtime]} ... ";
print STDERR " [ @{[scalar localtime]} ... \n";

my $pref;
(*PREF, $pref) = tempfile();

my $connect = "dbi:$dbd";
if ($dbd eq 'mysql') {
    $connect .= "(AutoCommit=>1):database=$db";
} elsif ($dbd eq 'Pg') {
    $connect .= "(AutoCommit=>0):dbname=$db";
}

if ($host) {
    $connect .= ";host=$host";
}

my $dbh = DBI->connect($connect,
		       $user,
		       $password,
		       { RaiseError => 1
		       }
		      ) or die $DBI::errstr;

my @locktables = qw(protein annot annot_emb annot_gb annot_dbj annot_ref annot_sp annot_tpg annot_old);

# define the preferred database source order:
my %prefs = ( sp  => 1,
              ref => 2,
	      pir => 3,
	      prf => 4,
	      gb  => 5,
	      emb => 6,
	      dbj => 7,
	      pdb => 8,
              tpg => 9,
	    );

# define all the SQL statements we'll need later:
my %sth =
    (
     set_pref      => q{ UPDATE annot SET pref = 1 WHERE gi = ? },
     end_pref      => q{ UPDATE annot SET pref = 0 where gi = ? },

     check_all_nr => q{ SELECT gi FROM annot WHERE gi IN (?) },
     get_protein  => q{ SELECT prot_id FROM annot WHERE gi IN (?) GROUP BY prot_id },

     check_acc    => q{ SELECT acc FROM annot WHERE acc = ? },
     copy_annot   => q{ SELECT gi, descr, db, prot_id, acc, ver, taxon_id FROM annot WHERE acc = ? },
     old_annot    => q{ INSERT INTO annot_old (gi, descr, db, prot_id, acc, ver, taxon_id)
                        VALUES (?, ?, ?, ?, ?, ?, ?)
		      },
     del_annot    => q{ DELETE FROM annot WHERE acc = ? },
     add_nr       => q{ INSERT INTO annot (gi, descr, db, prot_id, acc, ver)
			VALUES (?, ?, ?, ?, ?, ?)
		      },

     add_protein  => q{ INSERT INTO protein (seq, len, pi, mw) VALUES (?, ?, ?, ?) },
     find_pid     => q{ SELECT MAX(prot_id) FROM protein },

     add_sp       => q{ INSERT INTO annot_sp  (acc, ver, name) VALUES (?, ?, ?) },
     add_ref      => q{ INSERT INTO annot_ref (acc, ver, dna_acc) VALUES (?, ?, ?) },
     add_emb      => q{ INSERT INTO annot_emb (acc, ver, dna_acc) VALUES (?, ?, ?) },
     add_gb       => q{ INSERT INTO annot_gb  (acc, ver, dna_acc) VALUES (?, ?, ?) },
     add_dbj      => q{ INSERT INTO annot_dbj (acc, ver, dna_acc) VALUES (?, ?, ?) },
     add_tpg      => q{ INSERT INTO annot_tpg (acc, ver, dna_acc) VALUES (?, ?, ?) },
    );

# prepare all the statement handles:
for my $sth (keys %sth) {
    # WHERE xxx IN ( ? ) queries can't be prepared:
    next if ($sth{$sth} =~ m/IN\s*\(/o);
    $sth{$sth} = $dbh->prepare($sth{$sth});
}

# lock all the tables we'll need:
if ($dbd eq 'mysql') {
    $dbh->do('LOCK TABLES ' . join(", ", map { $_ .= ' WRITE' } @locktables));
}

my $added = 0;
my $ended = 0;
my $new = 0;
my $ct = 0;
my $ctstr = '';

# read in the sequences from database:
open(NR, $nr) or die $!;
{ # open local scope for $/
    local $/ = "\n>";
    while(<NR>) {
	last if (defined $num && --$num < 0);
	unless (++$ct % 5) { 
	    $ctstr .= ' ' unless $ct % 10000;
	    $ctstr .= '.' unless $ct % 1000;
	    unless ($ct % 50000) {
		$ctstr = '';
	        print STDERR !$ENV{EMACS} ? ("\r" . (" " x 80)) : "\n";
	    }
	    if ($ENV{EMACS}) {
		print STDERR "\n", sprintf("%d sequences read %s", $ct, $ctstr) unless $ct % 1000;
	    } else {
		print STDERR "\r", sprintf("%d sequences read %s", $ct, $ctstr);
	    }
	}
	chomp; # remove trailing "\n>" record separator
	my ($header, $sequence) = $_ =~ m/^>?            # record separator (first entry)
	                                  ( [^\n]* ) \n  # header line
                                          (     .* )     # the sequence
				         /osx; # optimize, multiline, commented

					 
	my @entries = split(/\001/, $header);
	@entries = map {
	    my %data;
	    @data{qw(gi dbinfo desc)} = $_ =~ m/ ^ gi \| (\d+) \| # the gi number
		                                 (\S+) \s+        # remaining db info
		                                 (.*)             # description
			                       /ox; # optimize, commented
	    \%data;
	} @entries;

	# make ourselves a list of gi's:
	my $gi_list = join(", ", map { $_->{gi} } @entries);
	my $protein_id = 0; # default "NA" value, will be set below:

	# Do we have this protein sequence in the database already?
	my $can_sth = $sth{check_all_nr};
	$can_sth =~ s/\?/$gi_list/;
	$can_sth = $dbh->prepare($can_sth);
	$can_sth->execute();
	if ( $can_sth->rows() == 0 ) {
	    # this is a new protein, need to add the sequence to the database:
	    $sequence =~ s/\W|\d//sg; # strip non-alphanumeric and numeric characters (leaving alpha)

	    # add the protein sequence:
	    $sth{add_protein}->execute($sequence, length($sequence), pIcalc($sequence), MWcalc($sequence));

	    # get the protein_id for later reference:
	    $sth{find_pid}->execute();
	    ($protein_id) = $sth{find_pid}->fetchrow_array();
	    $sth{find_pid}->finish();
	    $added++;
	} else {
	    my $gp_sth = $sth{get_protein};
	    $gp_sth =~ s/\?/$gi_list/;
	    $gp_sth = $dbh->prepare($gp_sth);
	    $gp_sth->execute();
	    my @protids = map { $_->[0] } @{$gp_sth->fetchall_arrayref()};
	    # hopefully there should only be one of these!
	    if (@protids > 1) {
		warn("Multiple protein sequences in database for single file entry");
	    } elsif (@protids == 0) {
		warn("Couldn't find matching protein sequence for gi number");
	    } else {
		$protein_id = $protids[0];
	    }
	    $gp_sth->finish();
	}

	# add all the missing gi numbers:
	my @oldgi = map { $_->[0] } @{$can_sth->fetchall_arrayref()};
	for my $entry ( grep { !in($_->{gi}, \@oldgi) } @entries ) {
	    add_entry($entry, $protein_id);
	}
	$can_sth->finish();

	# choose the preferred entry:
	@entries = sort { my ($adb) = $a->{dbinfo} =~ m/^([^\|]+)/o;
			  my ($bdb) = $a->{dbinfo} =~ m/^([^\|]+)/o;
			  ($prefs{$adb} || 1000) <=> ($prefs{$bdb} || 1000);
		      } @entries;

	# print out preferred gi
	print PREF $entries[0]->{gi}, "\n";

    } # close while(<NR>)
} # close local $/ scope
close(NR);
close(PREF);
`sort -n $pref > ${pref}.sorted`;
print STDERR "\n";

open(PREF, "<${pref}.sorted") or die $!;
my $prefsth = $dbh->prepare('SELECT gi FROM annot WHERE pref = 1 ORDER BY gi ASC');
$prefsth->execute();
my @end_prefs; my @set_prefs;
iterator(\*PREF, $prefsth, \@end_prefs, \@set_prefs);
close(PREF); `rm -f ${pref}*`;

for my $gi (@end_prefs) {
    $sth{end_pref}->execute($gi);
}

for my $gi (@set_prefs) {
    $sth{set_pref}->execute($gi);
}

if ($dbd eq 'mysql') {
    # all finished, unlock the database for use:
    $dbh->do('UNLOCK TABLES');
} elsif ($dbd eq 'Pg') {
   $dbh->commit;
}

# clean up statement/database handles:
for my $sth (values %sth) {
    $sth->finish()
	if (ref($sth) && $sth->{Active});
}
$dbh->disconnect();

print "@{[scalar localtime]} ] Added $added new sequence@{[$added != 1 ? 's' : '']}; $ended ended, $new new annots\n";
print STDERR "@{[scalar localtime]} ] Added $added new sequence@{[$added != 1 ? 's' : '']}; $ended ended, $new new annots\n";

sub in {

    my ($val, $arr) = @_;
    for (my $i = 0 ; $i < @$arr ; $i++) {
	return 1 if $val == $arr->[$i];
    }
    return 0;
}

sub add_entry {

    my ($entry, $protein_id) = @_;

    # what db does this come from?
    my @dbinfo = split(/\|/, $entry->{dbinfo});
    my $db = shift @dbinfo;
    if ($db eq 'emb' || $db eq 'gb' || $db eq 'dbj' || $db eq 'ref' || $db eq 'tpg') {
	# need to get the accession number off the description line:
	my ($acc) = $entry->{desc} =~ m/^(?:TPA:\s+)?\(\s*([^\)\s]+)\s*\)/o;
	$acc ||= '';
	# strip the dna acc off the descr.
	$entry->{desc} =~ s/^(?:TPA:\s+)?\(\s*[^\)\s]+\s*\)\s*//o;
	# strip the taxonomy info off the descr.
	$entry->{desc} =~ s/\s+\[\s*[^\]]+\s*\]\s*$//o;
	@dbinfo = ($dbinfo[0], $acc);
    } elsif ($db eq 'pir' || $db eq 'prf') {
	@dbinfo = ($dbinfo[1]); # grab the acc/name
    } elsif ($db eq 'sp') {
	# no change: @dbinfo should be fine as is.
    } elsif ($db eq 'pdb') {
	# check to make sure chain is defined:
	$dbinfo[1] ||= '';
	$dbinfo[0] .= "|$dbinfo[1]";
    } else {
	warn "unknown database: ", join("|", 'gi', $entry->{gi}, $db, @dbinfo), "\n";
	next;
    }

    my $acc = shift @dbinfo;
    my $version;

    $acc =~ s/\.(\S+)$//;
    if (defined $1) {
	$version = $1;
    }

    # if this accession already exists, 
    $sth{check_acc}->execute($acc);
    if ($sth{check_acc}->rows() > 0) {
	$sth{copy_annot}->execute($acc);
	my @data = $sth{copy_annot}->fetchrow_array(); $sth{copy_annot}->finish();

	# check the versioning:
	$data[5] ||= 1;
	if (! $version) {
	    $version = $data[5] + 1;
	} elsif ($version <= $data[5]) {
            warn "new GI but not a newer version ($acc.$version vs. saved $data[4].$data[5])\n";
            $version = $data[5] + 1;
        }

	$sth{old_annot}->execute(@data); $sth{old_annot}->finish();
	$sth{del_annot}->execute($acc); $sth{del_annot}->finish();
	$ended++;
    }
    $sth{check_acc}->finish();

    $version ||= 1;

    $acc ||= "gi|$entry->{gi}";
    if($acc) { # not yet handling sp segmental proteins.
        $sth{add_nr}->execute($entry->{gi}, $entry->{desc}, $db, $protein_id, $acc, $version);
        $new++;
        # add the database cross reference, if necessary:
        $sth{"add_$db"}->execute($acc, $version, @dbinfo) if exists $sth{"add_$db"};
    }

}

sub iterator {

    my ($fh, $sth, $ends, $sets) = @_;
    my ($oldgi, $newgi);

    $sth->bind_columns(\$oldgi);

    while ((!eof($fh)) && $sth->{Active}) {
	# grab new values, if necessary:
	chomp($newgi = <$fh>) unless $newgi;
	$sth->fetch() unless $oldgi;

	unless ($oldgi && $newgi) {
            last;
        }

	if ($oldgi == $newgi) {
	    undef $oldgi;
            undef $newgi;
	    next;
	} elsif ($oldgi < $newgi) {
	    push @{$ends}, $oldgi;
	    undef $oldgi;
	} else { # $oldgi > $newgi
	    push @{$sets}, $newgi;
	    undef $newgi;
	}
    }

    # handle leftovers from while() loop
    if ($oldgi) {
	push @{$ends}, $oldgi;
	undef $oldgi;
    }
    if ($newgi) {
	push @{$sets}, $newgi;
	undef $newgi;
    }

    # flush remaining streams:
    if (!eof($fh)) {
	# flush $fh
	while (!eof($fh)) {
	    chomp($newgi = <$fh>);
	    push @{$sets}, $newgi;
	}
    }
    if ($sth->{Active}) {
	# flush $sth
	while ($sth->fetch()) {
	    push @{$ends}, $oldgi;
	}
    }
}

