#!/usr/bin/perl -w

# read the current specified file and add new sequence data to the
# database, replacing any previous data.

use strict;
use DBI;
use Inline C => './pIcalc.c';
use Getopt::Long;

my $db = 'seqdb_demo';
my $user = 'seqdb_user';      # db user name here
my $password = 'seqdb_pass';  # db password for this user here
my $host = '';                # add hostname if db running on different machine than
                              # update script

GetOptions( "db|database=s" => \$db,
            "user=s" => \$user,
            "password=s" => \$password,
            "host=s" => \$host,
          );

# filename should be specified at command line
my $seqfile = shift(@ARGV) or die "Usage: $0 filename [limit]\n";
my $num = shift @ARGV;

$|++; # turn on autoflush for STDOUT
print " [ @{[scalar localtime]} ... ";
print STDERR " [ @{[scalar localtime]} ... \n";

my $connect = "dbi:mysql(AutoCommit=>1,RaiseError=>1):database=$db";
$connect .= ";host=$host" if $host;

my $dbh = DBI->connect($connect,
		       $user,
		       $password,
		      ) or die $DBI::errstr;

if (-t STDIN && (-t STDOUT || !(-f STDOUT || -c STDOUT))) {
    print STDERR "Warning: about to erase any existing sequence data; continue? [Y/n] ";
    my $answer = <STDIN>; chomp($answer);
    if ($answer && $answer !~ /^y(?:es)?/i) {
	print STDERR "Aborting.\n"; exit;
    }
}

my @locktables = qw(protein annot);

# define the preferred database source annotation order:
my %prefs = ( sp  => 1,
              ref => 2,
	      pir => 3,
	      prf => 4,
	      gb  => 5,
	      emb => 6,
	      dbj => 7,
	      pdb => 8,
              tpg => 9,
	    );

# define all the SQL statements we'll need later:
my %sth =
    (
     clear_annot  => q{ DELETE FROM annot },
     clear_prot   => q{ DELETE FROM protein },

     add_nr       => q{ INSERT INTO annot (gi, descr, db, prot_id, acc, ver, dna_acc, sp_name, pref)
			VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
		      },
     add_protein  => q{ INSERT INTO protein (seq, len, pi, mw) VALUES (?, ?, ?, ?) },
    );

# lock all the tables we'll need:
$dbh->do('LOCK TABLES ' . join(", ", map { $_ .= ' WRITE' } @locktables));

# prepare all the statement handles, and execute the "clear" delete's
for my $sth (keys %sth) {
    $sth{$sth} = $dbh->prepare($sth{$sth});
    $sth{$sth}->execute if $sth =~ m/^clear_/;
}

my $added = 0;
my $new = 0;
my $ct = 0;
my $ctstr = '';

my $ISA_TTY = -t STDIN && (-t STDOUT || !(-f STDOUT || -c STDOUT));

# read in the sequences from database:
open(SEQ, $seqfile) or die $!;
{ # open local scope for $/
    local $/ = "\n>";
    while(<SEQ>) {
	last if (defined $num && --$num < 0);
	unless (++$ct % 5 || !$ISA_TTY) { 
	    $ctstr .= ' ' unless $ct % 10000;
	    $ctstr .= '.' unless $ct % 1000;
	    unless ($ct % 50000) {
		$ctstr = '';
	        print STDERR !$ENV{EMACS} ? ("\r" . (" " x 80)) : "\n";
	    }
	    if ($ENV{EMACS}) {
		print STDERR "\n", sprintf("%d sequences read %s", $ct, $ctstr) unless $ct % 1000;
	    } else {
		print STDERR "\r", sprintf("%d sequences read %s", $ct, $ctstr);
	    }
	}
	chomp; # remove trailing "\n>" record separator
	my ($header, $sequence) = $_ =~ m/^>?            # record separator (first entry)
	                                  ( [^\n]* ) \n  # header line
                                          (     .* )     # the sequence
				         /osx; # optimize, multiline, commented

					 
	my @entries = split(/\001/, $header);
	@entries = map {
	    my %data;
	    @data{qw(gi dbinfo desc)} = $_ =~ m/ ^ gi \| (\d+) \| # the gi number
		                                 (\S+) \s+        # remaining db info
		                                 (.*)             # description
			                       /ox; # optimize, commented
	    \%data;
	} @entries;

	my $protein_id = 0; # default "NA" value, will be set below:

	$sequence =~ s/\W|\d//sg; # strip non-alphanumeric and numeric characters (leaving alpha)

	# add the protein sequence:
	$sth{add_protein}->execute($sequence, length($sequence), sprintf("%.2f", pIcalc($sequence)), sprintf("%.2f", MWcalc($sequence)));
	$protein_id = $sth{add_protein}->{mysql_insertid};
	$added++;
	
	# choose the preferred entry:
	my $prefindex = 0;
	my $bestdb = 2000;
	for (my $i = 0 ; $i < @entries ; $i++) {
	    my ($db) = $entries[$i]->{dbinfo} =~ m/^([^\|]+)/o;
	    if ($prefs{$db} < $bestdb) {
		$bestdb = $prefs{$db};
		$prefindex = $i;
	    }
	}
	$entries[$prefindex]->{pref} = 1;

	# now add all the entries:
	for my $entry ( @entries ) {
	    add_entry($entry, $protein_id);
	}


    } # close while(<SEQ>)
} # close local $/ scope
close(SEQ);
print STDERR "\n";

$dbh->do('UNLOCK TABLES');

# clean up statement/database handles:
for my $sth (values %sth) {
    $sth->finish()
	if (ref($sth) && $sth->{Active});
}
$dbh->disconnect();

print "@{[scalar localtime]} ] Added $added new sequence@{[$added != 1 ? 's' : '']}, $new new annots\n";
print STDERR "@{[scalar localtime]} ] Added $added new sequence@{[$added != 1 ? 's' : '']}, $new new annots\n";

sub add_entry {

    my ($entry, $protein_id) = @_;
    my ($dna_acc, $sp_name, $pref);

    $pref = $entry->{pref} if exists $entry->{pref};
    $pref = 0 unless defined $pref;

    # what db does this come from?
    my @dbinfo = split(/\|/, $entry->{dbinfo});
    my $db = shift @dbinfo;
    if ($db eq 'emb' || $db eq 'gb' || $db eq 'dbj' || $db eq 'ref' || $db eq 'tpg') {
	# need to get the accession number off the description line:
	my ($acc) = $entry->{desc} =~ m/^(?:TPA:\s+)?\(\s*([^\)\s]+)\s*\)/o;
	$acc ||= '';
	# strip the dna acc off the descr.
	$entry->{desc} =~ s/^(?:TPA:\s+)?\(\s*[^\)\s]+\s*\)\s*//o;
	# strip the taxonomy info off the descr.
	$entry->{desc} =~ s/\s+\[\s*[^\]]+\s*\]\s*$//o;
	@dbinfo = ($dbinfo[0], $acc);
	$dna_acc = $acc;
    } elsif ($db eq 'pir' || $db eq 'prf') {
	@dbinfo = ($dbinfo[1]); # grab the acc/name
    } elsif ($db eq 'sp') {
	# no change: @dbinfo should be fine as is.
	$sp_name = $dbinfo[1];
    } elsif ($db eq 'pdb') {
	# check to make sure chain is defined:
	$dbinfo[1] ||= '';
	$dbinfo[0] .= "|$dbinfo[1]";
    } else {
	warn "unknown database: ", join("|", 'gi', $entry->{gi}, $db, @dbinfo), "\n";
	next;
    }

    my $acc = shift @dbinfo;
    my $version;

    $acc =~ s/\.(\S+)$//;
    if (defined $1) {
	$version = $1;
    }

    $version ||= 1;
    $acc ||= "gi|$entry->{gi}";

    $sth{add_nr}->execute($entry->{gi}, $entry->{desc}, $db, $protein_id, $acc, $version, $dna_acc, $sp_name, $pref);
    $new++;
}
