#!/usr/bin/perl

use strict;

use DBI;
use LWP::Simple qw(get);
use Getopt::Long;

my $db = 'seqdb_demo';
my $user = 'seqdb_user';     # db user name here
my $password = 'seqdb_pass';  # db password for this user here
my $host = 'xdb';  # add hostname if db running on different machine than
                # update script

my $debug = 0;

GetOptions( "db|database=s" => \$db,
            "user=s" => \$user,
            "password=s" => \$password,
            "host=s" => \$host,
            "debug"  => \$debug,
          );

my $godir = shift @ARGV;
$godir ||= ".";
$godir =~ s!(.*)/$!$1!; # strip any trailing /

my $dbh = DBI->connect("dbi:mysql:host=$host;db=$db", $user, $password,
               {RaiseError => 0, PrintError => 0})
    or die $DBI::errstr;

# goto ASSOC;

warn "importing GO terms and relationships:\n";

$dbh->do("DELETE FROM go");
$dbh->do("DELETE FROM go_edge");
$dbh->do("DELETE FROM go_assoc");
$dbh->do("DELETE FROM go_synonym");

my $newnode = $dbh->prepare(q{
               REPLACE INTO go (go_acc, name) VALUES (?, ?)
              });
my $newsyn = $dbh->prepare(q{
              REPLACE INTO go_synonym (go_acc, syn_acc) VALUES (?, ?)
             });
my $newedge = $dbh->prepare(q{
               REPLACE INTO go_edge (child, parent, type)
               VALUES (?, ?, ?)
              });

for my $ont (qw(component function process)) {
    open(ONT, "<$godir/ontology/$ont\.ontology") or die $!;
    my @parents;
    my $depth = 0;
    my ($currid, $name, @syn);
    while (<ONT>) {
	next if m/^!/o; chomp;
	my ($rel, $data) = $_ =~ m/ (\$ | \< | \%) (.*)/ox;

	next unless $rel && $data;

        if (length($`) > $depth) {
            push @parents, $currid;
        } elsif (length($`) < $depth) {
            pop @parents for (1.. ($depth - length($`)));
        }
        $depth = length($`);

        my ($node, @data) = split(/\s+ ( \% | \< ) \s+ /ox, $data);
        ($name, $currid) = split(/\s+ \; \s+/ox, $node);
        ($currid, @syn) = split(/, /, $currid);
        $name =~ s/\\//g;
    
        $newnode->execute($currid, $name);
        for my $syn (@syn) {
            $newsyn->execute($currid, $syn);
        }

        if ($rel eq '<') {
            $newedge->execute($currid, $parents[-1], 'partof')
        } elsif ($rel eq '%') {
            $newedge->execute($currid, $parents[-1], 'isa')
        }
    
        for (my $i = 0 ; $i < @data ; $i += 2) {
            my (undef, $parentid) = split(/\s+ \; \s+/x, $data[$i+1]);
            if ($data[$i] eq '<') {
                $newedge->execute($currid, $parentid, 'partof');
            } elsif ($data[$i] eq '%') {
                $newedge->execute($currid, $parentid, 'isa');
            }
        }

    }

    close(ONT);
}

# build the transitive closure table:

warn "building the transitive closure tables:\n";

$dbh->do(<<EOSQL);
drop table if exists go_tc
EOSQL

$dbh->do(<<EOSQL);
create table go_tc
(
 child char(10) not null,
 parent char(10) not null,
 length int unsigned not null default 1,
 primary key (child, parent),
 index (child),
 index (parent),
 index (length)
)
EOSQL

my $nodes = $dbh->prepare(<<EOSQL);
select child, parent
from go_edge
EOSQL

my $tc_insert = $dbh->prepare(<<EOSQL);
insert into go_tc (child, parent, length)
values (?, ?, ?)
EOSQL

$dbh->do(<<EOSQL);
insert into go_tc (child, parent, length) select go_acc, go_acc, 0 from go
EOSQL

$nodes->execute();
while (my ($acc, $isa) = $nodes->fetchrow_array()) {
    $tc_insert->execute($acc, $isa, 1);
}

my $size = $dbh->prepare(<<EOSQL);
select count(*) from go_tc
EOSQL

my $maxlen = $dbh->prepare(<<EOSQL);
select max(length) from go_tc
EOSQL

my $tc_select = $dbh->prepare(<<EOSQL);
  select distinct tc1.child, tc2.parent, tc1.length + 1
  from   go_tc as tc1 inner join go_tc as tc2 on (tc1.parent = tc2.child)
  where  tc2.length = 1
    and  tc1.length = ?
EOSQL

$size->execute();
my ($oldsize) = $size->fetchrow_array();
my $newsize = undef;

while (!$newsize || $oldsize < $newsize) {
    warn "$newsize vs. $oldsize\n" if $debug;
    my ($len) = $maxlen->execute() && $maxlen->fetchrow_array();
    $tc_select->execute($len);
    while ( my ($start, $end, $len) = $tc_select->fetchrow_array() ) {
        $tc_insert->execute($start, $end, $len);
    }
    $oldsize = $newsize || $oldsize;
    ($newsize) = $size->execute() && $size->fetchrow_array();
}


exit;

ASSOC:

# OK, now for the association data:

warn "importing the association data:\n";

$dbh->{RaiseError} = 1;
$dbh->{PrintError} = 1;

my $newassoc = $dbh->prepare(q{
                REPLACE INTO go_assoc (go_acc, acc, evidence, aspect)
                VALUES (?, ?, ?, ?)
               });

$dbh->do(q{DELETE FROM go_assoc});

warn "building lookup table:\n";

my %lookup;
my %warned;
for my $gp2file (<$godir/gp2protein/gp2protein.*>) {
    my ($db) = $gp2file =~ m/gp2protein\.(.+)$/o;
    next if $db eq "sp" || $db eq "tr" || $db eq "uniprot";
    open(GP, "<$gp2file") or die $!;
    while (<GP>) {
	next if m/^\s*$/o;
	next if m/^!/o;
	chomp;
	my ($from, $to) = split(/\s+/, $_, 2);
	my ($fdb, $fid) = $from =~ m/^(?:([^:]*):)?(.*)$/o;
	$fdb ||= $db;
	push @{$lookup{lc $fdb}->{$fid}},
	    (map { s/^[^:]*://; $_; } split(/\s*;\s*/, $to));
    }
}

#use Data::Dumper qw(DumperX); $Data::Dumper::Indent = 1; die DumperX(\%lookup);

warn "reading association data:\n";

for my $gafile (<$godir/gene-associations/gene_association.*>) {
    my $db;
    if ($gafile =~ m/\.gz$/o) {
        ($db) = $gafile =~ m/gene_association\.(\S+)\.gz$/o;
	next unless $db =~ m/goa/;
        open(GA, "zcat $gafile |") or die $!;
    } else {
        ($db) = $gafile =~ m/gene_association\.(\S+)$/o;
        open(GA, "<$gafile") or die $!;
    }

    warn "$db - $gafile\n";
    # next if $db eq "fb" || $db eq "ddb";

    my $lastdbacc;
    my @dbacc;
    while (<GA>) {
        next if m/^!/o; chomp;

        my ($dbsym, $dbacc, $dbname, $not, $goacc,
            $dbref, $evidence, $with, $aspect, @extra) = split(/\t/, $_);

        unless ($aspect) { # sometimes there's an aberrant newline
            $_ = <GA>;
            ($aspect) = grep { length } split(/\t/, $_);
        }

	next if $not;
	next unless $dbacc;
	$dbsym = lc $dbsym;
        if (!$lastdbacc || ($dbacc ne $lastdbacc)) {
            $lastdbacc = $dbacc;
            # try to get a new $gi and $acc for the database:
	    if ($dbsym eq "sptr") {
		@dbacc = ($dbacc);
	    } elsif (exists $lookup{$dbsym} && exists $lookup{$dbsym}->{$dbacc}) {
		@dbacc = @{$lookup{$dbsym}->{$dbacc}};
	    } else {
		if ($with =~ m/(?:SW|SP|SWALL|Swiss-Prot|SPTR|TREMBL):/io) {
		    @dbacc = map { s/^(?:SW|SP|SWALL|Swiss-Prot|SPTR|TREMBL)://i; $_; }
			grep { m/^(?:SW|SP|SWALL|Swiss-Prot|SPTR|TREMBL):/io }
			    split(/\s*\|\s*/, $with);
		} elsif ($with =~ m/\S+/) {
		    @dbacc = ();
		    for my $xref (split(/\s*;\s*/, $with)) {
			my ($db, $acc) = $xref =~ m/^([^:]+):(.*)/o;
			$acc =~ s/^$db://ig; # strip extra leading MGI:MGI:MGI: crap
			if ($db && exists $lookup{lc $db} && exists $lookup{lc $db}->{$acc}) {
			    push @dbacc, @{$lookup{lc $db}->{$acc}};
			}
		    }
		    unless (@dbacc) {
			warn "$dbsym:$dbacc not found in lookup ($db), with: $with\n";
		    }
		} else {
		    warn "$dbsym:$dbacc not found in lookup ($db)" unless ($warned{$dbsym}++);
		    @dbacc = ();
		    # @dbacc = ("$dbsym:$dbacc");
		}
	    }
        }

        if (@dbacc) {
            for my $dbacc (@dbacc) {
                warn $_ unless $aspect;
		warn $_ if $dbacc eq "1";
                $newassoc->execute($goacc, "\Q$dbacc", $evidence, $aspect);
            }
        }
    }
    close(GA);
}

$dbh->disconnect;
