#!/usr/bin/perl

use strict;

use DBI;
use LWP::Simple qw(get);

my $godir = shift @ARGV;
$godir ||= ".";
$godir =~ s!(.*)/$!$1!; # strip any trailing /

my $db = 'seqdb_demo';
my $user = 'seqdb_user';     # db user name here
my $password = 'seqdb_pass';  # db password for this user here
my $host = '';  # add hostname if db running on different machine than
                # update script

my $dbh = DBI->connect("dbi:mysql:host=$host;db=$db", $user, $password,
		       {RaiseError => 0, PrintError => 0})
    or die $DBI::errstr;

$dbh->do("DELETE FROM go");
$dbh->do("DELETE FROM go_edge");
$dbh->do("DELETE FROM go_assoc");
$dbh->do("DELETE FROM go_synonym");

my $newnode = $dbh->prepare(q{
			   REPLACE INTO go (go_acc, name) VALUES (?, ?)
			  });
my $newsyn = $dbh->prepare(q{
			  REPLACE INTO go_synonym (go_acc, syn_acc) VALUES (?, ?)
			 });
my $newedge = $dbh->prepare(q{
			   REPLACE INTO go_edge (child, parent, type)
			   VALUES (?, ?, ?)
			  });

for my $ont (qw(component function process)) {
    open(ONT, "<$godir/ontology/$ont\.ontology") or die $!;
    my @parents;
    my $depth = 0;
    my ($currid, $name, @syn);
    while (<ONT>) {
	next if m/^!/o; chomp;
	my ($rel, $data) = $_ =~ m/( \$ | \< | \% ) (.*)/ox;

	if (length($`) > $depth) {
	    push @parents, $currid;
	} elsif (length($`) < $depth) {
	    pop @parents for (1.. ($depth - length($`)));
	}
	$depth = length($`);

	my ($node, @data) = split(/\s+ ( \% | \< ) \s+ /ox, $data);
	($name, $currid) = split(/\s+ \; \s+/ox, $node);
	($currid, @syn) = split(/, /, $currid);
	$name =~ s/\\//g;
	
	$newnode->execute($currid, $name);
	for my $syn (@syn) {
	    $newsyn->execute($currid, $syn);
	}

	if ($rel eq '<') {
	    $newedge->execute($currid, $parents[-1], 'partof')
	} elsif ($rel eq '%') {
	    $newedge->execute($currid, $parents[-1], 'isa')
	}

	for (my $i = 0 ; $i < @data ; $i += 2) {
	    my (undef, $parentid) = split(/\s+ \; \s+/x, $data[$i+1]);
	    if ($data[$i] eq '<') {
		$newedge->execute($currid, $parentid, 'partof');
	    } elsif ($data[$i] eq '%') {
		$newedge->execute($currid, $parentid, 'isa');
	    }
	}

    }
    close(ONT);
}

# build the transitive closure table:
$dbh->do(<<EOSQL);
drop table if exists go_tc
EOSQL

$dbh->do(<<EOSQL);
create table go_tc
(
 child char(10) not null,
 parent char(10) not null,
 length int unsigned not null default 1,
 type enum ('partof', 'isa', 'hybrid') not null default 'isa',
 primary key (child, parent, type),
 index (child),
 index (parent),
 index (length)
)
EOSQL

my $nodes_isa = $dbh->prepare(<<EOSQL);
select child, parent
from go_edge
where type = 'isa'
EOSQL

my $nodes_partof = $dbh->prepare(<<EOSQL);
select child, parent
from go_edge
where type = 'partof'
EOSQL

my $tc_insert = $dbh->prepare(<<EOSQL);
insert into go_tc (child, parent, length, type)
values (?, ?, ?, ?)
EOSQL

$dbh->do(<<EOSQL);
insert into go_tc (child, parent, length, type) select go_acc, go_acc, 0, 'isa' from go
EOSQL

$nodes_isa->execute();
while (my ($acc, $isa) = $nodes_isa->fetchrow_array()) {
    $tc_insert->execute($acc, $isa, 1, 'isa');
}

$nodes_partof->execute();
while (my ($acc, $partof) = $nodes_partof->fetchrow_array()) {
    $tc_insert->execute($acc, $partof, 1, 'partof');
}

my $size = $dbh->prepare(<<EOSQL);
select count(*) from go_tc where type = ?
EOSQL

my $maxlen = $dbh->prepare(<<EOSQL);
select max(length) from go_tc where type = ?
EOSQL

my $tc_select = $dbh->prepare(<<EOSQL);
select distinct tc1.child, tc2.parent, tc1.length + 1
from go_tc as tc1 inner join go_tc as tc2 on (tc1.parent = tc2.child and tc1.type = tc2.type)
where tc2.length = 1
and tc1.length = ?
and tc1.type = ?
EOSQL

# handle "strict" transitivity: A isa B isa C => A isa C (same for
# edges composed entirely of "partof" relationships)
for my $type (qw(isa partof)) {

    my ($oldsize, $newsize) = $size->execute($type) && $size->fetchrow_array();

    while (!$newsize || $oldsize < $newsize) {
	# warn "$newsize vs. $oldsize\n";
	my ($len) = $maxlen->execute($type) && $maxlen->fetchrow_array();
	$tc_select->execute($len, $type);
	while ( my ($start, $end, $len) = $tc_select->fetchrow_array() ) {
	    $tc_insert->execute($start, $end, $len, $type);
	}
	$oldsize = $newsize || $oldsize;
	($newsize) = $size->execute($type) && $size->fetchrow_array();
    }
}

# handle logical rule A) from the GO Usage Guide: if A is part of B
# and C is an instance of B, is A part of C? --YES

my $hyb_select = $dbh->prepare(<<EOSQL);
select po.child, po.length, isa.child, isa.length
from go_tc as po inner join go_tc as isa on (po.parent = isa.parent)
where po.type = 'partof' and isa.type = 'isa'
and po.length > 0 && isa.length > 0
EOSQL

$hyb_select->execute();
while (my ($child, $clen, $parent, $plen) = $hyb_select->fetchrow_array) {
    $tc_insert->execute($child, $parent, $clen + $plen, 'hybrid')
}


# OK, now for the association data:

my $newassoc = $dbh->prepare(q{
			    REPLACE INTO go_assoc (go_acc, gi, acc, db,
						  evidence, aspect)
			    VALUES (?, ?, ?, ?, ?, ?)
			   });

$dbh->do(q{DELETE FROM go_assoc});

my @find_sths = ( q{
SELECT sp.gi, acc, 'sp'
FROM   sp left join annot using (gi)
WHERE  acc = ?
ORDER BY current DESC
		   },
		  q{
SELECT sp.gi, acc, 'sp'
FROM   sp left join annot using (gi)
WHERE  name = ?
ORDER BY current DESC
		   },
		  q{
SELECT annot.gi,
       IFNULL(gb.protein_id, IFNULL(emb.protein_id, IFNULL(dbj.protein_id, ''))) AS acc,
       IF(gb.protein_id IS NOT NULL, 'gb',
          IF(emb.protein_id IS NOT NULL, 'emb',
             IF(dbj.protein_id IS NOT NULL, 'dbj', 'unk')
            )
         ) AS db
FROM   annot LEFT JOIN gb ON (annot.gi = gb.gi)
	     LEFT JOIN emb ON (annot.gi = emb.gi)
	     LEFT JOIN dbj ON (annot.gi = dbj.gi)
WHERE  annot.gi = ?
ORDER BY current DESC
		  },
		  q{
SELECT emb.gi, protein_id, 'emb'
FROM   emb left join annot using (gi)
WHERE  protein_id REGEXP ?
ORDER BY current DESC
		   },
		  q{
SELECT gb.gi, protein_id, 'gb'
FROM   gb left join annot using (gi)
WHERE  protein_id REGEXP ?
ORDER BY current DESC
		   },
		  q{
SELECT gi, protein_id, 'dbj'
FROM   dbj left join annot using (gi)
WHERE  protein_id REGEXP ?
ORDER BY current DESC
		   },
		);

@find_sths = map { $dbh->prepare($_) } @find_sths;

my %primaries; # lookup map for swissprot secondary => primary accessions

for my $gafile (<$godir/gene-associations/gene_association.*>) {
    my $db;
    if ($gafile =~ m/\.gz$/o) {
	($db) = $gafile =~ m/gene_association\.(\S+)\.gz$/o;
	next if $db =~ m/compugen/;
	open(GA, "zcat $gafile |") or die $!;
    } else {
	($db) = $gafile =~ m/gene_association\.(\S+)$/o;
	open(GA, "<$gafile") or die $!;
    }

    my %lookup;
    if (-e "$godir/gp2protein/gp2protein.$db") {
	open(GP, "<$godir/gp2protein/gp2protein.$db") or die $!;
	while (<GP>) {
	    next if m/^!/o; chomp;
	    my ($gene, $acc) = $_ =~ m/^(\S+)\s*(.*)\s*$/o;
	    my @acc = map { s/^[^:]*://; $_ } split(/\s*;\s*/, $acc);
	    push @{$lookup{$gene}}, @acc;
	}
	close(GP);
    } elsif ( $db !~ m/^goa/o) {
	# warn "Skipping $gafile - no gp2protein, and not a goa file\n";
	next;
    }

    # warn "$db - $gafile\n";

    my $lastdbacc;
    my @results;
    while (<GA>) {
	next if m/^!/o; chomp;

	my ($dbsym, $dbacc, $dbname, $not, $goacc,
	    $dbref, $evidence, $with, $aspect) = split(/\t/, $_);

	next if $dbsym eq 'ENSEMBL';

	if (!$lastdbacc || ($dbacc ne $lastdbacc)) {
	    $lastdbacc = $dbacc;
	    # try to get a new $gi and $acc for the database:
	    my @dbacc;
	    if (%lookup) {
		if (exists $lookup{$dbacc}) {
		    @dbacc = @{$lookup{$dbacc}};
		} else {
		    # warn "gene $dbacc not found in lookup ($db)\n";
		    next;
		}
	    } else {
		@dbacc = ($dbacc);
	    }
	    @results = find_acc(\@dbacc, $dbname);
	    unless (@results) {
		# warn "couldn't find $dbacc (@{[ join(':', @dbacc) ]})from $db\n";
	    }
	}

	if (@results) {
	    for my $res (@results) {
		$newassoc->execute($goacc, @{$res}, $evidence, $aspect) unless $not;
	    }
	}
    }
    close(GA);
}

$dbh->disconnect;

sub find_acc {

    my ($dbacc, $dbname) = @_;
    my ($gi, $acc, $accdb);
    my @results;

    my $k = 0;
    OUTER : for my $find ( (@{$dbacc}, $dbname) ) {

	my $i = 1;
	INNER : for my $sth (@find_sths) {
	    if ($i <= 2 ) {
		$sth->execute($find);
	    } elsif ($i == 3 && $find =~ m/^\d+$/o) {
		$sth->execute($find);
	    } elsif ($i > 3 && length($find) >= 8) {
		$sth->execute("^$find(\.[[:digit:]]+)?\$")
	    } else {
		last INNER;
	    }
	    my $rows = $sth->rows;
	    if ($rows > 0) {
		($gi, $acc, $accdb) = $sth->fetchrow_array;
		$acc =~ s/\.\d+$//;
		# warn "found ($rows): $dbacc/$dbname = $gi/$acc/$accdb (sth $i, find: $find)\n";
		last INNER;
	    }
	    $i++;
	}

	if (!$acc && $find =~ m/^[OPQ]\d\w\w\w\d$/o) {
	    if (exists $primaries{$find}) {
		$find = $1;
		redo OUTER;
	    } else {
		# look it up in swissprot, could be a secondary accession #:
		my $sp = get("http://us.expasy.org/cgi-bin/niceprot.pl?$find");
		if ($sp =~ m/get-sprot-entry\?([^"]+)"/os && $1 ne $find) {
		    $primaries{$find} = $1;
		    $find = $1;
		    redo OUTER;
		}
	    }
	}
		
	if ($acc) {
	    push @results, [$gi, $acc, $accdb];
	}

	($gi, $acc, $accdb) = (undef) x 3;

	if (@results > 0 && $k == $#{$dbacc}) {
	    last OUTER;
	}

	$k++;
    }

    return @results;
}
