
/****************************************************************************** 
 * 
 *  file:  Alignment.h
 * 
 *  Copyright (c) 2003,  University of Virginia..
 *  All rights reserved.
 * 
 *  See the file COPYRIGHT in the top directory of this distribution for
 *  more information.
 *  
 *  THE SOFTWARE IS PROVIDED _AS IS_, WITHOUT WARRANTY OF ANY KIND, EXPRESS 
 *  OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
 *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
 *  THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
 *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
 *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
 *  DEALINGS IN THE SOFTWARE.  
 *  
 *****************************************************************************/ 


#ifndef ALIGNMENT_H
#define ALIGNMENT_H

#include <cmath>
#include <string>
#include <vector>
#include <algorithm>
#include <iostream>
#include <GenericException.h>
#include <Tostring.h>
#include <StringDiff.h>
#include <ScoringMatrix.h>
#include <AltshulGishParams.h>
#include <XMLable.h>
#include <XML.h>

using namespace std;

namespace NOPT {

/**
 * Compression sentinel ('-').
 * Used for compressing alignment output.  Refers to an insertion in the
 * second sequence.
 * NOTE(review): "insertion" is ambiguous here.  The documentation of
 * Alignment::compress describes "-" as the character from the first
 * sequence paired with a gap in the second sequence -- confirm that this is
 * the intended meaning (the name topGap could be read the other way).
 */ 
const char topGap = '-'; // bot insertion

/**
 * Compression sentinel ('+').
 * Used for compressing alignment output.  Refers to an insertion in the
 * first sequence.
 * NOTE(review): "insertion" is ambiguous here.  The documentation of
 * Alignment::compress describes "+" as a gap in the first sequence paired
 * with the character from the second sequence -- confirm that this is the
 * intended meaning (the name botGap could be read the other way).
 */ 
const char botGap = '+'; // top insertion

/**
 * Compression sentinel ('=').
 * Used for compressing alignment output.  Used when characters are
 * included from both sequences.
 */ 
const char match = '=';


/** 
 * A class that embodies one alignment of two sequences.
 * This class is mainly a storage class. Alignments are created
 * elsewhere. It extends XMLable which allows it to generate xml output.
 */
class Alignment : public XMLable
{

	/**
	 * Operator <<. 
	 * Writes the alignment to an ostream.
	 * \param ostream os - Output stream that the alignment will be
	 * written to.
	 * \param Alignment a - Alignment that will be written to the output
	 * stream.
	 */
	friend ostream& operator<<(ostream& os, const Alignment& a);

	public:

		/**
		 * Gap character constant ('-').
		 * NOTE(review): presumably the character that marks a gap inside
		 * the gapped sequence strings -- confirm against the implementation.
		 */
		static const char gap = '-';

		/**
		 * Constructor.
		 * Everything is provided to this alignment.
		 * \param string seq1 - The first gapped sequence of the alignment.
		 * \param string seq2 - The second gapped sequence of the alignment.
		 * \param int score - Score of the alignment.
		 * \param int beg1Ind - Index of where alignment starts in seq1, used
		 * for local alignments.
		 * \param int beg2Ind - Index of where alignment starts in seq2, used
		 * for local alignments.
		 * \param int end1Ind - Index of where alignment ends in seq1, used
		 * for local alignments.
		 * \param int end2Ind - Index of where alignment ends in seq2, used
		 * for local alignments.
		 */
		Alignment(const string& seq1, const string& seq2, int score, 
				  int beg1Ind, int beg2Ind, int end1Ind, int end2Ind);

		/**
		 * Copy constructor.
		 * Duplicates an alignment based on the argument.
		 * \param Alignment a - Alignment to be copied.
		 */
		Alignment(const Alignment& a);

		/**
		 * Constructor
		 * This constructor calculates the alignment score of the two gapped
		 * sequences passed in based on the scoring matrix and gap penalties.
		 * \param string seq1 - The first gapped sequence of the alignment.
		 * \param string seq2 - The second gapped sequence of the alignment.
		 * \param ScoringMatrix sm - Scoring matrix to use to calculate
		 * alignment score.
		 * \param int gapCreate - Gap creation penalty.
		 * \param int gapExtend - Gap extension penalty.
		 */
		Alignment(const string& seq1,
                  const string& seq2,
                  ScoringMatrix& sm,
                  int gapCreate,
                  int gapExtend );

		/*
		 * MyersMiller Constructor.
		 * NOTE: this constructor is currently DISABLED -- its declaration
		 * below is part of this comment and is not compiled.  The
		 * description is kept for reference.
		 *
		 * Constructs an alignment based on the input seqs and a 
		 * Myers-Miller script.
		 * \param string s1 - The first ungapped sequence of the alignment.
		 * \param string s2 - The second ungapped sequence of the alignment.
		 * \param vector<int> script - The Myers-Miller script.
		 * \param int score - Score of the alignment.
		 * \param int begin1Index - Index of where alignment starts in seq1,
		 * used for local alignments.
		 * \param int begin2Index - Index of where alignment starts in seq2,
		 * used for local alignments.
		 * \param int end1Index - Index of where alignment ends in seq1, used
		 * for local alignments.
		 * \param int end2Index - Index of where alignment ends in seq2, used
		 * for local alignments.
		Alignment(const string& s1, 
				  const string& s2, 
				  const vector<int>& script, 
				  int score, 
				  int begin1Index, 
				  int begin2Index, 
				  int end1Index, 
				  int end2Index );
		 */

		/**
		 * Blank Constructor.
		 * Sets all values to 0/null/"", etc.
		 */
		Alignment();

		/**
		 * Destructor.
		 * Virtual with an empty body, so derived classes can be destroyed
		 * through a pointer to Alignment.
		 */
		virtual ~Alignment() {};

		/**
		 * Operator =.
		 * Assignment operator. Assigns the values of the argument alignment
		 * to this.
		 * \param Alignment a - Alignment to assign to this.
		 */
		Alignment& operator=(const Alignment& a);

		/**
		 * Equality operator.  
		 * Tests whether argument alignment is equivalent to this.
		 * Note: unlike operator<, this operator is not declared const, so
		 * it cannot be called with a const Alignment on the left-hand side.
		 * \param Alignment a - Alignment to compare with this.
		 */
		bool operator==(const Alignment& a);

		/**
		 * Less-than operator.  
		 * Tests whether this is less than the argument
		 * alignment. Used for sorting alignments. Currently only sorts
		 * according to score.
		 * \param Alignment a - Alignment to compare with this.
		 */
		bool operator<(const Alignment& a) const;

		/**
		 * This method calculates the Smith-Waterman score of two aligned seqs.
		 * The score is calculated based on the gap penalties and the scoring
		 * matrix specified.  Uses affine gap penalties so: gapPenalty = 
		 * (gapCreate + gapExtend * numGaps ).
		 * \param string seq1 - First aligned sequence.
		 * \param string seq2 - Second aligned sequence.
		 * \param ScoringMatrix sm - Scoring matrix to be used.
		 * \param int gapCreate - Gap creation penalty.
		 * \param int gapExtend - Gap extension penalty.
		 * \param bool debug - If true debug information will be written to 
		 * cout. Defaults to false and is not required.
		 */
		static int SWScore( const string& seq1,
                            const string& seq2,
                            ScoringMatrix& sm,
                            int gapCreate,
                            int gapExtend,
							bool debug = false );

		/**
		 * Static helper that takes a score, a scoring matrix name and gap
		 * penalties and returns a float.
		 * NOTE(review): presumably converts a raw alignment score into a
		 * bit score using statistical parameters looked up by matrix name
		 * and gap penalties (AltshulGishParams.h is included by this
		 * header) -- confirm against the implementation.
		 * \param int score - Alignment score.
		 * \param string matrixName - Name of the scoring matrix.
		 * \param int gapCreate - Gap creation penalty.
		 * \param int gapExtend - Gap extension penalty.
		 * \param bool debug - Defaults to false and is not required;
		 * presumably enables debug output as in SWScore -- TODO confirm.
		 */
		static float BitScore( int score, 
                            const string& matrixName,
                            int gapCreate,
                            int gapExtend,
							bool debug = false );

		/**
		 * Returns the score of the alignment.
		 */
		int getScore() const;

		/**
		 * Returns the first gapped sequence of the alignment.
		 */
		string getSeq1() const;

		/**
		 * Returns the second gapped sequence of the alignment.
		 */
		string getSeq2() const;

		/**
		 * Returns the alignment as an ascii string.
		 * \param bool compress - Whether the alignment string should be
		 * compressed in the fasta -m9c format. Defaults to true and
		 * not required.
		 */
		string getAscii(bool compress = true) const;

		/**
		 * Returns the alignment encoded as xml.
		 * Not currently used.  The schema (dtd) used is made up and should
		 * be changed to AGAVE or something similar.
		 */ 
		string getXml() const;


		/**
		 * Uncompresses an alignment string.
		 * Creates two gapped sequence strings based on the compressed
		 * (-m9c format) string and the reference sequences.
		 * \param string c - Compressed (-m9c format) string.
		 * \param string s1 - Empty string that will be assigned the first
		 * gapped sequence.
		 * \param string s2 - Empty string that will be assigned the second
		 * gapped sequence.
		 * \param string refSeq1 - Reference sequence 1.
		 * \param string refSeq2 - Reference sequence 2.
		 */
		void uncompress(const string& c, string& s1, string& s2,
						const string& refSeq1, const string& refSeq2 ) const;

		/**
		 * Compresses two gapped sequences.
		 * Uses the fasta -m9c format.  The format is a sentinel followed by
		 * the number of occurrences.  The sentinels are "=" which says include
		 * the characters in both sequences, "+" which says include a gap in
		 * the first sequence and the character from the second sequence, 
		 * and "-" which says include the character from the first
		 * sequence and a gap in the second sequence. Note that this
		 * compression method doesn't require the reference sequences to
		 * do the compression, but _does_ require them for uncompression.
		 * \param string s1 - First gapped sequence.
		 * \param string s2 - Second gapped sequence.
		 */
		string compress(const string& s1, const string& s2) const;

	protected:

		/**
		 * First gapped sequence of the alignment.
		 */
		string _seq1;

		/**
		 * Second gapped sequence of the alignment.
		 */
		string _seq2;

		/**
		 * Alignment score.
		 */
		int _score;

		/**
		 * Beginning index in sequence 1 for alignment.  
		 * It will be something other than 0 if the alignment is a local
		 * alignment.  Not used at the moment.
		 */
		int _begin1Index;

		/**
		 * Beginning index in sequence 2 for alignment.  
		 * It will be something other than 0 if the alignment is a local
		 * alignment.  Not used at the moment.
		 */
		int _begin2Index;

		/**
		 * Ending index in sequence 1 for alignment.  
		 * It will be something other than _seq1.length()-1 if the 
		 * alignment is a local alignment.  Not used at the moment.
		 */
		int _end1Index;

		/**
		 * Ending index in sequence 2 for alignment.  
		 * It will be something other than _seq2.length()-1 if the 
		 * alignment is a local alignment.  Not used at the moment.
		 */
		int _end2Index;
};

}

#endif

