
/****************************************************************************** 
 * 
 *  file:  AlignmentGenerator.h
 * 
 *  Copyright (c) 2003,  University of Virginia.
 *  All rights reserved.
 * 
 *  See the file COPYRIGHT in the top directory of this distribution for
 *  more information.
 *  
 *  THE SOFTWARE IS PROVIDED _AS IS_, WITHOUT WARRANTY OF ANY KIND, EXPRESS 
 *  OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
 *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
 *  THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
 *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
 *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
 *  DEALINGS IN THE SOFTWARE.  
 *  
 *****************************************************************************/ 


#ifndef ALIGNMENTGENERATOR_H
#define ALIGNMENTGENERATOR_H

#include <time.h>
#include <algorithm>
#include <cstdlib>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <list>
#include <map>
#include <string>
#include <vector>
#include <ScoringMatrix.h>
#include <GenericException.h>
#include <XMLable.h>
#include <XML.h>
#include <Tostring.h>
#include "Alignment.h"
#include <Debug.h>
#include "Whence.h"

using namespace std;

namespace NOPT { 

/**
 * Direction flag: diagonal move (a match).
 * Occupies bit 0 of the directional bitmask held by Whence objects.
 */
const int DIAG = 1 << 0;

/**
 * Direction flag: a gap is created going down.
 * Occupies bit 1 of the directional bitmask held by Whence objects.
 */
const int DOWN = 1 << 1;

/**
 * Direction flag: a gap is created going across.
 * Occupies bit 2 of the directional bitmask held by Whence objects.
 */
const int ACROSS = 1 << 2;

/**
 * Direction flag: an existing gap is extended going down.
 * Occupies bit 3 of the directional bitmask held by Whence objects.
 */
const int EXTENDED_DOWN = 1 << 3;

/**
 * Direction flag: an existing gap is extended going across.
 * Occupies bit 4 of the directional bitmask held by Whence objects.
 */
const int EXTENDED_ACROSS = 1 << 4;

/**
 * Direction flag: end position.
 * Occupies bit 5 of the directional bitmask held by Whence objects.
 * It exists solely to signal where a local alignment ends.
 */
const int END_POSITION = 1 << 5;

/**
 * Read-only iterator over a std::map<std::string, unsigned long long>,
 * the same map type used by AlignmentGenerator::_edgeCounts.
 * The names are qualified with std:: so that the alias does not depend
 * on a using-directive being in effect where this header is included.
 */
typedef std::map<std::string, unsigned long long>::const_iterator MLL;

/**
 * Near optimal alignment generation base class.
 * This class provides some useful methods for generating near optimal
 * alignments and declares some of the common data used for specifying
 * near optimal alignments.  It extends XMLable to easily output xml.
 */
class AlignmentGenerator : public XMLable
{
	public:

		/**
		 * Constructor.
		 * This constructor primarily sets the data values to what is
		 * passed in and does a few sanity checks.
		 * \param int gapCreate - Gap creation penalty value.
		 * \param int gapExtend - Gap extension penalty value.
		 * \param ScoringMatrix sm - Scoring matrix to be used for 
		 * the alignment.
		 * \param string seq1 - The first sequence to be aligned.
		 * \param string seq2 - The second sequence to be aligned.
		 * \param float lowerBoundPercent - Value between 0 and 1 specifying
		 * the lower bound of the near optimal neighborhood.
		 * \param float upperBoundPercent - Value between 0 and 1 specifying
		 * the upper bound of the near optimal neighborhood.
		 * \param int debug - Debug bitmask value.  Used for outputting
		 * different levels of debug information.
		 * \param bool local - True indicates that the alignments generated
		 * should be true.
		 */
		AlignmentGenerator(int gapCreate, int gapExtend, ScoringMatrix* sm,
						   bool isDNA, const string& seq1, 
						   const string& seq2, float lowerBoundPercent, 
						   float upperBoundPercent, int debug, bool local,
						   const string& algName );

		/**
		 * Destructor.
		 */
		virtual ~AlignmentGenerator();

		/**
		 * Writes ASCII formatted alignments to STDOUT.
		 */
		virtual void getAscii(bool compress = true); 

		/**
		 * Returns a string of the alignments as xml.
		 */
		virtual string getXml() const; 

		/**
		 * Returns true if the class can generate XML output.
		 */
		virtual bool xmlable() const;

		/**
		 * Counts the number of times an edge occurs in a set of alignments.
		 * Lists each edge and its number of occurances.
		 * \param bool showStatus - True will print out status information
		 * as the alignments are counted.  This is useful when counting
		 * edges for all alignments (Waterman-Byers) for distantly related 
		 * sequences (meaning there are lots of alignments).  False just
		 * prints the edge count pair.
		 */
		void getCounts( bool showStatus );

		
		/**
		 * Returns the name of the alignment algorithm used.
		 */
		string getAlgorithmName();

	protected:

		/**
		 * Calculates the boundary scores.
		 * Based on the upper and lower bound percents and the optimal score
		 * this method calcuates what the upper and lower bound scores
		 * are.
		 */
		void _calculateBounds();

		/**
		 * Uses standard dynamic programming algorithm to fill a score matrix.
		 * An implementation of Gotoh's dp algorithm for calculating the
		 * score matrix. In addition to the score matrix this method can
		 * track how each edge was calcuated so that alignments can be 
		 * generated by backtracking through.  Finally, weights can be applied
		 * to edges in the score matrix if desired.  The direction matrix
		 * and weights are applied depending on the size of data structures
		 * passed in.  If they have no size, they're not considered.
		 * \param string seq1 - The first ungapped sequence used to be 
		 * aligned. 
		 * \param string seq2 - The second ungapped sequence used to be 
		 * aligned. 
		 * \param vector< vector<int> > scores - The matrix that the scores
		 * will be stored in.
		 * \param vector< vector<Whence> > directions - The matrix that the
		 * direction will be stored in.  If this is size 0, directions will
		 * not be tracked.
		 * \param map<string,int> weights - The map that contains the weights
		 * to be applied to given edges.  If the map has size 0, no weights
		 * are applied. Note that this method does not update weights, that
		 * must be done elsewhere.  The key to this map is a string comprised
		 * of the edge indices and the characters of the sequence.
		 */
		void _fill( const string& seq1, const string& seq2,
                    vector< vector<int> >& scores,
                    vector< vector<Whence> >& directions, 
                    map<string,int>& weights );

		/**
		 * Uses standard dynamic programming algorithm to fill a score matrix.
		 * This calls the fully specified _fill method with a weights
		 * map of size 0.  Simply a convenience method.
		 * \param string seq1 - The first ungapped sequence used to be 
		 * aligned. 
		 * \param string seq2 - The second ungapped sequence used to be 
		 * aligned. 
		 * \param vector< vector<int> > scores - The matrix that the scores
		 * will be stored in.
		 * \param vector< vector<Whence> > directions - The matrix that the
		 * direction will be stored in.  Presumably this will not have a
		 * size 0, otherwise you'd use the other _fill.
		 */
		void _fill( const string& seq1, const string& seq2,
                    vector< vector<int> >& scores,
                    vector< vector<Whence> >& directions );

		/**
		 * Uses standard dynamic programming algorithm to fill a score matrix.
		 * This calls the fully specified _fill method with a weights
		 * map of size 0 and directions matrix of size 0.  Simply a 
		 * convenience method.
		 * \param string seq1 - The first ungapped sequence used to be 
		 * aligned. 
		 * \param string seq2 - The second ungapped sequence used to be 
		 * aligned. 
		 * \param vector< vector<int> > scores - The matrix that the scores
		 * will be stored in.
		 */
		void _fill( const string& seq1, const string& seq2,
                    vector< vector<int> >& scores );

		/**
		 * Creates a key to the weights map.
		 * Convenience method to create a string out of the edge indices
		 * and characters of the sequences.
		 * \param int i - The edge index of the first sequence character.
		 * \param int j - The edge index of the second sequence character.
		 * \param char s1 - The character of the first sequence at index i.
		 * \param char s2 - The character of the second sequence at index j.
		 */
		string _createKey( int i, int j, char s1, char s2 );

		/**
		 * A template method to output and arbitrary matrix to stdout.
		 * This is a template method that writes a matrix of arbitrary type
		 * (T) to stdout.  This is primarily used for debugging.
		 * \param vector< vector<T> > - The matrix to write to stdout.
		 * \param int i - Usually the down index of the matrix, but can be 
		 * anything. Used for debugging.
		 * \param int j - Usually the across index of the matrix, but can be 
		 * anything. Used for debugging.
		 * \param char* s - Usually the name of the matrix, but can be
		 * anything.
		 */
		template <class T>
		void _showMatrix(const vector< vector<T> >& v, 
						 int i, int j, const char* s )
		{
			cout << "matrix " << s << " i:" << i << " j: " << j << endl;
			for ( int i = 0; (unsigned int)i < v.size(); i++ )
			{
				for ( int j = 0; (unsigned int)j < v[i].size(); j++ )
					cout << setw(4) << v[i][j] << " ";
				cout << endl ;
			}
			cout << endl;
		}

		/**
		 * The method that counts a single alignment.
		 * This method takes the two strings passed in and creates an
		 * edge for each index.  The edge is then counted.
		 * \param string s1 - The first (gapped) sequence of the alignment.
		 * \param string s2 - The second (gapped) sequence of the alignment.
		 */
		void _countAlignment( const string& s1, const string& s2 );

		/**
		 * The actual method that does the counting.
		 * Can be overridden by children that need to count differently
		 * than the default.
		 */
		virtual void _doCounts();


		/**
		 * A method that creates a simple checksum for a given string.
		 * \param string s - The string to be summed.
		 */
		int _checkSum( const string& s );

		/**
		 * A method called in the constructor that creates _paramString
		 * and _paramSum.
		 */
		void _createParamString();

		//------------------------------------------------------------------
		// Command line options.
		//------------------------------------------------------------------

		/**
		 * The first sequence to be aligned.
		 */
		string _seq1;

		/**
		 * The second sequence to be aligned.
		 */
		string _seq2;

		/**
		 * Boolean value indicating whether or not the sequence are DNA. 
		 */
		bool _isDna;

		/**
		 * Gap create value. 
		 * Affine gap penalties: penalty = gapCreate + gapExtend * numGaps.
		 */
		int _gapCreate;

		/**
		 * Gap extend value. 
		 * Affine gap penalties: penalty = gapCreate + gapExtend * numGaps.
		 */
		int _gapExtend;

		/**
		 * Upper bound percentage.
		 * Between 0 and 1.  Upper must be greater than or equal to lower.
		 */
		float _upperBoundPercent;

		/**
		 * Lower bound percentage.
		 * Between 0 and 1.  Lower must be less than or equal to upper.
		 */
		float _lowerBoundPercent;

		/**
		 * Scoring matrix name. 
		 */
		string _smName; 

		/**
		 * Number of alignments generated. 
		 */
		unsigned long long _numAlignments;

		/**
		 * Debug bitmask.
		 * See Debug.h for values.
		 */
		int _debug;

		/**
		 * Whether local alignments should be calculated instead of global.
		 */
		bool _local;

		/**
		 * Pointer to a ScoringMatrix object. 
		 */
		ScoringMatrix* _sm;

		/**
		 * Vector of alignments generated.
		 * It should be noted that this is not necessarily populated.
		 */
		vector<Alignment> _alignments;

		//------------------------------------------------------------------
		// Other values.
		//------------------------------------------------------------------
		
		/**
		 * The lower bound score.
		 * Calculated based on the optimal score and the lower bound
		 * percentage.
		 */
		int _lowerBound;
		
		/**
		 * The upper bound score.
		 * Calculated based on the optimal score and the upper bound 
		 * percentage.
		 */
		int _upperBound;
		
		/**
		 * The optimal score.
		 * Usually set after the first pass through the _fill method.
		 */
		int _optimalScore;

		/**
		 * A map of edges to counts.
		 * The edges is represented as a string which points to a count.
		 */
		map<string,unsigned long long> _edgeCounts;

		/**
		 * Whether or not to perform the count.
		 * Alternatively, enumerate all of the alignments.  
		 * Only applicable to some children.
		 */
		bool _performCount;

		/**
		 * The running count of alignments.
		 * Only used for generating the verbose output of edge counting.
		 */
		int _runningCount;

		/**
		 * The time that the edge counting begins.
		 */
		clock_t _beginTime;

		/**
		 * Whether or not to show status info as alignments are counted.
		 */
		bool _showStatus;

		/**
		 * A string that identifies all of the parameters used. Used in the
		 * output.
		 */
		string _paramString;

		/**
		 * The checksum of the parameter string used in the output 
		 * to identify the parameters used by a particular alignment.
		 */
		int _paramSum;

		/**
		 * The name of the algorithm used to generate the set of alignments.
		 */
		string _algorithmName;

};

}

#endif
