/* This is an example HY-PHY Batch File.



   It reads in a '#' nucleotide dataset data/ubiq.flt and estimates

   maximum ln-likelihood based on a (hypothetic) partition of the

   data into 2 exons and 1 intron. MG94 codon model is applied to

   coding regions, whereas HKY85 is used for non-coding regions.

   

   Output is printed out as a Newick Style tree with branch lengths

   representing the number of expected substitutions per branch (which

   is the default setting for nucleotide models w/o rate variation).

   

   

   Sergei L. Kosakovsky Pond and Spencer V. Muse 

   December 1999. 

*/



/* 1. Read in the data and store the result in a DataSet variable.*/



DataSet 		sequences = ReadDataFile ("data/ubiq.flt");

   

/* 2. Filter the data. We create two coding partitions, 1-99 and 199-end, treated as codons in std genetic code; and one non-coding partition, 100-198.

*/

	  

DataSetFilter	exon1 = CreateFilter (sequences,3,"0-98", "","TAA,TAG,TGA");

DataSetFilter	intron = CreateFilter(sequences,1,"99-197");

DataSetFilter	exon2 = CreateFilter (sequences,3,siteIndex>197,"","TAA,TAG,TGA");



/* 5. Collect observed nucleotide frequencies from the filtered data. observedFreq will store the vector of frequencies. Since MG94 used nucleotide frequencies, we collect those. */



HarvestFrequencies (observedFreq1, exon1, 1, 1, 1);

HarvestFrequencies (observedFreq2, exon2, 1, 1, 1);

HarvestFrequencies (intronFreqs, intron, 1, 1, 1);



/* 3. Include the model definition file. The file defines the model 2 MG94 models (one for each exon) and the function BuildCodonFrequencies which estimates equilibrium codon frequencies given nucleotide frequencies. The reason each exon needs its own version of MG94 is that observed nucleotide frequencies are intergrated into the rate matrix in a fairly irregular way, so different obs. freqs. for each exon give rise two 2 rate matrices. The way the models defined the _require_ the nucl. freqs to be stored in variables observedFreq1 and observedFreq2. */



#include "Models/MG94x2.mdl";



/*7.  Build codon frequencies for each exon*/



codonFreqs1 = BuildCodonFrequencies (observedFreq1);

codonFreqs2 = BuildCodonFrequencies (observedFreq2);



/*8.	Define the models for each exon. */

Model MG94model1 = (MG94_1,codonFreqs1,0);

Model MG94model2 = (MG94_2,codonFreqs2,0);



/* 4  define the HKY85 model */



HKY85RateMatrix = 

		{{*,trvs,trst,trvs}

		 {trvs,*,trvs,trst}

		 {trst,trvs,*,trvs}

		 {trvs,trst,trvs,*}};

		 

Model HKY85	 = (HKY85RateMatrix, intronFreqs);





/*6. 	We will need a separate tree for each partition. */



UseModel (MG94model1);	  
Tree	exon1Tree = (BLYMUB1,(BLYMUB2,(RICRMA630,(ZMUBIS27F,(ZMU29160,ZMU29161)))));
UseModel (MG94model2);	  
Tree	exon2Tree = (BLYMUB1,(BLYMUB2,(RICRMA630,(ZMUBIS27F,(ZMU29160,ZMU29161)))));
UseModel (HKY85);	  
Tree	intronTree = (BLYMUB1,(BLYMUB2,(RICRMA630,(ZMUBIS27F,(ZMU29160,ZMU29161)))));



/*9.  Since all the likelihood function ingredients (data, tree, equilibrium frequencies) have been defined we are ready to construct the likelihood function. */

	  

LikelihoodFunction  theLnLik = (exon1, exon1Tree,
				        		exon2, exon2Tree,	                         
				        		intron,intronTree);

/*8.  Maximize the likelihood function, storing parameter values in the matrix paramValues.
	  This will take a few minutes! */

Optimize (paramValues, theLnLik);

/*9.  Print the tree with optimal branch lengths to the console. */

fprintf  (stdout, theLnLik);

