package ca.ucalgary.seahawk.util;

/**
 * Utility methods to report properties of nucleic acid (DNA/RNA) and protein sequences.
 */

public class Sequence{

    // Letters that count towards a positive call ("core") and letters that are merely
    // tolerated (IUPAC ambiguity codes plus the gap character '-') for each sequence type.
    private static final String NA_CORE = "acgtunxACGTUNX";
    private static final String NA_TOLERATED = "acgtumrwsykvhdbnxACGTUMRWSYKVHDBNX-";

    private static final String DNA_CORE = "acgtnxACGTNX";
    private static final String DNA_TOLERATED = "acgtmrwsykvhdbnxACGTMRWSYKVHDBNX-";

    private static final String RNA_CORE = "acgunxACGUNX";
    private static final String RNA_TOLERATED = "acgumrwsykvhdbnxACGUMRWSYKVHDBNX-";

    // Lower-case 'x' is accepted just like upper-case 'X', and the stop symbol '*' is both
    // counted as a protein character and tolerated, so it no longer counts against itself.
    private static final String PROTEIN_CORE = "ARNDCQEGHILKMFPSTWYVBZXarndcqeghilkmfpstwyvbzx*";
    private static final String PROTEIN_TOLERATED = PROTEIN_CORE + "-";

    /**
     * Case insensitive check for DNA- or RNA-like characters in the provided string.
     * Whitespace is ignored; all fractions are relative to the non-whitespace length.
     *
     * @param sequence the text to classify, may be null
     * @return true if A, C, G, T, U, N and X comprise more than 66% of the sequence and
     *         characters that are neither IUPAC nucleotide codes nor '-' comprise less than 5%;
     *         false otherwise, including for null or whitespace-only input
     */
    public static boolean isNucleicAcid(String sequence){
        return hasComposition(sequence, NA_CORE, NA_TOLERATED, 0.66, 0.05);
    }

    /**
     * Case insensitive check for DNA-like characters in the provided string.
     * Whitespace is ignored; all fractions are relative to the non-whitespace length.
     *
     * @param sequence the text to classify, may be null
     * @return true if A, C, G, T, N and X comprise more than 66% of the sequence and
     *         characters that are neither DNA IUPAC codes nor '-' (this includes U)
     *         comprise less than 5%; false otherwise, including for null or
     *         whitespace-only input
     */
    public static boolean isDNA(String sequence){
        return hasComposition(sequence, DNA_CORE, DNA_TOLERATED, 0.66, 0.05);
    }

    /**
     * Case insensitive check for RNA-like characters in the provided string.
     * Whitespace is ignored; all fractions are relative to the non-whitespace length.
     * Note that a sequence containing neither T nor U can satisfy both this method
     * and {@link #isDNA(String)}.
     *
     * @param sequence the text to classify, may be null
     * @return true if A, C, G, U, N and X comprise more than 66% of the sequence and
     *         characters that are neither RNA IUPAC codes nor '-' (this includes T)
     *         comprise less than 5%; false otherwise, including for null or
     *         whitespace-only input
     */
    public static boolean isRNA(String sequence){
        return hasComposition(sequence, RNA_CORE, RNA_TOLERATED, 0.66, 0.05);
    }

    /**
     * Case insensitive check for amino acid characters in the provided string.
     * Whitespace is ignored; all fractions are relative to the non-whitespace length.
     *
     * @param sequence the text to classify, may be null
     * @return true if amino acid letters (including B, Z, X and the stop symbol '*')
     *         comprise more than 95% of the sequence and characters that are neither
     *         amino acid letters, '*' nor '-' comprise less than 2%; false otherwise,
     *         including for null or whitespace-only input
     */
    public static boolean isProtein(String sequence){
        return hasComposition(sequence, PROTEIN_CORE, PROTEIN_TOLERATED, 0.95, 0.02);
    }

    /**
     * Builds the reverse complement of a nucleotide sequence, preserving letter case.
     * IUPAC ambiguity codes are complemented as well. Any character that is not a
     * recognised nucleotide code (including whitespace and '-') is written as 'X'.
     * The input is assumed to be DNA (A pairs with T) unless it actually contains a
     * U and passes {@link #isRNA(String)}, in which case A pairs with U.
     *
     * @param sequence the sequence to reverse-complement, may be null
     * @return the reverse complement, with the same length as the input,
     *         or null if the input was null
     */
    public static String reverseComplement(String sequence){
        if(sequence == null){
            return null;
        }

        // isRNA() alone is also true for sequences with neither T nor U, so additionally
        // require a real U before giving up the DNA default.
        boolean rna = (sequence.indexOf('u') >= 0 || sequence.indexOf('U') >= 0) && isRNA(sequence);
        char aComp = rna ? 'u' : 't';
        char AComp = rna ? 'U' : 'T';

        int seqLen = sequence.length();
        char[] revComp = new char[seqLen];
        for(int i = 0; i < seqLen; i++){
            // Fill from the far end so the result comes out reversed.
            revComp[seqLen-i-1] = complement(sequence.charAt(i), aComp, AComp);
        }
        return new String(revComp);
    }

    /**
     * Shared composition test behind the isXxx methods: counts, in one pass over the
     * non-whitespace characters, how many are core letters and how many are not tolerated.
     */
    private static boolean hasComposition(String sequence, String coreChars, String toleratedChars,
                                          double minCoreFraction, double maxOtherFraction){
        if(sequence == null){
            return false;
        }
        int total = 0;
        int core = 0;
        int other = 0;
        for(int i = 0; i < sequence.length(); i++){
            char c = sequence.charAt(i);
            if(isWhitespace(c)){
                continue;
            }
            total++;
            if(coreChars.indexOf(c) >= 0){
                core++;
            }
            if(toleratedChars.indexOf(c) < 0){
                other++;
            }
        }
        if(total == 0){
            return false; // nothing to classify
        }
        double seqLen = (double) total; // not int, because we don't want integer division
        return core/seqLen > minCoreFraction && other/seqLen < maxOtherFraction;
    }

    /** Returns true for space, tab, line feed, vertical tab, form feed and carriage return. */
    private static boolean isWhitespace(char c){
        return c == ' ' || c == '\t' || c == '\n' || c == 0x0B || c == '\f' || c == '\r';
    }

    /** Returns the complement of one nucleotide code; aComp/AComp are the partners of 'a'/'A'. */
    private static char complement(char base, char aComp, char AComp){
        switch(base){
            case 'a': return aComp;
            case 'A': return AComp;
            case 'c': return 'g';
            case 'C': return 'G';
            case 'g': return 'c';
            case 'G': return 'C';
            case 't': // fall-through
            case 'u': return 'a';
            case 'T': // fall-through
            case 'U': return 'A';

            // IUPAC ambiguity codes below
            case 'b': return 'v';
            case 'B': return 'V';
            case 'd': return 'h';
            case 'D': return 'H';
            case 'h': return 'd';
            case 'H': return 'D';
            case 'k': return 'm';
            case 'K': return 'M';
            case 'm': return 'k';
            case 'M': return 'K';
            case 'r': return 'y';
            case 'R': return 'Y';
            case 's': return 's';
            case 'S': return 'S';
            case 'v': return 'b';
            case 'V': return 'B';
            case 'w': return 'w';
            case 'W': return 'W';
            case 'y': return 'r';
            case 'Y': return 'R';
            case 'n': return 'n';
            case 'N': return 'N';
            case 'x': return 'x';
            default:  return 'X'; // 'X' itself and anything unrecognised
        }
    }
}
