#!/usr/bin/env python3 # Copyright (c) 2014 Lenwood S. Heath. All rights reserved. # This program or module is free software: you can redistribute it and/or # modify it under the terms of the GNU General Public License as published # by the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. It is provided for educational # purposes and is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. """ This program reads a FASTA file of sequences and tries to identify a shared motif using random sampling. """ import argparse import logging import math from random import randrange,random import re import sys import time """ RandomMotifFinder does the work of random sampling of loci in a list of sequences to find the best motif. It uses log likelihood to score a set of loci. """ def RandomMotifFinder(FASTAfile,k,d,runtime): first_time = time.time() # Current time in seconds S = ReadFASTA(FASTAfile) # Accumulate and log basic statistics count = {nucleotide:0.0 for nucleotide in ["A","C","G","T"]} for sequence in S: for nucleotide in sequence: count[nucleotide] += 1.0 total = 0.0 for nucleotide in ["A","C","G","T"]: logging.info("Count of {0}'s is ".format(nucleotide) + str(int(count[nucleotide]))) total += count[nucleotide] logging.info("Total nucleotide count is " + str(int(total))) probability = {x:count[x]/total for x in ["A","C","G","T"]} for nucleotide in ["A","C","G","T"]: logging.info("Probability of A is ".format(nucleotide) + str(probability[nucleotide])) # Generate random samples and check log likelihoods of corresponding # motifs motif = None loci = [0 for x in range(len(S))] log_likelihood = -10.0 while time.time() - first_time < runtime: # Run for runtime seconds new_loci = RandomSample(S,k) (new_motif,new_profile) = ProfileMotif(S,new_loci,k,d) new_log_likelihood = LogLikelihood(S, new_loci, new_motif, new_profile, probability) if new_log_likelihood > log_likelihood: logging.info("Log likelihood is now " + str(new_log_likelihood)) logging.info("\t\tfor motif " + new_motif) loci = new_loci motif = new_motif log_likelihood = new_log_likelihood return (motif,loci,log_likelihood) """ Find one random sample of loci. """ def RandomSample(S,k): loci = list() for sequence in S: loci.append(randrange(0,len(sequence)-k+1)) return loci """ The job of ProfileMotif is to determine the best motif and the corresponding profile for given loci in S. """ def ProfileMotif(S,loci,k,d): # First generate best motif of length k without don't cares motif = "" # Profile matrix is just count of nucleotides in each position # of alignment corresponding to loci. profile = {x:{y:0 for y in range(k)} for x in ["A","C","G","T"]} for i in range(len(S)): substring = S[i][loci[i]:loci[i]+k] for j in range(k): profile[substring[j]][j] += 1 # The best nucleotide in each position is the one with the highest # count. for j in range(k): best_nucleotide = "A" best_count = profile["A"][j] for nucleotide in ["C","G","T"]: count = profile[nucleotide][j] if count > best_count: best_count = count best_nucleotide = nucleotide motif += best_nucleotide # Now find best loci for d don't cares by counts in profile matrix best_list = [[profile[motif[j]][j],j,motif[j]] for j in range(k)] best_list.sort() # Primary sort is by profile matrix count dont_cares = 0 i = 0 while dont_cares < d: if best_list[i][1] not in {0,k-1}: best_list[i][2] = "*" # Lowest counts get the don't cares dont_cares += 1 i += 1 best_list = [[best_list[i][1],best_list[i][2]] for i in range(k)] best_list.sort() # Sort is now by position motif_list = [best_list[j][1] for j in range(k)] # Select nucleotides # and don't cares motif = "".join(motif_list) return (motif,profile) """ LogLikelihood computes the log likelihood of the given motif in the given loci. """ def LogLikelihood(S,loci,motif,profile,probability): n = len(S) log_likelihood = 0.0 for j in range(len(motif)): nucleotide = motif[j] if nucleotide == "*": continue log_likelihood += math.log2((profile[nucleotide][j]/n)/ probability[nucleotide]) logging.info("Evaluated motif " + motif) logging.info("Log likelihood is " + str(log_likelihood)) return log_likelihood """ Read a FASTA file and return the sequences as a list. """ def ReadFASTA(FASTAfile): header = "" # A header line starts with ">" sequence = "" # Accumulated sequence is initially empty list_of_sequences = list() # Initial empty list of sequences for line in open(FASTAfile): line = line.rstrip() # Eliminate new line and trailing white space if len(line) == 0: # Ignore blank lines continue elif line[0] == ">": # Process a header line if header: if sequence: list_of_sequences.append(sequence) else: raise ValueError("FASTA header without sequence\n" + header) header = line logging.info("header line found:\n\t" + header) sequence = "" elif re.match(r"^[ACGT]*$",line): # Process a sequence line if not header: raise ValueError("FASTA sequence without header\n" + line) sequence += line else: # Line not valid FASTA format raise ValueError("Unrecognized FASTA format line\n" + line) if sequence: list_of_sequences.append(sequence) else: raise ValueError("FASTA file does not end with sequence") return list_of_sequences def main(): """ The main program. Uses argparse to parse the command line arguments. Uses logging to record progress. Then call RandomMotifFinder to find the best motif by random sampling. Finally, send best motif and loci to standard output. """ # Set up basic logging logging.basicConfig(filename="RandomMotifFinder.log",filemode="w", level=logging.INFO,format="%(levelname)s: %(asctime)s: %(message)s") logging.info("Starting RandomMotifFinder") # Process command line arguments argparser = argparse.ArgumentParser( description="Use random sampling to find a motif.") argparser.add_argument("FASTAfile",type=str,help="Input FASTA file") argparser.add_argument("-k",dest="k",type=int,default=6, help="Motif length") argparser.add_argument("-d",dest="d",type=int,default=0, help="Number of don't cares") argparser.add_argument("-t","--time",dest="runtime",type=int, default=2, help="Maximum run time in seconds") args = argparser.parse_args() logging.info("Arguments sucessfully parsed") FASTAfile = args.FASTAfile logging.info("Input FASTA file is " + FASTAfile) k = args.k logging.info("Motif length is " + str(k)) if k <= 1: raise ValueError("k must be at least 2") d = args.d logging.info("Number of don't cares is " + str(d)) if d < 0 or d > k-2: raise ValueError("d must be between 0 and k-2") runtime = args.runtime logging.info("Maximum run time is " + str(runtime) + " seconds") if runtime <= 1: raise ValueError("Run time must be at least 2") # Randomly sample loci; return best motif, loci, and log likelihood (motif,loci,log_likelihood) = RandomMotifFinder(FASTAfile,k,d,runtime) # Print results to standard output print("Best motif of length {0} with {1} don't cares is {2}".format( k,d,motif)) print("Log likelihood is {0}".format(log_likelihood)) print("Loci of the best motif are here:") for locus in loci: print(locus+1) if __name__ == "__main__": main()