#!/usr/bin/env python3 # Copyright (c) 2014 Lenwood S. Heath. All rights reserved. # This program or module is free software: you can redistribute it and/or # modify it under the terms of the GNU General Public License as published # by the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. It is provided for educational # purposes and is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. """ This program generates a FASTA file of sequences, each containing an embedded motif, according to command line arguments. """ import argparse import logging import math from random import randrange,random import sys from textwrap import wrap """ EmbedMotif does the work of generating a list of sequences, embedding a motif in them, and mutating the resulting sequences. """ def EmbedMotif(k,d,n,m,mu,A,C,G,T): S = GenerateSequences(n,m,A,C,G,T) (S,loci,motif) = InsertMotif(S,k,d,A,C,G,T) S = MutateSequences(S,mu,A,C,G,T) (profile,actual_motif) = ProfileMotif(S,loci,k,d) if motif != actual_motif: logging.error("Actual optimal motif " + actual_motif + " differs from generated motif " + motif) # Accumulate the actual nucleotide probabilities count = {nucleotide:0.0 for nucleotide in ["A","C","G","T"]} for sequence in S: for nucleotide in sequence: count[nucleotide] += 1.0 total = 0.0 for nucleotide in ["A","C","G","T"]: total += count[nucleotide] probability = {x:count[x]/total for x in ["A","C","G","T"]} log_likelihood = LogLikelihood(S,loci,motif,profile,probability) return S """ The basic job of GenerateSequences is to generate a list of sequences of the appropriate length, with each sequence having length in a range from 0.75*m to 1.25*m and with appropriate nucleotide frequences. """ def GenerateSequences(n,m,A,C,G,T): # Define lower and upper bounds of range for sequence length lower = math.ceil(0.75 * m) upper = math.floor(1.25 * m) S = [] for i in range(n): my_length = randrange(lower,upper) sequence = "" for j in range(my_length): sequence += RandomNucleotide(A,C,G,T) S.append(sequence) return S """ The job of InsertMotif is to create an appropriate random motif and to insert it at a random location in each sequence in a list of sequences. """ def InsertMotif(S,k,d,A,C,G,T): # First generate motif of length k motif = "" for i in range(k): motif += RandomNucleotide(A,C,G,T) # Now insert d don't cares j = d while j > 0: i = randrange(1,k-1) # Do not put a * at either end of motif if motif[i] != "*": j -= 1 motif = motif[0:i] + "*" + motif[i+1:k] logging.info("Chosen motif is " + motif) # Finally, insert motif at a random location in each sequence S_prime = [] loci = list() for sequence in S: my_length = len(sequence) lower = 0 upper = my_length - k + 1 my_position = randrange(lower,upper) logging.info("Position of motif: " + str(my_position+1)) loci.append(my_position) for i in range(k): if motif[i] != "*": sequence = (sequence[0:my_position+i] + motif[i] + sequence[my_position+i+1:my_length]) S_prime.append(sequence) return (S_prime,loci,motif) """ Start with a list of sequences and mutate a random selection of nucleotides in those sequences. """ def MutateSequences(S,mu,A,C,G,T): S_prime = [] number_mutations = 0 for sequence in S: my_length = len(sequence) mutated_sequence = "" for my_nucleotide in sequence: if random() < mu: new_nucleotide = RandomNucleotide(A,C,G,T) while new_nucleotide == my_nucleotide: new_nucleotide = RandomNucleotide(A,C,G,T) mutated_sequence += new_nucleotide number_mutations += 1 else: mutated_sequence += my_nucleotide S_prime.append(mutated_sequence) logging.info("Total number of mutations is " + str(number_mutations)) return S_prime """ Generate a random nucleotide according to a probability distribution. """ def RandomNucleotide(A,C,G,T): my_random = random() if my_random < A: return "A" elif my_random < A+C: return "C" elif my_random < A+C+G: return "G" else: return "T" """ The job of ProfileMotif is to determine the best motif and the corresponding profile for given loci in S. """ def ProfileMotif(S,loci,k,d): # First generate best motif of length k without don't cares motif = "" # Profile matrix is just count of nucleotides in each position # of alignment corresponding to loci. profile = {x:{y:0 for y in range(k)} for x in ["A","C","G","T"]} for i in range(len(S)): substring = S[i][loci[i]:loci[i]+k] for j in range(k): profile[substring[j]][j] += 1 # The best nucleotide in each position is the one with the highest # count. for j in range(k): best_nucleotide = "A" best_count = profile["A"][j] for nucleotide in ["C","G","T"]: count = profile[nucleotide][j] if count > best_count: best_count = count best_nucleotide = nucleotide motif += best_nucleotide # Now find best loci for d don't cares by counts in profile matrix best_list = [[profile[motif[j]][j],j,motif[j]] for j in range(k)] best_list.sort() # Primary sort is by profile matrix count dont_cares = 0 i = 0 while dont_cares < d: if best_list[i][1] not in {0,k-1}: best_list[i][2] = "*" # Lowest counts get the don't cares dont_cares += 1 i += 1 best_list = [[best_list[i][1],best_list[i][2]] for i in range(k)] best_list.sort() # Sort is now by position motif_list = [best_list[j][1] for j in range(k)] # Select nucleotides # and don't cares motif = "".join(motif_list) return (profile,motif) """ LogLikelihood computes the log likelihood of the given motif in the given loci. """ def LogLikelihood(S,loci,motif,profile,probability): n = len(S) log_likelihood = 0.0 for j in range(len(motif)): nucleotide = motif[j] if nucleotide == "*": continue log_likelihood += math.log2((profile[nucleotide][j]/n)/ probability[nucleotide]) logging.info("Evaluated motif " + motif) logging.info("Log likelihood is " + str(log_likelihood)) return log_likelihood def main(): """ The main program. Uses argparse to parse the command line arguments. Uses logging to record progress. Then call EmbedMotif to generate sequences. Finally, send sequences in FASTA format to output file. """ # Set up basic logging logging.basicConfig(filename="EmbedMotif.log",filemode="w", level=logging.INFO,format="%(levelname)s: %(asctime)s: %(message)s") logging.info("Starting EmbedMotif") # Process command line arguments argparser = argparse.ArgumentParser( description="Generate sequences with an embedded motif.") argparser.add_argument("FASTAfile",type=str,help="Output FASTA file") argparser.add_argument("-k",dest="k",type=int,default=6, help="Motif length") argparser.add_argument("-d",dest="d",type=int,default=0, help="Number of don't cares") argparser.add_argument("-n",dest="n",type=int,default=10, help="Number of sequences") argparser.add_argument("-m",dest="m",type=int,default=250, help="Average sequence length") argparser.add_argument("--mu",dest="mu",type=float,default=0.20, help="Probability of nucleotide mutation") argparser.add_argument("-a","-A",dest="A",type=float,default=0.25, help="Target probability for amino acid A") argparser.add_argument("-c","-C",dest="C",type=float,default=0.25, help="Target probability for amino acid C") argparser.add_argument("-g","-G",dest="G",type=float,default=0.25, help="Target probability for amino acid G") args = argparser.parse_args() logging.info("Arguments sucessfully parsed") FASTAfile = args.FASTAfile out_file = open(FASTAfile,mode="w") logging.info("Output FASTA file is " + FASTAfile) k = args.k logging.info("Motif length is " + str(k)) if k <= 1: raise ValueError("k must be at least 2") d = args.d logging.info("Number of don't cares is " + str(d)) if d < 0 or d > k-2: raise ValueError("d must be between 0 and k-2") n = args.n logging.info("Number of sequences is " + str(n)) if n <= 1: raise ValueError("n must be at least 2") m = args.m logging.info("Average sequence length is " + str(m)) if math.ceil(0.75 * m) < k: raise ValueError("k cannot exceed 0.75*m") mu = args.mu logging.info("Probability of nucleotide mutation is " + str(mu)) if mu < 0.0: raise ValueError("mu must be non-negative") A = args.A logging.info("Probability of A is " + str(A)) if A <= 0.0 or A >= 1.0: raise ValueError("A must be strictly between 0 and 1") C = args.C logging.info("Probability of C is " + str(C)) if C <= 0.0 or C >= 1.0: raise ValueError("C must be strictly between 0 and 1") G = args.G logging.info("Probability of G is " + str(G)) if G <= 0.0 or G >= 1.0: raise ValueError("G must be strictly between 0 and 1") T = 1 - (A+C+G) logging.info("Probability of T is " + str(T)) if T <= 0.0: raise ValueError("A+C+G must be strictly less than 1") # Generate set of sequences with embedded motif S = EmbedMotif(k,d,n,m,mu,A,C,G,T) # Counting nucleotides generates some basic statistics count = {nucleotide:0.0 for nucleotide in ["A","C","G","T"]} # Output FASTA file while accumulating statistics for i in range(len(S)): sequence = S[i] print(">Sequence{0} length {1}".format(i+1,len(sequence)), file=out_file) print("\n".join(wrap(sequence)),file=out_file) for nucleotide in sequence: count[nucleotide] += 1.0 total = 0.0 for nucleotide in ["A","C","G","T"]: logging.info("Count of {0}'s is ".format(nucleotide) + str(int(count[nucleotide]))) total += count[nucleotide] logging.info("Total nucleotide count is " + str(int(total))) probability = {x:count[x]/total for x in ["A","C","G","T"]} for nucleotide in ["A","C","G","T"]: logging.info("Probability of {0} is ".format(nucleotide) + str(probability[nucleotide])) if __name__ == "__main__": main()