# Brute force motif search # ************************************************************************* # Find the consensus score for the l-mers in dna starting at positions in s # ************************************************************************* def Score(s, l, dna): t = len(dna) # t = number of dna sequences # Step 1: Extract the alignment corresponding to starting positions in s alignment = [] # TO DO: put the list of l-mers corresponding to s into alignment # Step 2: Create the corresponding profile matrix profile = [[],[],[],[]] # prepare an empty 4 x l profile matrix first for i in range(0, 4): profile[i] = [0] * l # TO DO: fill in profile with the profile matrix corresponding to alignment # profile[0] will be a list with the numbers of A's in each column # profile[1] will be a list with the numbers of T's in each column # profile[2] will be a list with the numbers of C's in each column # profile[3] will be a list with the numbers of G's in each column # Step 3: Compute the score from the profile matrix # TO DO: score = sum of maximum values in each column in profile return score # ************************************************************************* # Find the next list of starting positions after a # ************************************************************************* def NextLeaf(a, L, k): for i in range(L, 0, -1): # level L to 1 if a[i-1] < k - 1: a[i-1] = a[i-1] + 1 return a a[i-1] = 0 return a # ************************************************************************* # Brute force motif search - searches (n-l+1)^t possible alignments # t = number of dna sequences in dna # n = length of each dna sequence # l = length of motif (l-mer) to find # ************************************************************************* def BF_MotifSearch(dna, t, n, l): s = [0] * t # create initial starting positions [0, 0, ..., 0] bestScore = Score(s, l, dna) # compute the score for alignment based on s while 1: # loop forever (1 = true) s = NextLeaf(s, t, n-l+1) # increment the starting positions in s score = Score(s, l, dna) # compute the score for alignment based on s if score > bestScore: # if this score is the best so far, bestScore = score # remember the score bestMotif = s[:] # and the motif (the [:] makes a copy) if s == [0] * t: # if we've incremented all the way around, return (bestScore, bestMotif) # return our result # For testing: dna = ["atccagct", "gggcaact", "atggatct", "aagcaacc", "ttggaact", "atgccatt", "atggcact"] print BF_MotifSearch(dna, 7, 8, 3) # note that (n-l+1)^t = 6^7 = 279,936