生物信息原理作业第三弹:DNA序列局部比对,利用Smith–Waterman算法,python3.6代码实现。
实例以及原理均来自https://en.wikipedia.org/wiki/Smith%E2%80%93Waterman_algorithm。
1 import numpy as np 2 import pandas as pd 3 sequence1 = ‘TGTTACGG‘ 4 sequence2 = ‘GGTTGACTA‘ 5 s1 = ‘‘ 6 s2 = ‘‘ 7 gap = -2 8 score_matrix = pd.read_excel(‘score.xlsx‘) #匹配得分 9 print(score_matrix) 10 best_matrix = np.empty(shape= (len(sequence2)+1,len(sequence1)+1),dtype = int) 11 def get_match_score(s1,s2): 12 score = score_matrix[s1][s2] 13 return score 14 15 def get_matrix_max(matrix): #得到最大分数下标 16 Max = matrix.max() 17 for i in range(len(sequence2)+1): 18 for j in range(len(sequence1)+1): 19 if matrix[i][j] == Max: 20 return (i,j) 21 22 for i in range(len(sequence2)+1): 23 for j in range(len(sequence1)+1): 24 if i == 0 or j == 0: 25 best_matrix[i][j] = 0 26 else: 27 match = get_match_score(sequence2[i-1],sequence1[j-1]) 28 gap1_score = best_matrix[i-1][j] + gap 29 gap2_score = best_matrix[i][j-1] + gap 30 match_score = best_matrix[i-1][j-1]+match 31 score = max(gap1_score,gap2_score,match_score) 32 if score>0: 33 best_matrix[i][j] = score 34 else: 35 best_matrix[i][j] = 0 36 print(best_matrix) 37 38 #traceback 39 i,j = get_matrix_max(best_matrix) 40 while(best_matrix[i][j]!= 0): 41 match = get_match_score(sequence2[i-1],sequence1[j-1]) 42 if i>0 and j>0 and best_matrix[i][j] == best_matrix[i-1][j-1]+match: 43 s1 += sequence1[j-1] 44 s2 += sequence2[i-1] 45 i-=1;j-=1 46 elif i>0 and best_matrix[i,j] == best_matrix[i-1,j]+gap: 47 s1+=‘-‘ 48 s2+=sequence2[i-1] 49 i-=1 50 else: 51 s1+=sequence1[j-1] 52 s2+=‘-‘ 53 j-=1 54 print(s1[::-1]+‘\n‘+s2[::-1])
感觉我的得分矩阵写成Excel不必要,等我熟悉一下Numpy和Python命令行之后会修改的。
时间: 2024-10-27 06:52:54