生物信息学原理作业第四弹:DNA序列组装(贪婪算法)
原理:生物信息学(孙啸)
大致思想:
1. 找到权值最大的边;
2. 除去以最大权值边的起始顶点为起始顶点的边;
3. 除去以最大权值边为终点为终点的边;
4. 重复上述步骤,得到所有符合条件的边;
5. 拼接得到的边;
6. 加入孤立点(如果有)。
附上Python代码,如果有问题我会及时更正(确实不太熟算法)
转载请保留出处!
1 # -*- coding: utf-8 -*- 2 """ 3 Created on Mon Dec 4 15:04:39 2017 4 @author: zxzhu 5 python3.6 6 """ 7 from functools import reduce 8 def get_weight(s1,s2): #通过两条序列的overlap计算出权值 9 l = min(len(s1),len(s2)) 10 while l>0: 11 if s2[:l] == s1[-l:]: 12 return l 13 else: 14 l-=1 15 return 0 16 17 def print_result(s1,s2): #将两条序列去除首尾overlap后合并 18 weight = get_weight(s1,s2) 19 s = s1 + s2[weight:] 20 #print(s) 21 return s 22 23 def dir_graph(l,t=3): #得到满足条件的有向图(权值大于等于t) 24 graph = {} 25 for i in l: 26 for j in l: 27 if i!=j: 28 weight = get_weight(i,j) 29 if weight >= t: 30 key = (i,j) 31 graph[key] = weight 32 return graph 33 34 def rm_path(graph,path): #贪婪算法加入一条边后应该去除与该边首尾顶点相同的边 35 key = graph.keys() 36 rm_key = [] 37 for i in key: 38 if i[1] == path[1] or i[0] == path[0]: 39 rm_key.append(i) 40 for i in rm_key: 41 graph.pop(i) 42 return graph 43 44 def get_path(graph,path = []): #得到满足条件的所有边 45 while graph: 46 max_weight = 0 47 for i in graph.keys(): 48 if graph[i] > max_weight: 49 max_weight = graph[i] 50 cur_path = i 51 path.append(cur_path) 52 graph = rm_path(graph,cur_path) 53 get_path(graph,path) 54 return path 55 56 def out_num(path,V): #计算某顶点的出度 57 count = 0 58 for i in path: 59 if i[0] == V: 60 count+=1 61 return count 62 63 def get_last_V(path,last_V = None): #得到最后一条边 64 index = 0 65 if last_V: #非随机寻找出度为0的顶点 66 for i in path: 67 if i[1] == last_V: 68 return i,index 69 else: 70 index+=1 71 return None #没有找到指向last_V的顶点 72 else: #随机寻找出度为0的顶点 73 for i in path: 74 if out_num(path,i[1]) == 0: 75 return i,index 76 else: 77 index+=1 78 79 def assemble(cur_V,path,new_path = []): #给满足条件的边排序 80 while path: 81 path.pop(cur_V[1]) 82 new_path.insert(0,cur_V[0]) 83 cur_V = get_last_V(path,last_V = cur_V[0][0]) 84 if cur_V: 85 assemble(cur_V,path,new_path) 86 else: 87 cur_V = get_last_V(path) 88 assemble(cur_V,path,new_path) 89 return new_path 90 91 def align_isolated(path,sequence): #加入孤立顶点 92 new_path = reduce(lambda x,y:x+y,path) 93 for i in sequence: 94 if i not in new_path: 95 new_path.append(i) 96 return new_path 97 98 99 x = ‘CCTTTTGG‘ 100 y = ‘TTGGCAATCACT‘ 101 w = ‘AGTATTGGCAATC‘ 102 u = ‘ATGCAAACCT‘ 103 z = ‘AATCGATG‘ 104 v = ‘TCACTCCTTTT‘ 105 a = ‘CATAA‘ 106 b = ‘ATCA‘ 107 c = ‘TGCAT‘ 108 sequence = [x,y,w,u,z] 109 sequence1 = [a,b,c] 110 graph = dir_graph(sequence1,t=2) 111 path = get_path(graph) 112 path = [list(i) for i in path] #将path中的tuple元素换成list 113 #print(path) 114 start = get_last_V(path) #起始出度为0的顶点所在的边 115 new_path = assemble(start,path) #排序后的边 116 #print(new_path) 117 new_path = align_isolated(new_path,sequence1) #加入孤立顶点 118 #print(new_path) 119 result = reduce(print_result,new_path) #组装 120 print(result)
时间: 2024-11-09 12:20:14