Zuker algorithm¶
The basic idea: minimum free energy (MFE)
Free energy minimization of RNA secondary structure.- In cells, RNAs are likely to form energetically stable secondary structure.
- The correctness of secondary structure should be evaluated based on the free energy, rather than the number of base-pairs.
- The free energy of a secondary structure is approximated as the sum of the free energy of "loops".
※ If you want know more, please look at here!!
Implementation¶
In [1]:
import numpy as np
In [2]:
inf = 1e3
a = 6
b = -1
c = 0.1
Stacking¶
# | G:C | A:U/G:U |
---|---|---|
G:C | -3.0 | -2.0 |
A:U/G:U | -2.0 | -0.5 |
In [3]:
stacking_score = np.array([
[-3.0, -2.0],
[-2.0, -0.5]
])
Hairpin, Internal, Bulge loop¶
長さ(nt) | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31以上 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Hairpin | Inf | Inf | 4.4 | 4.6 | 4.7 | 4.4 | 5.0 | 4.5 | 5.4 | 5.5 | 5.6 | 5.7 | 5.8 | 5.9 | 5.9 | 6.0 | 6.1 | 6.1 | 6.2 | 6.2 | 6.3 | 6.3 | 6.4 | 6.4 | 6.5 | 6.5 | 6.5 | 6.6 | 6.6 | 6.7 | Inf |
Internal | Inf | 1.0 | 1.0 | 1.1 | 2.0 | 2.0 | 2.1 | 2.3 | 2.4 | 2.5 | 2.6 | 2.7 | 2.8 | 2.9 | 2.9 | 3.0 | 3.1 | 3.1 | 3.2 | 3.3 | 3.3 | 3.4 | 3.4 | 3.5 | 3.5 | 3.5 | 3.6 | 3.6 | 3.7 | 3.7 | Inf |
Buldge | 1.0 | 1.0 | 1.0 | 1.1 | 2.0 | 2.0 | 2.1 | 2.3 | 2.4 | 2.5 | 2.6 | 2.7 | 2.8 | 2.9 | 2.9 | 3.0 | 3.1 | 3.1 | 3.2 | 3.3 | 3.3 | 3.4 | 3.4 | 3.5 | 3.5 | 3.5 | 3.6 | 3.6 | 3.7 | 3.7 | Inf |
※ I think this definition of Free energy is strange because if there constructs secondary structure, the total free energy will increase, it means unstable.
In [4]:
Hairpin_score = np.array([inf, inf, 4.4, 4.6, 4.7, 4.4, 5.0, 4.5, 5.4, 5.5, 5.6, 5.7, 5.8, 5.9, 5.9, 6.0, 6.1, 6.1, 6.2, 6.2, 6.3, 6.3, 6.4, 6.4, 6.5, 6.5, 6.5, 6.6, 6.6, 6.7])
In [5]:
Internal_score = np.array([inf, 1.0, 1.0, 1.1, 2.0, 2.0, 2.1, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 2.9, 3.0, 3.1, 3.1, 3.2, 3.3, 3.3, 3.4, 3.4, 3.5, 3.5, 3.5, 3.6, 3.6, 3.7, 3.7])
In [6]:
Buldge_score = np.array([1.0, 1.0, 1.0, 1.1, 2.0, 2.0, 2.1, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 2.9, 3.0, 3.1, 3.1, 3.2, 3.3, 3.3, 3.4, 3.4, 3.5, 3.5, 3.5, 3.6, 3.6, 3.7, 3.7])
In [7]:
class Zuker():
def __init__(self,Array,a,b,c,hairpin,stacking,internal_loop,buldge_loop,inf=1e3):
"""initialization"""
self.arr = Array
self.inf = inf
self.N = len(Array)
self.W = np.full((self.N,self.N), self.inf)# np.zeros((self.N,self.N),dtype=int)
self.V = np.full((self.N,self.N), self.inf)
self.M = np.full((self.N,self.N), self.inf)
self.M1 = np.full((self.N,self.N), self.inf)
"""parameter"""
self.hairpin = hairpin
self.stacking = stacking
self.internal_loop = internal_loop
self.buldge_loop = buldge_loop
self.a=a; self.b=b; self.c=c
def _is_bp(self,i,j,DNA,Wobble):
"""check if i to j forms a base-pair."""
bases = self.arr[i] + self.arr[j]
# Watson-Crick
WC_1 = ("A" in bases) and ("T" in bases) if DNA else ("A" in bases) and ("U" in bases)
WC_2 = ("G" in bases) and ("C" in bases)
# Wobble
Wob = ("G" in bases) and ("U" in bases) if Wobble else 0
return 0 if WC_2 else 1 if (WC_1 or Wob) else 2
def _calW(self,i,j): return min(self.W[i+1][j],self.W[i][j-1],self.V[i][j],min([self.W[i][k] + self.W[k+1][j] for k in range(i,j)]))
def _calV(self,i,j,DNA,Wobble): return min(self._F1(i,j),min([self.inf]+[self._F2(i,j,h,l,DNA,Wobble)+self.V[h][l] for h in range(i+1,j) for l in range(h+1,j) if self._is_bp(h,l,DNA=DNA,Wobble=Wobble)<2]),self.M[i+1][j-1] + self.a + self.b) if self._is_bp(i,j,DNA=DNA,Wobble=Wobble)<2 else self.inf
def _calM(self,i,j): return min([self.M1[i][k] + self.M1[k+1][j] for k in range(i,j)])
def _calM1(self,i,j): return min(self.M[i][j], self.V[i][j]+self.b, self.M1[i+1][j]+self.c, self.M1[i][j-1]+self.c)
def _F1(self,i,j):
nt=j-i-1
return self.hairpin[nt] if nt<30 else self.inf
def _F2(self,i,j,h,l,DNA,Wobble):
nt = (h-i-1)+(j-l-1)
if nt>=30: val = self.inf
elif (h==i+1) and (l==j-1): val = self.stacking[self._is_bp(i,j,DNA=DNA,Wobble=Wobble)][self._is_bp(i+1,j-1,DNA=DNA,Wobble=Wobble)]
elif (i+1<h<l<j-1): val = self.internal_loop[nt]
else: val = self.buldge_loop[nt]
return val
def fit(self,DNA=True,Wobble=True):
"""calcurate DP matrix.(recursion)"""
for ini_i in reversed(range(self.N)):
diff = self.N-ini_i
for i in reversed(range(ini_i)):
j = i+diff
self.V[i][j] = self._calV(i,j,DNA=DNA,Wobble=Wobble)
self.M[i][j] = self._calM(i,j)
self.M1[i][j] = self._calM1(i,j)
self.W[i][j] = self._calW(i,j)
def trackback(self,DNA=True,Wobble=True):
"""trackback to find which bases form base-pairs."""
bp = []
stack = [(0,self.N-1)]
while(stack):
i,j = stack.pop(0)
if (i>=j): continue
elif self.W[i+1][j] == self.W[i][j]: stack.append((i+1,j))
elif self.W[i][j-1] == self.W[i][j]: stack.append((i,j-1))
elif self.V[i][j] == self.W[i][j]:
bp.append((i,j))
if self.V[i][j] == self._F1(i,j): continue
if self.V[i][j] == self.M[i+1][j-1]+self.a+self.b:
for k in range(i+1,j-1):
if self.M1[i][k] + self.DP[k][j] == self.M[i+1][j-1]:
stack.append((k+1,j))
stack.append((i,k))
break
else:
flag=0
for h in range(i+1,j):
for l in range(h+1,j):
if self._is_bp(h,l,DNA=DNA,Wobble=Wobble)<2:
if self.V[i][j] == self._F2(i,j,h,l,DNA=DNA,Wobble=Wobble) + self.V[h][l]:
stack.append((h,l))
flag=1
break
if flag: break
else:
for k in range(i,j):
if self.W[i][k] + self.W[k+1][j] == self.W[i][j]:
stack.append((k+1,j))
stack.append((i,k))
break
#=== Print ===
S = list(" " * self.N)
for i,j in bp:
S[i] = "("; S[j] = ")"
print(self.arr)
print("".join(S))
In [8]:
Sample = "AGGAACCCCCUUGGU"
In [9]:
model = Zuker(Sample,a,b,c,Hairpin_score,stacking_score,Internal_score,Buldge_score,inf)
In [10]:
model.fit(DNA=False)
In [11]:
print(np.where(model.W==inf, 0, model.W))
In [12]:
model.trackback(DNA=False)
↑ This is different from the result of Nussinov algorithm. I think it is because of the definition of free energy (or my coding skills.)
In [ ]: