Problem Setting
Identify common sequential features (sequence motifs) from a large amount of sequence data containing TF (transcription factor)-binding regions.
DeepBind¶
- Link:
- "Deep Learning" techniques to ascertain the sequence specificities of DNA- and RNA-binding proteins from experimental data.
- outperforms other state-of-the-art methods. (Published: 27 July 2015)
- Results are readily visualized as
- a weighted ensemble of position weight matrices (PWMs)
- a mutation map that indicates how variations affect binding within a specific sequence.
- computes a binding score $f(s)$ using four stages (sketched in code below):
$$f(s) = \text{Neural Network}_W\left(\text{pool}\left(\text{rect}_b\left(\text{conv}_M(s)\right)\right)\right)$$
- Convolution: Scan a set of motif detectors (length=$L$) with parameters $M^{(j)}$ along the sequence $$\text{conv}_M(s)_{ij} = \sum_{l=0}^{L-1}\sum_{k=0}^{K-1}M^{(j)}_{l,k}\,s_{i+l,\,k}$$ where $K=4$ is the number of base types.
- Rectification: Shift each response by a threshold $b$ and clamp negative values to zero, isolating positions with a good pattern match $$\text{rect}_b(x_{ij}) =\begin{cases} x_{ij} + b & (\text{if }x_{ij} + b>0)\\0 & (\text{otherwise})\end{cases}$$
- Pooling: Compute the maximum and average of each $\text{rect}_b\left(\cdot\right)$ across the sequence.
- Maximizing: identify the presence of longer motifs.
- Averaging: identify cumulative effects of short motifs $$\text{pool}_{\alpha}(x_{ij}) = \alpha\max_i\left(x_{ij}\right) + \left(1-\alpha\right)\underset{i}{\text{ave}}\left(x_{ij}\right)$$
- Neural Network: Combines the responses to produce a score.
(Overview figure from the DeepBind paper) a: Five independent sequences being processed in parallel by a single DeepBind model. b: The calibration, training, and testing procedure.
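To make the four stages concrete, here is a minimal NumPy sketch with made-up toy weights (the sizes and variable names below are illustrative assumptions, not values from the paper):
import numpy as np

rng = np.random.RandomState(0)
n, K, L, d = 20, 4, 5, 1                      # sequence length, base types, motif length, detectors
s = np.eye(K)[rng.randint(K, size=n)]         # one-hot sequence, shape=(n, K)
M = rng.uniform(size=(d, L, K))               # motif detector parameters M^(j)
b = np.zeros(d)                               # rectification thresholds
W = rng.uniform(size=(d, 1))                  # weights of the final (here purely linear) network
alpha = 0.5                                   # max/average pooling mixture

# Convolution: conv_M(s)_{ij} = sum_{l,k} M^(j)_{l,k} * s_{i+l,k}
conv = np.array([[np.sum(M[j] * s[i:i+L]) for j in range(d)] for i in range(n - L + 1)])
rect = np.maximum(conv + b, 0.0)              # Rectification: clamp negative responses to zero
pool = alpha * rect.max(axis=0) + (1 - alpha) * rect.mean(axis=0)  # Pooling: mix of max and average
f_s = (pool @ W).item()                       # Neural network stage: here a single linear unit
print(f"binding score f(s) = {f_s:.3f}")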
Implementation¶
Generate Datasets¶
import numpy as np
from kerasy.utils import generateSeq_embedded_Motif
# Types of bases (DNA)
nuc = ["A", "C", "G", "T"]
num_class = len(nuc)
# One-hot code for each base, e.g. "A" -> [1, 0, 0, 0]
nuc2code = dict(zip(nuc, np.identity(num_class, dtype=np.int8)))
nuc2code
motif = np.asarray([
[0.0, 1.0, 0.0, 0.0], # C
[0.5, 0.0, 0.0, 0.5], # A/T
[0.0, 0.5, 0.5, 0.0], # C/G
[0.0, 0.0, 1.0, 0.0], # G
[0.0, 0.0, 0.0, 1.0], # T
])
len_motif, num_base_types = motif.shape
import matplotlib.pyplot as plt
import seaborn as sns
def visMotif_std(motif, name, ax=None):
    """Visualize a motif matrix as a heatmap after min-max normalization."""
    motif_std = (motif - motif.min()) / (motif.max() - motif.min())
    if ax is None:
        fig, ax = plt.subplots(figsize=(4,5))
    sns.heatmap(motif_std, cbar=False, cmap="binary", ax=ax, fmt=".3f", annot=True)
    ax.set_xticklabels(nuc), ax.set_ylabel("Motif position", fontsize=14)
    ax.set_title(name, fontsize=18)
    return ax
visMotif_std(motif, name="True")
plt.show()
len_sequence = 20
n_pos_train = 5000
n_neg_train = 5000
n_pos_test = 500
n_neg_test = 500
seed_train = 2020
seed_test = seed_train+1
# Although DeepBind can be trained and tested on datasets containing varying-length sequences,
# here, we fix the length for simplicity
x_train, y_train = generateSeq_embedded_Motif(n_pos_train, n_neg_train, len_sequence, motif, nuc, seed=seed_train)
x_test, y_test = generateSeq_embedded_Motif(n_pos_test, n_neg_test, len_sequence, motif, nuc, seed=seed_test)
x_train = np.asarray([np.vstack([nuc2code[x] for x in x_seq]) for x_seq in x_train])
x_test = np.asarray([np.vstack([nuc2code[x] for x in x_seq]) for x_seq in x_test])
y_train = np.asarray(y_train).reshape(-1,1)
y_test = np.asarray(y_test).reshape(-1,1)
print(f"x_train.shape = {x_train.shape}")
print(f"y_train.shape = {y_train.shape}")
print(f"x_test.shape = {x_test.shape}")
print(f"y_test.shape = {y_test.shape}")
Building the Networks¶
import keras
from keras.models import Sequential
from keras.layers import Activation
from keras.layers import Conv1D, GlobalAveragePooling1D
from keras import backend as K
# Only a single candidate motif (filter) is considered
num_cand_motif = 1
seed_weight = seed_train+seed_test
batch_size = 32
epochs = 1000
def prob_reg(motif_matrix):
    """
    Regularizer encouraging the weights at each motif position to sum to similar (PWM-like) values.
    @params motif_matrix : shape=(len_motif, num_base_types, num_cand_motif)
    """
    # Sum over base types and filters -> shape=(len_motif,)
    probs = K.sum(motif_matrix, axis=(1,-1))
    mean = K.mean(probs)
    return 0.001 * K.sqrt(K.sum(K.square(probs-mean)))
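# Quick sanity check (illustration only, not in the original notebook): each position of the
# true motif sums to 1, so the per-position sums are identical and the penalty is exactly 0.
_row_sums = motif.sum(axis=1)                                         # shape=(len_motif,)
print(0.001 * np.sqrt(np.sum((_row_sums - _row_sums.mean()) ** 2)))   # -> 0.0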
# Fix the initial kernel weights for reproducibility.
initial_kernel_weight = np.random.RandomState(seed_weight).uniform(size=(len_motif, num_base_types, num_cand_motif)).astype(np.float32)
# initial_bias_weight = np.zeros(shape=1, dtype=np.float32)
initial_conv_weights = [initial_kernel_weight] #, initial_bias_weight]
fig, (ax_true, ax_ini) = plt.subplots(1,2,figsize=(8,5), sharey=True)
ax_true = visMotif_std(motif, name="True", ax=ax_true)
ax_ini = visMotif_std(initial_kernel_weight[:,:,0], name="initial", ax=ax_ini)
plt.tight_layout()
plt.show()
Simple Model¶
# If a motif is present, the model should output a high value.
#
# [Positive] [Negative]
# ...... ========= .. ....................
# ...... | Motif | .. ....................
# ...... ========= .. ....................
# \\\\ //// Convolution \\\\ ////
# ____----___ ____________
# | |
# | Global Average Pooling |
# v v
# High value (Positive!!) Low value (Negative!!)
model_simple = Sequential()
model_simple.add(Conv1D(
    filters=num_cand_motif,
    kernel_size=len_motif,
    kernel_initializer="TruncatedNormal",
    kernel_regularizer=prob_reg,
    input_shape=(len_sequence, num_base_types),
    activation="relu",
    padding="valid",
    use_bias=False
))
model_simple.add(GlobalAveragePooling1D())
model_simple.add(Activation("sigmoid"))
# Initialize to the same weights.
model_simple.layers[0].set_weights(initial_conv_weights)
model_simple.summary()
Deeper Model¶
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv1D, MaxPool1D
model_deeper = Sequential()
model_deeper.add(Conv1D(
    filters=num_cand_motif,
    kernel_size=len_motif,
    kernel_initializer="TruncatedNormal",
    input_shape=(len_sequence, num_base_types),
    activation="relu",
    padding="valid",
    use_bias=False
))
model_deeper.add(Conv1D(
filters=1,
kernel_size=2,
kernel_initializer="TruncatedNormal",
bias_initializer="zero",
activation="relu",
padding="valid"
))
model_deeper.add(MaxPool1D(pool_size=2))
model_deeper.add(Dropout(0.5))
model_deeper.add(Flatten())
model_deeper.add(Dense(4, activation="relu"))
model_deeper.add(Dropout(0.5))
model_deeper.add(Dense(1, activation="sigmoid"))
# Initialize to the same weights.
model_deeper.layers[0].set_weights(initial_conv_weights)
model_deeper.summary()
Training¶
loss = keras.losses.binary_crossentropy
optimizer = keras.optimizers.SGD()
# es = keras.callbacks.EarlyStopping(monitor='val_loss', patience=int(epochs*0.3), verbose=0, mode='auto')
# ms = keras.callbacks.ModelCheckpoint(filepath="{epoch}epochs.h5", monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=True, mode='auto', period=1)
# callbacks = [es, ms]
model_simple.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])
model_deeper.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])
history_simple = model_simple.fit(
x_train, y_train,
batch_size=batch_size,
epochs=epochs,
verbose=0,
# callbacks=callbacks,
validation_data=(x_test, y_test)
)
history_deeper = model_deeper.fit(
x_train, y_train,
batch_size=batch_size,
epochs=epochs,
verbose=0,
# callbacks=callbacks,
validation_data=(x_test, y_test)
)
Results¶
def plot_hist(history, axes, name):
epochs = history.epoch
val_loss = history.history.get("val_loss")
val_acc = history.history.get("val_acc")
loss = history.history.get("loss")
acc = history.history.get("acc")
ax_acc, ax_loss = axes
ax_acc.plot(epochs, acc, "r" , label = "training acc")
ax_acc.plot(epochs, val_acc, "b" , label = "validation acc")
ax_acc.set_title(f"Training and Validation acc ({name})")
ax_acc.set_xlabel("Epochs"), ax_acc.set_ylabel("Accuracy")
ax_acc.legend()
# Loss Plot
ax_loss.plot(epochs, loss, "r" , label = "training loss")
ax_loss.plot(epochs, val_loss, "b" , label = "validation loss")
ax_loss.set_title(f"Training and Validation loss ({name})")
ax_loss.set_xlabel("Epochs"), ax_loss.set_ylabel("Loss")
ax_loss.legend()
return (ax_acc, ax_loss)
fig, (ax_simple, ax_deeper) = plt.subplots(2,2,figsize=(18,12), sharex=True)
ax_simple = plot_hist(history_simple, ax_simple, "simple")
ax_deeper = plot_hist(history_deeper, ax_deeper, "deeper")
Evaluation¶
def getKernel(model):
conv_layer = [l for l in model.layers if l.name.startswith("conv1d")][0]
conv_weight = conv_layer.get_weights()[0][:,:,0]
return conv_weight
fig, ((ax_true, ax_ini),(ax_simple, ax_deep)) = plt.subplots(2,2,figsize=(8,10), sharex="all", sharey="all")
ax_true = visMotif_std(motif, name="True", ax=ax_true)
ax_ini = visMotif_std(initial_kernel_weight[:,:,0], name="initial", ax=ax_ini)
ax_simple = visMotif_std(getKernel(model_simple), name="simple", ax=ax_simple)
ax_deep = visMotif_std(getKernel(model_deeper), name="deeper", ax=ax_deep)
plt.tight_layout()
def print_score(model, x_test, y_test, data_type="Test"):
loss,acc = model.evaluate(x_test, y_test, verbose=0)
print(f"\t- {data_type} loss : {loss:.3f}")
print(f"\t- {data_type} accuracy: {acc*100:.1f}%")
print("- Model Simple")
print_score(model_simple, x_test, y_test)
print("- Model Deeper")
print_score(model_deeper, x_test, y_test)
The simple model trained for more epochs, this time without the kernel regularizer and with use_bias=True
fig, ax_simple = plt.subplots(1,2,figsize=(18,6))
ax_simple = plot_hist(history_simple, ax_simple, "simple")
plt.tight_layout()
plt.show()
model_simple.load_weights("simple_model_30000epochs.h5")
model_simple.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])
print("- Model Simple")
print_score(model_simple, x_train, y_train, data_type="Train")
print_score(model_simple, x_test, y_test)
fig, (ax_true, ax_simple) = plt.subplots(1,2,figsize=(8,5), sharey=True)
ax_true = visMotif_std(motif, name="True", ax=ax_true)
ax_simple = visMotif_std(getKernel(model_simple), name="simple", ax=ax_simple)
plt.tight_layout()
plt.show()
The motif is well captured!!
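Since the paper also visualizes results as mutation maps, here is a rough sketch (an illustration added here, not part of the original notebook) of computing one for a single test sequence with the trained model; the variable names are arbitrary, and the change in the sigmoid output is used as a stand-in for the change in binding score:
seq = x_test[0]                                      # one one-hot encoded test sequence
base_score = model_simple.predict(seq[None])[0, 0]
mutation_map = np.zeros((len_sequence, num_class))
for i in range(len_sequence):
    for k in range(num_class):
        mutated = seq.copy()
        mutated[i] = np.identity(num_class, dtype=np.int8)[k]   # substitute base k at position i
        mutation_map[i, k] = model_simple.predict(mutated[None])[0, 0] - base_score
fig, ax = plt.subplots(figsize=(4, 5))
sns.heatmap(mutation_map, cmap="coolwarm", center=0, ax=ax, xticklabels=nuc)
ax.set_ylabel("Sequence position"), ax.set_title("Mutation map (sketch)")
plt.show()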
Consideration¶
In this case, the data was simple and noise-free, so the motif could be captured by the simple model alone, without adding further layers.
Future Work: DeeperBind¶
As described in the paper "DeeperBind: Enhancing Prediction of Sequence Specificities of DNA Binding Proteins", DeepBind was the first deep convolutional method designed to accurately characterize the motifs of protein targets. Despite its name, it employs only one convolution layer followed by non-linear thresholding, a max-pooling layer, and one or two fully connected layers to estimate the intensity of input probes. Its performance can therefore be expected to improve further with additional deep learning techniques such as RNNs, attention, or metric learning.
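As a rough illustration of this direction (a sketch in the same Keras style as above, not the published DeeperBind architecture; the layer sizes are arbitrary assumptions), a recurrent layer could be stacked on top of the convolutional motif scanner:
from keras.layers import LSTM
model_recurrent = Sequential()
model_recurrent.add(Conv1D(
    filters=num_cand_motif,
    kernel_size=len_motif,
    kernel_initializer="TruncatedNormal",
    input_shape=(len_sequence, num_base_types),
    activation="relu",
    padding="valid"
))
model_recurrent.add(LSTM(8))                         # summarize the positional motif responses sequentially
model_recurrent.add(Dense(1, activation="sigmoid"))
model_recurrent.summary()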