# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys
import numpy as np
import math
import copy

import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.nn import (Conv2D, BatchNorm2D, Linear, Dropout)
from paddle.nn.initializer import Constant, Normal

from ...utils.save_load import load_ckpt
from ..registry import BACKBONES
from ..weight_init import weight_init_

ACT2FN = {"gelu": F.gelu, "relu": F.relu, "swish": F.swish}


class BertEmbeddings(nn.Layer):
    """Construct the embeddings from word, position and token_type embeddings.
    """
    def __init__(self, vocab_size, max_position_embeddings, type_vocab_size,
                 hidden_size, hidden_dropout_prob):
        super(BertEmbeddings, self).__init__()
        self.word_embeddings = nn.Embedding(vocab_size,
                                            hidden_size,
                                            padding_idx=0)
        self.position_embeddings = nn.Embedding(max_position_embeddings,
                                                hidden_size)
        self.token_type_embeddings = nn.Embedding(type_vocab_size, hidden_size)

        self.LayerNorm = nn.LayerNorm(hidden_size, epsilon=1e-12)
        self.dropout = nn.Dropout(hidden_dropout_prob)

    def forward(self, input_ids, token_type_ids=None):
        seq_length = input_ids.shape[1]
        position_ids = paddle.arange(end=seq_length, dtype="int64")
        position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
        if token_type_ids is None:
            token_type_ids = paddle.zeros_like(input_ids)

        words_embeddings = self.word_embeddings(input_ids)  # 8,36 -> 8,36,768
        position_embeddings = self.position_embeddings(
            position_ids)  # 8,36 -> 8,36,768
        token_type_embeddings = self.token_type_embeddings(
            token_type_ids)  # 8,36 -> 8,36,768

        embeddings = words_embeddings + position_embeddings + token_type_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings


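# A minimal shape sketch for BertEmbeddings (illustrative only; the batch size and
# sequence length below are example values, not requirements of the layer):
#
#     emb = BertEmbeddings(vocab_size=30522, max_position_embeddings=512,
#                          type_vocab_size=2, hidden_size=768,
#                          hidden_dropout_prob=0.1)
#     ids = paddle.randint(0, 30522, shape=[8, 36])   # [batch_size, seq_length]
#     out = emb(ids)                                   # [8, 36, 768]

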
class BertImageEmbeddings(nn.Layer):
    def __init__(self, v_feature_size, v_hidden_size, v_hidden_dropout_prob):
        super(BertImageEmbeddings, self).__init__()
        self.image_embeddings = nn.Linear(v_feature_size, v_hidden_size)
        self.image_location_embeddings = nn.Linear(5, v_hidden_size)
        self.LayerNorm = nn.LayerNorm(v_hidden_size, epsilon=1e-12)
        self.dropout = nn.Dropout(v_hidden_dropout_prob)

    def forward(self, input_ids, input_loc):
        img_embeddings = self.image_embeddings(
            input_ids)  # 8,37,2048 -> 8,37,1024
        loc_embeddings = self.image_location_embeddings(
            input_loc)  # 8,37,5 -> 8,37,1024
        embeddings = self.LayerNorm(img_embeddings + loc_embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings  # shape: bs * seq_len * hs


class BertActionEmbeddings(nn.Layer):
    def __init__(self, a_feature_size, a_hidden_size, a_hidden_dropout_prob):
        super(BertActionEmbeddings, self).__init__()
        self.action_embeddings = nn.Linear(a_feature_size, a_hidden_size)
        self.LayerNorm = nn.LayerNorm(a_hidden_size, epsilon=1e-12)
        self.dropout = nn.Dropout(a_hidden_dropout_prob)

    def forward(self, input_ids):
        action_embeddings = self.action_embeddings(
            input_ids)  # 8,5,2048 -> 8,5,768
        embeddings = self.LayerNorm(action_embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings


class BertSelfAttention(nn.Layer):
    def __init__(self, hidden_size, num_attention_heads,
                 attention_probs_dropout_prob):
        super(BertSelfAttention, self).__init__()
        if hidden_size % num_attention_heads != 0:
            raise ValueError(
                "The hidden size (%d) is not a multiple of the number of attention "
                "heads (%d)" % (hidden_size, num_attention_heads))
        self.num_attention_heads = num_attention_heads
        self.attention_head_size = int(hidden_size / num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(hidden_size, self.all_head_size)
        self.key = nn.Linear(hidden_size, self.all_head_size)
        self.value = nn.Linear(hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(attention_probs_dropout_prob)

    def transpose_for_scores(self, x):
        new_x_shape = x.shape[:-1] + [
            self.num_attention_heads,
            self.attention_head_size,
        ]
        x = x.reshape(new_x_shape)
        return x.transpose((0, 2, 1, 3))

    def forward(self, hidden_states, attention_mask):
        mixed_query_layer = self.query(hidden_states)
        mixed_key_layer = self.key(hidden_states)
        mixed_value_layer = self.value(hidden_states)

        query_layer = self.transpose_for_scores(mixed_query_layer)
        key_layer = self.transpose_for_scores(mixed_key_layer)
        value_layer = self.transpose_for_scores(mixed_value_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = paddle.matmul(query_layer,
                                         key_layer.transpose((0, 1, 3, 2)))
        attention_scores = attention_scores / math.sqrt(
            self.attention_head_size)
        # Apply the attention mask (precomputed for all layers in the BertModel forward() function).
        attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = nn.Softmax(axis=-1)(attention_scores)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        context_layer = paddle.matmul(attention_probs, value_layer)
        context_layer = context_layer.transpose((0, 2, 1, 3))
        new_context_layer_shape = context_layer.shape[:-2] + [
            self.all_head_size
        ]
        context_layer = context_layer.reshape(new_context_layer_shape)

        return context_layer, attention_probs


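# A small numeric sketch of the additive attention mask used above (values are
# illustrative): a masked position carries -10000.0, so after the softmax its
# attention probability is effectively zero.
#
#     scores = paddle.to_tensor([[2.0, 1.0, 0.5]])
#     mask = paddle.to_tensor([[0.0, 0.0, -10000.0]])   # last position is padding
#     probs = F.softmax(scores + mask, axis=-1)         # ~[0.73, 0.27, 0.00]

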
class BertSelfOutput(nn.Layer):
    def __init__(self, hidden_size, hidden_dropout_prob):
        super(BertSelfOutput, self).__init__()
        self.dense = nn.Linear(hidden_size, hidden_size)
        self.LayerNorm = nn.LayerNorm(hidden_size, epsilon=1e-12)
        self.dropout = nn.Dropout(hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


class BertAttention(nn.Layer):
    def __init__(self, hidden_size, hidden_dropout_prob, num_attention_heads,
                 attention_probs_dropout_prob):
        super(BertAttention, self).__init__()
        self.self = BertSelfAttention(hidden_size, num_attention_heads,
                                      attention_probs_dropout_prob)
        self.output = BertSelfOutput(hidden_size, hidden_dropout_prob)

    def forward(self, input_tensor, attention_mask):
        self_output, attention_probs = self.self(input_tensor, attention_mask)
        attention_output = self.output(self_output, input_tensor)
        return attention_output, attention_probs


class BertIntermediate(nn.Layer):
    def __init__(self, hidden_size, intermediate_size, hidden_act):
        super(BertIntermediate, self).__init__()
        self.dense = nn.Linear(hidden_size, intermediate_size)
        if isinstance(hidden_act, str):
            self.intermediate_act_fn = ACT2FN[hidden_act]
        else:
            self.intermediate_act_fn = hidden_act

    def forward(self, hidden_states):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


class BertOutput(nn.Layer):
    def __init__(self, intermediate_size, hidden_size, hidden_dropout_prob):
        super(BertOutput, self).__init__()
        self.dense = nn.Linear(intermediate_size, hidden_size)
        self.LayerNorm = nn.LayerNorm(hidden_size, epsilon=1e-12)
        self.dropout = nn.Dropout(hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


class BertEntAttention(nn.Layer):
    """Core module of the tangled transformer.
    """
    def __init__(
        self,
        hidden_size,
        v_hidden_size,
        a_hidden_size,
        bi_hidden_size,
        attention_probs_dropout_prob,
        v_attention_probs_dropout_prob,
        a_attention_probs_dropout_prob,
        av_attention_probs_dropout_prob,
        at_attention_probs_dropout_prob,
        bi_num_attention_heads,
    ):
        super(BertEntAttention, self).__init__()
        if bi_hidden_size % bi_num_attention_heads != 0:
            raise ValueError(
                "The hidden size (%d) is not a multiple of the number of attention "
                "heads (%d)" % (bi_hidden_size, bi_num_attention_heads))

        self.num_attention_heads = bi_num_attention_heads
        self.attention_head_size = int(bi_hidden_size / bi_num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        # self attention layers for vision input
        self.query1 = nn.Linear(v_hidden_size, self.all_head_size)
        self.key1 = nn.Linear(v_hidden_size, self.all_head_size)
        self.value1 = nn.Linear(v_hidden_size, self.all_head_size)
        self.dropout1 = nn.Dropout(v_attention_probs_dropout_prob)

        # self attention layers for text input
        self.query2 = nn.Linear(hidden_size, self.all_head_size)
        self.key2 = nn.Linear(hidden_size, self.all_head_size)
        self.value2 = nn.Linear(hidden_size, self.all_head_size)
        self.dropout2 = nn.Dropout(attention_probs_dropout_prob)

        # self attention layers for action input
        self.query3 = nn.Linear(a_hidden_size, self.all_head_size)
        self.key3 = nn.Linear(a_hidden_size, self.all_head_size)
        self.value3 = nn.Linear(a_hidden_size, self.all_head_size)
        self.dropout3 = nn.Dropout(a_attention_probs_dropout_prob)

        # cross attention projections for the action-text context
        self.key_at = nn.Linear(bi_hidden_size, self.all_head_size)
        self.value_at = nn.Linear(bi_hidden_size, self.all_head_size)
        self.dropout_at = nn.Dropout(at_attention_probs_dropout_prob)

        # cross attention projections for the action-vision context
        self.key_av = nn.Linear(bi_hidden_size, self.all_head_size)
        self.value_av = nn.Linear(bi_hidden_size, self.all_head_size)
        self.dropout_av = nn.Dropout(av_attention_probs_dropout_prob)

    def transpose_for_scores(self, x):
        new_x_shape = x.shape[:-1] + [
            self.num_attention_heads,
            self.attention_head_size,
        ]
        x = x.reshape(new_x_shape)
        return x.transpose((0, 2, 1, 3))

    def forward(
        self,
        input_tensor1,
        attention_mask1,
        input_tensor2,
        attention_mask2,
        input_tensor3,
        attention_mask3,
    ):

        # for vision input.
        mixed_query_layer1 = self.query1(input_tensor1)
        mixed_key_layer1 = self.key1(input_tensor1)
        mixed_value_layer1 = self.value1(input_tensor1)

        query_layer1 = self.transpose_for_scores(mixed_query_layer1)
        key_layer1 = self.transpose_for_scores(mixed_key_layer1)
        value_layer1 = self.transpose_for_scores(mixed_value_layer1)

        # for text input:
        mixed_query_layer2 = self.query2(input_tensor2)
        mixed_key_layer2 = self.key2(input_tensor2)
        mixed_value_layer2 = self.value2(input_tensor2)

        query_layer2 = self.transpose_for_scores(mixed_query_layer2)
        key_layer2 = self.transpose_for_scores(mixed_key_layer2)
        value_layer2 = self.transpose_for_scores(mixed_value_layer2)

        # for action input:
        mixed_query_layer3 = self.query3(input_tensor3)
        mixed_key_layer3 = self.key3(input_tensor3)
        mixed_value_layer3 = self.value3(input_tensor3)

        query_layer3 = self.transpose_for_scores(mixed_query_layer3)
        key_layer3 = self.transpose_for_scores(mixed_key_layer3)
        value_layer3 = self.transpose_for_scores(mixed_value_layer3)

        def do_attention(query_layer, key_layer, value_layer, attention_mask,
                         dropout):
            """Compute scaled dot-product attention."""
            attention_scores = paddle.matmul(query_layer,
                                             key_layer.transpose((0, 1, 3, 2)))
            attention_scores = attention_scores / math.sqrt(
                self.attention_head_size)
            attention_scores = attention_scores + attention_mask

            # Normalize the attention scores to probabilities.
            attention_probs = nn.Softmax(axis=-1)(attention_scores)

            # This is actually dropping out entire tokens to attend to, which might
            # seem a bit unusual, but is taken from the original Transformer paper.
            attention_probs = dropout(attention_probs)

            context_layer = paddle.matmul(attention_probs, value_layer)
            context_layer = context_layer.transpose((0, 2, 1, 3))
            new_context_layer_shape = context_layer.shape[:-2] + [
                self.all_head_size
            ]
            context_layer = context_layer.reshape(new_context_layer_shape)
            return context_layer

        context_av = do_attention(query_layer3, key_layer1, value_layer1,
                                  attention_mask1, self.dropout_av)
        context_at = do_attention(query_layer3, key_layer2, value_layer2,
                                  attention_mask2, self.dropout_at)

        context_key_av = self.key_av(context_av).transpose((0, 2, 1))
        # F.interpolate only supports 4-D tensors here, so add and later remove a trailing dim.
        context_key_av = F.interpolate(context_key_av.unsqueeze(-1),
                                       size=(key_layer2.shape[2],
                                             1)).squeeze(-1)
        context_key_av = self.transpose_for_scores(
            context_key_av.transpose((0, 2, 1)))
        key_layer2 = key_layer2 + context_key_av

        context_key_at = self.key_at(context_at).transpose((0, 2, 1))
        context_key_at = F.interpolate(context_key_at.unsqueeze(-1),
                                       size=(key_layer1.shape[2],
                                             1)).squeeze(-1)
        context_key_at = self.transpose_for_scores(
            context_key_at.transpose((0, 2, 1)))
        key_layer1 = key_layer1 + context_key_at

        context_val_av = self.value_av(context_av).transpose((0, 2, 1))
        context_val_av = F.interpolate(context_val_av.unsqueeze(-1),
                                       size=(value_layer2.shape[2],
                                             1)).squeeze(-1)
        context_val_av = self.transpose_for_scores(
            context_val_av.transpose((0, 2, 1)))
        value_layer2 = value_layer2 + context_val_av

        context_val_at = self.value_at(context_at).transpose((0, 2, 1))
        context_val_at = F.interpolate(context_val_at.unsqueeze(-1),
                                       size=(value_layer1.shape[2],
                                             1)).squeeze(-1)
        context_val_at = self.transpose_for_scores(
            context_val_at.transpose((0, 2, 1)))
        value_layer1 = value_layer1 + context_val_at

        context_layer1 = do_attention(query_layer1, key_layer1, value_layer1,
                                      attention_mask1, self.dropout1)
        context_layer2 = do_attention(query_layer2, key_layer2, value_layer2,
                                      attention_mask2, self.dropout2)
        context_layer3 = do_attention(query_layer3, key_layer3, value_layer3,
                                      attention_mask3, self.dropout3)

        return context_layer1, context_layer2, context_layer3  # vision, text, action


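# Data flow of BertEntAttention in brief: the action queries first attend over the
# vision and text streams (context_av, context_at); those summaries are resized along
# the sequence axis with F.interpolate and added to the text/vision keys and values;
# finally each stream runs its own self-attention with the enriched keys and values.
# With the default configuration used further below (bi_hidden_size=1024,
# bi_num_attention_heads=8), the per-head tensors have these illustrative shapes:
#
#     vision: [8, 8, 37, 128]    text: [8, 8, 36, 128]    action: [8, 8, 5, 128]

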
class BertEntOutput(nn.Layer):
    def __init__(
        self,
        bi_hidden_size,
        hidden_size,
        v_hidden_size,
        v_hidden_dropout_prob,
        hidden_dropout_prob,
    ):
        super(BertEntOutput, self).__init__()

        self.dense1 = nn.Linear(bi_hidden_size, v_hidden_size)
        self.LayerNorm1 = nn.LayerNorm(v_hidden_size, epsilon=1e-12)
        self.dropout1 = nn.Dropout(v_hidden_dropout_prob)

        self.dense2 = nn.Linear(bi_hidden_size, hidden_size)
        self.LayerNorm2 = nn.LayerNorm(hidden_size, epsilon=1e-12)
        self.dropout2 = nn.Dropout(hidden_dropout_prob)

        self.dense3 = nn.Linear(bi_hidden_size, hidden_size)
        self.LayerNorm3 = nn.LayerNorm(hidden_size, epsilon=1e-12)
        self.dropout3 = nn.Dropout(hidden_dropout_prob)

    def forward(
        self,
        hidden_states1,
        input_tensor1,
        hidden_states2,
        input_tensor2,
        hidden_states3,
        input_tensor3,
    ):
        context_state1 = self.dense1(hidden_states1)
        context_state1 = self.dropout1(context_state1)

        context_state2 = self.dense2(hidden_states2)
        context_state2 = self.dropout2(context_state2)

        context_state3 = self.dense3(hidden_states3)
        context_state3 = self.dropout3(context_state3)

        hidden_states1 = self.LayerNorm1(context_state1 + input_tensor1)
        hidden_states2 = self.LayerNorm2(context_state2 + input_tensor2)
        hidden_states3 = self.LayerNorm3(context_state3 + input_tensor3)

        return hidden_states1, hidden_states2, hidden_states3


class BertLayer(nn.Layer):
    def __init__(self, hidden_size, intermediate_size, hidden_act,
                 hidden_dropout_prob, num_attention_heads,
                 attention_probs_dropout_prob):
        super(BertLayer, self).__init__()
        self.attention = BertAttention(hidden_size, hidden_dropout_prob,
                                       num_attention_heads,
                                       attention_probs_dropout_prob)
        self.intermediate = BertIntermediate(hidden_size, intermediate_size,
                                             hidden_act)
        self.output = BertOutput(intermediate_size, hidden_size,
                                 hidden_dropout_prob)

    def forward(self, hidden_states, attention_mask):
        attention_output, attention_probs = self.attention(
            hidden_states, attention_mask)
        intermediate_output = self.intermediate(attention_output)
        layer_output = self.output(intermediate_output, attention_output)
        return layer_output, attention_probs


class BertConnectionLayer(nn.Layer):
    def __init__(self, hidden_size, v_hidden_size, a_hidden_size,
                 bi_hidden_size, bi_num_attention_heads,
                 attention_probs_dropout_prob, v_attention_probs_dropout_prob,
                 a_attention_probs_dropout_prob,
                 av_attention_probs_dropout_prob,
                 at_attention_probs_dropout_prob, intermediate_size,
                 v_intermediate_size, a_intermediate_size, hidden_act,
                 v_hidden_act, a_hidden_act, hidden_dropout_prob,
                 v_hidden_dropout_prob, a_hidden_dropout_prob):
        super(BertConnectionLayer, self).__init__()
        self.ent_attention = BertEntAttention(
            hidden_size,
            v_hidden_size,
            a_hidden_size,
            bi_hidden_size,
            attention_probs_dropout_prob,
            v_attention_probs_dropout_prob,
            a_attention_probs_dropout_prob,
            av_attention_probs_dropout_prob,
            at_attention_probs_dropout_prob,
            bi_num_attention_heads,
        )

        self.ent_output = BertEntOutput(
            bi_hidden_size,
            hidden_size,
            v_hidden_size,
            v_hidden_dropout_prob,
            hidden_dropout_prob,
        )

        self.v_intermediate = BertIntermediate(v_hidden_size,
                                               v_intermediate_size,
                                               v_hidden_act)
        self.v_output = BertOutput(v_intermediate_size, v_hidden_size,
                                   v_hidden_dropout_prob)

        self.t_intermediate = BertIntermediate(hidden_size, intermediate_size,
                                               hidden_act)
        self.t_output = BertOutput(intermediate_size, hidden_size,
                                   hidden_dropout_prob)

        self.a_intermediate = BertIntermediate(a_hidden_size,
                                               a_intermediate_size,
                                               a_hidden_act)
        self.a_output = BertOutput(a_intermediate_size, a_hidden_size,
                                   a_hidden_dropout_prob)

    def forward(
        self,
        input_tensor1,
        attention_mask1,
        input_tensor2,
        attention_mask2,
        input_tensor3,
        attention_mask3,
    ):

        ent_output1, ent_output2, ent_output3 = self.ent_attention(
            input_tensor1, attention_mask1, input_tensor2, attention_mask2,
            input_tensor3, attention_mask3)

        attention_output1, attention_output2, attention_output3 = self.ent_output(
            ent_output1, input_tensor1, ent_output2, input_tensor2, ent_output3,
            input_tensor3)

        intermediate_output1 = self.v_intermediate(attention_output1)
        layer_output1 = self.v_output(intermediate_output1, attention_output1)

        intermediate_output2 = self.t_intermediate(attention_output2)
        layer_output2 = self.t_output(intermediate_output2, attention_output2)

        intermediate_output3 = self.a_intermediate(attention_output3)
        layer_output3 = self.a_output(intermediate_output3, attention_output3)

        return layer_output1, layer_output2, layer_output3


class BertEncoder(nn.Layer):
    """
    ActBERT encoder: three pathways of stacked BertLayers tied together by
    BertConnectionLayers.
    """
    def __init__(
        self,
        v_ent_attention_id,
        t_ent_attention_id,
        a_ent_attention_id,
        fixed_t_layer,
        fixed_v_layer,
        hidden_size,
        v_hidden_size,
        a_hidden_size,
        bi_hidden_size,
        intermediate_size,
        v_intermediate_size,
        a_intermediate_size,
        hidden_act,
        v_hidden_act,
        a_hidden_act,
        hidden_dropout_prob,
        v_hidden_dropout_prob,
        a_hidden_dropout_prob,
        attention_probs_dropout_prob,
        v_attention_probs_dropout_prob,
        a_attention_probs_dropout_prob,
        av_attention_probs_dropout_prob,
        at_attention_probs_dropout_prob,
        num_attention_heads,
        v_num_attention_heads,
        a_num_attention_heads,
        bi_num_attention_heads,
        num_hidden_layers,
        v_num_hidden_layers,
        a_num_hidden_layers,
    ):
        super(BertEncoder, self).__init__()
        self.v_ent_attention_id = v_ent_attention_id
        self.t_ent_attention_id = t_ent_attention_id
        self.a_ent_attention_id = a_ent_attention_id
        self.fixed_t_layer = fixed_t_layer
        self.fixed_v_layer = fixed_v_layer

        layer = BertLayer(hidden_size, intermediate_size, hidden_act,
                          hidden_dropout_prob, num_attention_heads,
                          attention_probs_dropout_prob)
        v_layer = BertLayer(v_hidden_size, v_intermediate_size, v_hidden_act,
                            v_hidden_dropout_prob, v_num_attention_heads,
                            v_attention_probs_dropout_prob)
        a_layer = BertLayer(a_hidden_size, a_intermediate_size, a_hidden_act,
                            a_hidden_dropout_prob, a_num_attention_heads,
                            a_attention_probs_dropout_prob)
        connect_layer = BertConnectionLayer(
            hidden_size, v_hidden_size, a_hidden_size, bi_hidden_size,
            bi_num_attention_heads, attention_probs_dropout_prob,
            v_attention_probs_dropout_prob, a_attention_probs_dropout_prob,
            av_attention_probs_dropout_prob, at_attention_probs_dropout_prob,
            intermediate_size, v_intermediate_size, a_intermediate_size,
            hidden_act, v_hidden_act, a_hidden_act, hidden_dropout_prob,
            v_hidden_dropout_prob, a_hidden_dropout_prob)

        self.layer = nn.LayerList(
            [copy.deepcopy(layer) for _ in range(num_hidden_layers)])  # 12
        self.v_layer = nn.LayerList(
            [copy.deepcopy(v_layer) for _ in range(v_num_hidden_layers)])  # 2
        self.a_layer = nn.LayerList(
            [copy.deepcopy(a_layer) for _ in range(a_num_hidden_layers)])  # 3
        self.c_layer = nn.LayerList([
            copy.deepcopy(connect_layer)
            for _ in range(len(v_ent_attention_id))
        ])  # 2: [0, 1]

    def forward(
        self,
        txt_embedding,
        image_embedding,
        action_embedding,
        txt_attention_mask,
        image_attention_mask,
        action_attention_mask,
        output_all_encoded_layers=True,
    ):
        v_start, a_start, t_start = 0, 0, 0
        count = 0
        all_encoder_layers_t = []
        all_encoder_layers_v = []
        all_encoder_layers_a = []

        for v_layer_id, a_layer_id, t_layer_id in zip(self.v_ent_attention_id,
                                                      self.a_ent_attention_id,
                                                      self.t_ent_attention_id):
            v_end = v_layer_id
            a_end = a_layer_id
            t_end = t_layer_id

            assert self.fixed_t_layer <= t_end
            assert self.fixed_v_layer <= v_end

            ### region embedding
            # The leading vision layers are kept frozen; in both training stages
            # this loop is never entered (fixed_v_layer is 0).
            for idx in range(v_start, self.fixed_v_layer):
                with paddle.no_grad():
                    image_embedding, image_attention_probs = self.v_layer[idx](
                        image_embedding, image_attention_mask)
            v_start = self.fixed_v_layer
            for idx in range(v_start, v_end):
                image_embedding, image_attention_probs = self.v_layer[idx](
                    image_embedding, image_attention_mask)

            ### action embedding
            for idx in range(a_start, a_end):
                action_embedding, action_attention_probs = self.a_layer[idx](
                    action_embedding, action_attention_mask)

            ### text embedding
            for idx in range(t_start, self.fixed_t_layer):
                with paddle.no_grad():
                    txt_embedding, txt_attention_probs = self.layer[idx](
                        txt_embedding, txt_attention_mask)
            t_start = self.fixed_t_layer
            for idx in range(t_start, t_end):
                txt_embedding, txt_attention_probs = self.layer[idx](
                    txt_embedding, txt_attention_mask)

            image_embedding, txt_embedding, action_embedding = self.c_layer[
                count](image_embedding, image_attention_mask, txt_embedding,
                       txt_attention_mask, action_embedding,
                       action_attention_mask)

            v_start = v_end
            t_start = t_end
            a_start = a_end
            count += 1

            if output_all_encoded_layers:
                all_encoder_layers_t.append(txt_embedding)
                all_encoder_layers_v.append(image_embedding)
                all_encoder_layers_a.append(action_embedding)

        for idx in range(v_start, len(self.v_layer)):  # 1
            image_embedding, image_attention_probs = self.v_layer[idx](
                image_embedding, image_attention_mask)

        for idx in range(a_start, len(self.a_layer)):
            action_embedding, action_attention_probs = self.a_layer[idx](
                action_embedding, action_attention_mask)

        for idx in range(t_start, len(self.layer)):
            txt_embedding, txt_attention_probs = self.layer[idx](
                txt_embedding, txt_attention_mask)

        # add the end part to finish.
        if not output_all_encoded_layers:
            all_encoder_layers_t.append(txt_embedding)  # 8, 36, 768
            all_encoder_layers_v.append(image_embedding)  # 8, 37, 1024
            all_encoder_layers_a.append(action_embedding)  # 8, 5, 768

        return all_encoder_layers_t, all_encoder_layers_v, all_encoder_layers_a


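# Scheduling sketch for the default ActBERT configuration (t_ent_attention_id=[10, 11],
# v_ent_attention_id=[0, 1], a_ent_attention_id=[0, 1], no fixed layers); this only
# illustrates how the loop above interleaves the three pathways:
#
#     pass 1: text layers 0-9                               -> c_layer[0]
#     pass 2: vision layer 0, action layer 0, text layer 10 -> c_layer[1]
#     tail:   vision layer 1, action layers 1-2, text layer 11

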
class BertPooler(nn.Layer):
    """ "Pool" the model by simply taking the hidden state corresponding
    to the first token.
    """
    def __init__(self, hidden_size, bi_hidden_size):
        super(BertPooler, self).__init__()
        self.dense = nn.Linear(hidden_size, bi_hidden_size)
        self.activation = nn.ReLU()

    def forward(self, hidden_states):
        first_token_tensor = hidden_states[:, 0]  # 8, 768
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output


class BertModel(nn.Layer):
    def __init__(
        self,
        vocab_size,
        max_position_embeddings,
        type_vocab_size,
        v_feature_size,
        a_feature_size,
        num_hidden_layers,
        v_num_hidden_layers,
        a_num_hidden_layers,
        v_ent_attention_id,
        t_ent_attention_id,
        a_ent_attention_id,
        fixed_t_layer,
        fixed_v_layer,
        hidden_size,
        v_hidden_size,
        a_hidden_size,
        bi_hidden_size,
        intermediate_size,
        v_intermediate_size,
        a_intermediate_size,
        hidden_act,
        v_hidden_act,
        a_hidden_act,
        hidden_dropout_prob,
        v_hidden_dropout_prob,
        a_hidden_dropout_prob,
        attention_probs_dropout_prob,
        v_attention_probs_dropout_prob,
        a_attention_probs_dropout_prob,
        av_attention_probs_dropout_prob,
        at_attention_probs_dropout_prob,
        num_attention_heads,
        v_num_attention_heads,
        a_num_attention_heads,
        bi_num_attention_heads,
    ):
        super(BertModel, self).__init__()
        # initialize the word embedding
        self.embeddings = BertEmbeddings(vocab_size, max_position_embeddings,
                                         type_vocab_size, hidden_size,
                                         hidden_dropout_prob)
        # initialize the region embedding
        self.v_embeddings = BertImageEmbeddings(v_feature_size, v_hidden_size,
                                                v_hidden_dropout_prob)
        # initialize the action embedding
        self.a_embeddings = BertActionEmbeddings(a_feature_size, a_hidden_size,
                                                 a_hidden_dropout_prob)

        self.encoder = BertEncoder(
            v_ent_attention_id, t_ent_attention_id, a_ent_attention_id,
            fixed_t_layer, fixed_v_layer, hidden_size, v_hidden_size,
            a_hidden_size, bi_hidden_size, intermediate_size,
            v_intermediate_size, a_intermediate_size, hidden_act, v_hidden_act,
            a_hidden_act, hidden_dropout_prob, v_hidden_dropout_prob,
            a_hidden_dropout_prob, attention_probs_dropout_prob,
            v_attention_probs_dropout_prob, a_attention_probs_dropout_prob,
            av_attention_probs_dropout_prob, at_attention_probs_dropout_prob,
            num_attention_heads, v_num_attention_heads, a_num_attention_heads,
            bi_num_attention_heads, num_hidden_layers, v_num_hidden_layers,
            a_num_hidden_layers)

        self.t_pooler = BertPooler(hidden_size, bi_hidden_size)
        self.v_pooler = BertPooler(v_hidden_size, bi_hidden_size)
        self.a_pooler = BertPooler(a_hidden_size, bi_hidden_size)

    def forward(
        self,
        text_ids,
        action_feat,
        image_feat,
        image_loc,
        token_type_ids=None,
        text_mask=None,
        image_mask=None,
        action_mask=None,
        output_all_encoded_layers=False,
    ):
        """
        text_ids: input text ids. Shape: [batch_size, sequence_length]
        action_feat: input action feature. Shape: [batch_size, action_length, action_feature_dim]
        image_feat: input image feature. Shape: [batch_size, region_length, image_feature_dim]
        image_loc: input region location. Shape: [batch_size, region_length, region_location_dim]
        token_type_ids: segment ids of each video clip. Shape: [batch_size, sequence_length]
        text_mask: text mask, 1 for real tokens and 0 for padding tokens. Shape: [batch_size, sequence_length]
        image_mask: image mask, 1 for real tokens and 0 for padding tokens. Shape: [batch_size, region_length]
        action_mask: action mask, 1 for real tokens and 0 for padding tokens. Shape: [batch_size, action_length]
        output_all_encoded_layers: whether to output the features of all encoder layers. Type: bool.
        """
        if text_mask is None:
            text_mask = paddle.ones_like(text_ids)
        if token_type_ids is None:
            token_type_ids = paddle.zeros_like(text_ids)
        if image_mask is None:
            image_mask = paddle.ones(
                [image_feat.shape[0],
                 image_feat.shape[1]]).astype(text_ids.dtype)
        if action_mask is None:
            action_mask = paddle.ones(
                [action_feat.shape[0],
                 action_feat.shape[1]]).astype(text_ids.dtype)

        # We create a 4D attention mask from a 2D tensor mask.
        # Sizes are [batch_size, 1, 1, to_seq_length]
        # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length].
        extended_text_mask = text_mask.unsqueeze(1).unsqueeze(2)
        extended_image_mask = image_mask.unsqueeze(1).unsqueeze(2)
        extended_action_mask = action_mask.unsqueeze(1).unsqueeze(2)

        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
        # masked positions, this operation will create a tensor which is 0.0 for
        # positions we want to attend and -10000.0 for masked positions.
        # Since we are adding it to the raw scores before the softmax, this is
        # effectively the same as removing these entirely.
        def set_mask(extended_attention_mask):
            extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
            return extended_attention_mask

        extended_text_mask = set_mask(extended_text_mask)
        extended_image_mask = set_mask(extended_image_mask)
        extended_action_mask = set_mask(extended_action_mask)

        t_embedding_output = self.embeddings(text_ids, token_type_ids)
        v_embedding_output = self.v_embeddings(image_feat, image_loc)
        a_embedding_output = self.a_embeddings(action_feat)

        # var = [t_embedding_output, v_embedding_output, a_embedding_output]
        # import numpy as np
        # for i, item in enumerate(var):
        #     np.save('tmp/' + str(i) + '.npy', item.numpy())

        encoded_layers_t, encoded_layers_v, encoded_layers_a = self.encoder(
            t_embedding_output,
            v_embedding_output,
            a_embedding_output,
            extended_text_mask,
            extended_image_mask,
            extended_action_mask,
            output_all_encoded_layers=output_all_encoded_layers,
        )

        sequence_output_t = encoded_layers_t[-1]  # get the last item of the list
        sequence_output_v = encoded_layers_v[-1]
        sequence_output_a = encoded_layers_a[-1]

        pooled_output_t = self.t_pooler(sequence_output_t)
        pooled_output_v = self.v_pooler(sequence_output_v)
        pooled_output_a = self.a_pooler(sequence_output_a)

        if not output_all_encoded_layers:
            encoded_layers_t = encoded_layers_t[-1]
            encoded_layers_v = encoded_layers_v[-1]
            encoded_layers_a = encoded_layers_a[-1]

        return encoded_layers_t, encoded_layers_v, encoded_layers_a, \
               pooled_output_t, pooled_output_v, pooled_output_a


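# Broadcasting sketch for the extended masks built in BertModel.forward above
# (shapes are illustrative): a [batch_size, seq_length] mask becomes
# [batch_size, 1, 1, seq_length], which broadcasts against attention scores of shape
# [batch_size, num_heads, from_seq_length, to_seq_length] when the two are added.
#
#     text_mask = paddle.ones([8, 36])
#     extended = (1.0 - text_mask.unsqueeze(1).unsqueeze(2)) * -10000.0  # [8, 1, 1, 36]

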
# For Head
class BertPredictionHeadTransform(nn.Layer):
    def __init__(self, hidden_size, hidden_act):
        super(BertPredictionHeadTransform, self).__init__()
        self.dense = nn.Linear(hidden_size, hidden_size)
        if isinstance(hidden_act, str):
            self.transform_act_fn = ACT2FN[hidden_act]
        else:
            self.transform_act_fn = hidden_act
        self.LayerNorm = nn.LayerNorm(hidden_size, epsilon=1e-12)

    def forward(self, hidden_states):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.transform_act_fn(hidden_states)
        hidden_states = self.LayerNorm(hidden_states)
        return hidden_states


class BertLMPredictionHead(nn.Layer):
    def __init__(self, hidden_size, hidden_act, bert_model_embedding_weights):
        super(BertLMPredictionHead, self).__init__()
        self.transform = BertPredictionHeadTransform(hidden_size, hidden_act)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        assert bert_model_embedding_weights.shape[1] == hidden_size
        vocab_size = bert_model_embedding_weights.shape[0]

        # An alternative implementation, which would create another large parameter:
        # self.decoder = nn.Linear(hidden_size, vocab_size)  # NOTE bias default: constant 0.0
        # self.decoder.weight = self.create_parameter(
        #     shape=[hidden_size, vocab_size],
        #     default_initializer=nn.initializer.Assign(
        #         bert_model_embedding_weights.t()))  # transpose

        self.decoder_weight = bert_model_embedding_weights
        self.decoder_bias = self.create_parameter(
            shape=[vocab_size],
            dtype=bert_model_embedding_weights.dtype,
            is_bias=True)  # NOTE bias default: constant 0.0

    def forward(self, hidden_states):
        hidden_states = self.transform(hidden_states)
        hidden_states = paddle.tensor.matmul(
            hidden_states, self.decoder_weight,
            transpose_y=True) + self.decoder_bias
        return hidden_states


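# Weight-tying sketch for BertLMPredictionHead: the decoder reuses the word-embedding
# matrix, so the score of token i is a dot product with embedding row i plus a
# per-token bias. Shapes assume the defaults (hidden_size=768, vocab_size=30522) and
# the tensors are illustrative stand-ins:
#
#     hidden = paddle.randn([8, 36, 768])
#     weight = paddle.randn([30522, 768])   # word embedding table
#     bias = paddle.zeros([30522])
#     scores = paddle.matmul(hidden, weight, transpose_y=True) + bias  # [8, 36, 30522]

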
class BertImageActionPredictionHead(nn.Layer):
    def __init__(self, hidden_size, hidden_act, target_size):
        super(BertImageActionPredictionHead, self).__init__()
        self.transform = BertPredictionHeadTransform(hidden_size, hidden_act)

        self.decoder = nn.Linear(hidden_size, target_size)

    def forward(self, hidden_states):
        hidden_states = self.transform(hidden_states)
        hidden_states = self.decoder(hidden_states)
        return hidden_states


class BertPreTrainingHeads(nn.Layer):
    def __init__(self, hidden_size, v_hidden_size, a_hidden_size,
                 bi_hidden_size, hidden_act, v_hidden_act, a_hidden_act,
                 v_target_size, a_target_size, fusion_method,
                 bert_model_embedding_weights):
        super(BertPreTrainingHeads, self).__init__()
        self.predictions = BertLMPredictionHead(hidden_size, hidden_act,
                                                bert_model_embedding_weights)
        self.seq_relationship = nn.Linear(bi_hidden_size, 2)
        self.imagePredictions = BertImageActionPredictionHead(
            v_hidden_size, v_hidden_act, v_target_size)  # visual class number
        self.actionPredictions = BertImageActionPredictionHead(
            a_hidden_size, a_hidden_act, a_target_size)  # action class number
        self.fusion_method = fusion_method
        self.dropout = nn.Dropout(0.1)

    def forward(self, sequence_output_t, sequence_output_v, sequence_output_a,
                pooled_output_t, pooled_output_v, pooled_output_a):

        if self.fusion_method == 'sum':
            pooled_output = self.dropout(pooled_output_t + pooled_output_v +
                                         pooled_output_a)
        elif self.fusion_method == 'mul':
            pooled_output = self.dropout(pooled_output_t * pooled_output_v +
                                         pooled_output_a)
        else:
            raise ValueError("Unsupported fusion method: {}".format(
                self.fusion_method))

        prediction_scores_t = self.predictions(
            sequence_output_t)  # 8, 36, 30522
        seq_relationship_score = self.seq_relationship(pooled_output)  # 8, 2
        prediction_scores_v = self.imagePredictions(
            sequence_output_v)  # 8, 37, 1601
        prediction_scores_a = self.actionPredictions(
            sequence_output_a)  # 8, 5, 401

        return prediction_scores_t, prediction_scores_v, prediction_scores_a, seq_relationship_score


@BACKBONES.register()
class BertForMultiModalPreTraining(nn.Layer):
    """BERT model with multi-modal pre-training heads.
    """
    def __init__(
        self,
        vocab_size=30522,
        max_position_embeddings=512,
        type_vocab_size=2,
        v_target_size=1601,
        a_target_size=700,
        v_feature_size=2048,
        a_feature_size=2048,
        num_hidden_layers=12,
        v_num_hidden_layers=2,
        a_num_hidden_layers=3,
        t_ent_attention_id=[10, 11],
        v_ent_attention_id=[0, 1],
        a_ent_attention_id=[0, 1],
        fixed_t_layer=0,
        fixed_v_layer=0,
        hidden_size=768,
        v_hidden_size=1024,
        a_hidden_size=768,
        bi_hidden_size=1024,
        intermediate_size=3072,
        v_intermediate_size=1024,
        a_intermediate_size=3072,
        hidden_act="gelu",
        v_hidden_act="gelu",
        a_hidden_act="gelu",
        hidden_dropout_prob=0.1,
        v_hidden_dropout_prob=0.1,
        a_hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        v_attention_probs_dropout_prob=0.1,
        a_attention_probs_dropout_prob=0.1,
        av_attention_probs_dropout_prob=0.1,
        at_attention_probs_dropout_prob=0.1,
        num_attention_heads=12,
        v_num_attention_heads=8,
        a_num_attention_heads=12,
        bi_num_attention_heads=8,
        fusion_method="mul",
        pretrained=None,
    ):
        """
        vocab_size: vocabulary size. Default: 30522.
        max_position_embeddings: max position id. Default: 512.
        type_vocab_size: max segment id. Default: 2.
        v_target_size: class number of visual words. Default: 1601.
        a_target_size: class number of action words. Default: 700.
        v_feature_size: input visual feature dimension. Default: 2048.
        a_feature_size: input action feature dimension. Default: 2048.
        num_hidden_layers: number of BertLayers in the text transformer. Default: 12.
        v_num_hidden_layers: number of BertLayers in the visual transformer. Default: 2.
        a_num_hidden_layers: number of BertLayers in the action transformer. Default: 3.
        t_ent_attention_id: indices of BertConnectionLayers in the text transformer. Default: [10, 11].
        v_ent_attention_id: indices of BertConnectionLayers in the visual transformer. Default: [0, 1].
        a_ent_attention_id: indices of BertConnectionLayers in the action transformer. Default: [0, 1].
        fixed_t_layer: number of leading BertLayers in the text transformer kept frozen (run without gradients). Default: 0.
        fixed_v_layer: number of leading BertLayers in the visual transformer kept frozen (run without gradients). Default: 0.
        hidden_size: hidden size in the text BertLayer. Default: 768.
        v_hidden_size: hidden size in the visual BertLayer. Default: 1024.
        a_hidden_size: hidden size in the action BertLayer. Default: 768.
        bi_hidden_size: hidden size in the BertConnectionLayer. Default: 1024.
        intermediate_size: intermediate size in the text BertLayer. Default: 3072.
        v_intermediate_size: intermediate size in the visual BertLayer. Default: 1024.
        a_intermediate_size: intermediate size in the action BertLayer. Default: 3072.
        hidden_act: hidden activation function in the text BertLayer. Default: "gelu".
        v_hidden_act: hidden activation function in the visual BertLayer. Default: "gelu".
        a_hidden_act: hidden activation function in the action BertLayer. Default: "gelu".
        hidden_dropout_prob: hidden dropout probability in the text embedding layer. Default: 0.1.
        v_hidden_dropout_prob: hidden dropout probability in the visual embedding layer. Default: 0.1.
        a_hidden_dropout_prob: hidden dropout probability in the action embedding layer. Default: 0.1.
        attention_probs_dropout_prob: attention dropout probability in the text BertLayer. Default: 0.1.
        v_attention_probs_dropout_prob: attention dropout probability in the visual BertLayer. Default: 0.1.
        a_attention_probs_dropout_prob: attention dropout probability in the action BertLayer. Default: 0.1.
        av_attention_probs_dropout_prob: attention dropout probability in the action-visual BertConnectionLayer. Default: 0.1.
        at_attention_probs_dropout_prob: attention dropout probability in the action-text BertConnectionLayer. Default: 0.1.
        num_attention_heads: number of heads in the text BertLayer. Default: 12.
        v_num_attention_heads: number of heads in the visual BertLayer. Default: 8.
        a_num_attention_heads: number of heads in the action BertLayer. Default: 12.
        bi_num_attention_heads: number of heads in the BertConnectionLayer. Default: 8.
        fusion_method: method of fusing the pooled outputs of the 3 transformers. Default: "mul".
        """
        super(BertForMultiModalPreTraining, self).__init__()
        self.pretrained = pretrained
        self.vocab_size = vocab_size
        self.a_target_size = a_target_size

        self.bert = BertModel(
            vocab_size,
            max_position_embeddings,
            type_vocab_size,
            v_feature_size,
            a_feature_size,
            num_hidden_layers,
            v_num_hidden_layers,
            a_num_hidden_layers,
            v_ent_attention_id,
            t_ent_attention_id,
            a_ent_attention_id,
            fixed_t_layer,
            fixed_v_layer,
            hidden_size,
            v_hidden_size,
            a_hidden_size,
            bi_hidden_size,
            intermediate_size,
            v_intermediate_size,
            a_intermediate_size,
            hidden_act,
            v_hidden_act,
            a_hidden_act,
            hidden_dropout_prob,
            v_hidden_dropout_prob,
            a_hidden_dropout_prob,
            attention_probs_dropout_prob,
            v_attention_probs_dropout_prob,
            a_attention_probs_dropout_prob,
            av_attention_probs_dropout_prob,
            at_attention_probs_dropout_prob,
            num_attention_heads,
            v_num_attention_heads,
            a_num_attention_heads,
            bi_num_attention_heads,
        )
        self.cls = BertPreTrainingHeads(
            hidden_size, v_hidden_size, a_hidden_size, bi_hidden_size,
            hidden_act, v_hidden_act, a_hidden_act, v_target_size,
            a_target_size, fusion_method,
            self.bert.embeddings.word_embeddings.weight)

    def init_weights(self):
        """Initialize the parameters.
        """
        if isinstance(self.pretrained, str) and self.pretrained.strip() != "":
            load_ckpt(self, self.pretrained)
        elif self.pretrained is None or self.pretrained.strip() == "":
            for layer in self.sublayers():
                if isinstance(layer, (nn.Linear, nn.Embedding)):
                    weight_init_(layer, 'Normal', std=0.02)
                elif isinstance(layer, nn.LayerNorm):
                    weight_init_(layer, 'Constant', value=1)

    def forward(
        self,
        text_ids,  # 8,36
        action_feat,  # 8,5,2048
        image_feat,  # 8,37,2048
        image_loc,  # 8,37,5
        token_type_ids=None,  # 8,36
        text_mask=None,  # 8,36
        image_mask=None,  # 8,37
        action_mask=None,  # 8,5
    ):
        """
        text_ids: input text ids. Shape: [batch_size, sequence_length]
        action_feat: input action feature. Shape: [batch_size, action_length, action_feature_dim]
        image_feat: input image feature. Shape: [batch_size, region_length + 1, image_feature_dim]; the extra 1 is the global image feature.
        image_loc: input region location. Shape: [batch_size, region_length + 1, region_location_dim]; the extra 1 is the location of the global image feature.
        token_type_ids: segment ids of each video clip. Shape: [batch_size, sequence_length]
        text_mask: text mask, 1 for real tokens and 0 for padding tokens. Shape: [batch_size, sequence_length]
        image_mask: image mask, 1 for real tokens and 0 for padding tokens. Shape: [batch_size, region_length]
        action_mask: action mask, 1 for real tokens and 0 for padding tokens. Shape: [batch_size, action_length]
        """
        sequence_output_t, sequence_output_v, sequence_output_a, \
            pooled_output_t, pooled_output_v, pooled_output_a = self.bert(
                text_ids,
                action_feat,
                image_feat,
                image_loc,
                token_type_ids,
                text_mask,
                image_mask,
                action_mask,
                output_all_encoded_layers=False,
            )

        prediction_scores_t, prediction_scores_v, prediction_scores_a, seq_relationship_score = self.cls(
            sequence_output_t, sequence_output_v, sequence_output_a,
            pooled_output_t, pooled_output_v, pooled_output_a)

        return prediction_scores_t, prediction_scores_v, prediction_scores_a, seq_relationship_score
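

# Minimal end-to-end usage sketch (illustrative; tensor sizes follow the shape
# comments in forward() above and the default constructor arguments):
#
#     model = BertForMultiModalPreTraining()
#     text_ids = paddle.randint(0, 30522, shape=[8, 36])
#     action_feat = paddle.randn([8, 5, 2048])
#     image_feat = paddle.randn([8, 37, 2048])
#     image_loc = paddle.rand([8, 37, 5])
#     t_scores, v_scores, a_scores, seq_rel = model(
#         text_ids, action_feat, image_feat, image_loc)
#     # t_scores: [8, 36, 30522], v_scores: [8, 37, 1601],
#     # a_scores: [8, 5, 700], seq_rel: [8, 2]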