# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import random
import numpy as np
from PIL import Image
# import decord as de
import copy
import json
from ..registry import PIPELINES

try:
    from paddlenlp.transformers import BertTokenizer
except ImportError as e:
    print(
        f"Warning! {e}, the [paddlenlp] package and its dependencies are required for ActBERT."
    )


@PIPELINES.register()
class FeaturePadding(object):
    """
    Pad image-region and action features to fixed target shapes.
    """
    def __init__(self, max_region_num=36, max_action_num=5):
        self.max_region_num = max_region_num
        self.max_action_num = max_action_num

    def __call__(self, results):
        """
        Pad features.
        """
        pack_feature = results['feature']
        tokenizer = results['tokenizer']
        image_feature_wp, image_target_wp, image_location_wp, \
            num_boxes, image_h, image_w, image_id, caption, \
            action_feature_wp, action_target_wp, num_actions = pack_feature

        image_feature = np.zeros((self.max_region_num, 2048), dtype=np.float32)
        image_target = np.zeros((self.max_region_num, 1601), dtype=np.float32)
        image_location = np.zeros((self.max_region_num, 5), dtype=np.float32)

        action_feature = np.zeros((self.max_action_num, 2048), dtype=np.float32)
        action_target = np.zeros((self.max_action_num, ), dtype=np.int64)

        num_boxes = int(num_boxes)
        image_feature[:num_boxes] = image_feature_wp
        image_target[:num_boxes] = image_target_wp
        image_location[:num_boxes, :4] = image_location_wp

        # The fifth location entry is the box area, normalized by image size.
        image_location[:, 4] = (image_location[:, 3] - image_location[:, 1]) * (
            image_location[:, 2] - image_location[:, 0]) / (float(image_w) *
                                                            float(image_h))

        # Normalize box coordinates to [0, 1].
        image_location[:, 0] = image_location[:, 0] / float(image_w)
        image_location[:, 1] = image_location[:, 1] / float(image_h)
        image_location[:, 2] = image_location[:, 2] / float(image_w)
        image_location[:, 3] = image_location[:, 3] / float(image_h)

        image_feature = copy.deepcopy(image_feature)
        image_target = copy.deepcopy(image_target)

        num_actions = int(num_actions)
        action_feature[:num_actions] = action_feature_wp
        action_target[:num_actions] = action_target_wp
        action_feature = copy.deepcopy(action_feature)
        action_target = copy.deepcopy(action_target)

        results = dict(image_feat=image_feature,
                       image_target=image_target,
                       caption=caption,
                       image_loc=image_location,
                       num_boxes=int(num_boxes),
                       action_feat=action_feature,
                       action_target=action_target,
                       num_actions=int(num_actions),
                       tokenizer=tokenizer)
        return results

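# A minimal usage sketch (illustrative only; `pack_feature` and `tokenizer`
# are hypothetical placeholders for the eleven-field tuple and the tokenizer
# unpacked in __call__ above):
#
#   pad = FeaturePadding(max_region_num=36, max_action_num=5)
#   out = pad({'feature': pack_feature, 'tokenizer': tokenizer})
#   out['image_feat'].shape   # (36, 2048)
#   out['action_feat'].shape  # (5, 2048)
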
@PIPELINES.register()
class RandomCap(object):
    def __init__(self, caption_path):
        """
        Randomly replace the caption, for the Next Sentence Prediction (NSP) task.
        """
        self.caption_path = caption_path

    def select_caption(self, caption):
        # Captions are stored as '!'-separated candidates; pick one at random.
        captions = caption.split('!')
        rind = random.randint(0, len(captions) - 1)
        caption = captions[rind]
        return caption

    def get_random_caption(self, all_captions):
        num_caps = len(all_captions)
        rand_doc_idx = random.randint(0, num_caps - 1)
        caption = all_captions[rand_doc_idx]
        caption = self.select_caption(caption)
        return caption

    def random_cap(self, caption, all_captions):
        # With probability 0.5 keep the true caption (label 0); otherwise
        # substitute a random caption (label 1).
        if random.random() > 0.5:
            label = 0
        else:
            caption = self.get_random_caption(all_captions)
            label = 1
        return caption, label

    def __call__(self, results):
        caption = results['caption']
        with open(self.caption_path, 'r') as f:
            all_captions = list(json.load(f))
        caption = self.select_caption(caption)
        caption, label = self.random_cap(caption, all_captions)
        results['caption'] = caption
        results['is_next'] = label
        return results

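# Illustrative sketch: after RandomCap, `results['is_next']` is 0 when the
# original caption was kept and 1 when a random caption was substituted, so it
# can be used directly as the next-sentence-prediction target. Assuming a
# hypothetical 'captions.json' holding a list of '!'-separated caption strings:
#
#   rc = RandomCap(caption_path='captions.json')
#   out = rc({'caption': 'a man cooks!someone stirs a pot'})
#   out['is_next'] in (0, 1)  # True
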
@PIPELINES.register()
class Tokenize(object):
    def __init__(self):
        """
        Tokenize the caption with the tokenizer carried in `results`.
        """
        pass

    def __call__(self, results):
        caption = results['caption']
        tokenizer = results['tokenizer']
        tokens_caption = tokenizer.tokenize(caption)
        results['caption'] = tokens_caption
        return results

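# Rough example, assuming paddlenlp's BertTokenizer (see the guarded import at
# the top of this file) is carried in `results['tokenizer']`:
#
#   tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
#   out = Tokenize()({'caption': 'a man is cooking', 'tokenizer': tokenizer})
#   out['caption']  # e.g. ['a', 'man', 'is', 'cooking']
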
@PIPELINES.register()
class RandomMask(object):
    def __init__(self,
                 max_seq_length=36,
                 max_action_length=5,
                 max_region_length=36):
        self.max_seq_length = max_seq_length
        self.max_action_length = max_action_length
        self.max_region_length = max_region_length

    def get_image_global_feature(self, image_feat, image_loc, image_mask):
        # Mean-pool the region features over the unpadded boxes (padding rows
        # are zero, and the mask sums to the number of real boxes) to get a
        # global image feature, then prepend it to the region sequence.
        g_image_feat = np.sum(image_feat, axis=0) / np.sum(
            image_mask, axis=0, keepdims=True)
        image_feat = np.concatenate(
            [np.expand_dims(g_image_feat, axis=0), image_feat],
            axis=0).astype("float32")

        # The global box covers the whole image: [0, 0, 1, 1] with area 1.
        g_image_loc = np.array([0, 0, 1, 1, 1]).astype("float32")
        image_loc = np.concatenate(
            [np.expand_dims(g_image_loc, axis=0), image_loc], axis=0)

        g_image_mask = np.array([1])
        image_mask = np.concatenate([g_image_mask, image_mask], axis=0)

        return image_feat, image_loc, image_mask

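    # Shape sketch (illustrative): with 36 regions of dimension 2048,
    #
    #   rm = RandomMask()
    #   feat, loc, mask = rm.get_image_global_feature(
    #       np.zeros((36, 2048), np.float32),
    #       np.zeros((36, 5), np.float32),
    #       np.ones((36, ), np.float32))
    #   feat.shape  # (37, 2048) -- global feature prepended at index 0
    #   loc.shape   # (37, 5)
    #   mask.shape  # (37, )
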
    def _truncate_seq_pair(self, tokens_b, max_length):
        """Truncates a sequence in place to the maximum length.

        Tokens are removed one at a time from the end of the sequence
        until it fits within `max_length`.
        """
        while True:
            total_length = len(tokens_b)
            if total_length <= max_length:
                break
            tokens_b.pop()

    def random_word(self, tokens, tokenizer):
        """
        Mask some random tokens for the masked language model task, with
        probabilities as in the original BERT paper.
        Args:
            tokens: list of str, tokenized sentence.
            tokenizer: Tokenizer, object used for tokenization (we need its vocab here).
        Return:
            (list of str, list of int), masked tokens and related labels for LM prediction.
        """
        output_label = []

        for i, token in enumerate(tokens):
            prob = random.random()
            # mask token with 15% probability
            if prob < 0.15:
                prob /= 0.15

                # 80%: replace token with the mask token
                if prob < 0.8:
                    tokens[i] = "[MASK]"

                # 10%: replace token with a random token
                elif prob < 0.9:
                    # tok = random.choice(list(tokenizer.vocab.items()))[0]
                    # `randint` is inclusive on both ends, so the upper bound
                    # must be vocab_size - 1 to stay within the vocab.
                    tok = tokenizer.vocab.idx_to_token[random.randint(
                        0, tokenizer.vocab_size - 1)]
                    tokens[i] = tok

                # remaining 10%: keep the current token unchanged
                # append the original token id to the output (predicted later)
                try:
                    output_label.append(tokenizer.vocab[token])
                except KeyError:
                    # For unknown words (should not occur with a BPE vocab)
                    output_label.append(tokenizer.vocab["[UNK]"])
                    print("Cannot find token '{}' in vocab. Using [UNK] instead".
                          format(token))
            else:
                # no masking; the token will be ignored by the loss function (-1)
                output_label.append(-1)

        return tokens, output_label

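    # Worked example of the masking distribution above: each token is selected
    # with probability 0.15; `prob /= 0.15` then rescales the draw to [0, 1),
    # so conditioned on selection:
    #   prob < 0.8        -> "[MASK]"       (0.15 * 0.8 = 12% of all tokens)
    #   0.8 <= prob < 0.9 -> random token   (0.15 * 0.1 = 1.5%)
    #   prob >= 0.9       -> kept unchanged (0.15 * 0.1 = 1.5%)
    # The remaining 85% of tokens get label -1 and are ignored by the loss.
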
    def random_region(self, image_feat, image_loc, num_boxes):
        output_label = []

        for i in range(num_boxes):
            prob = random.random()
            # mask region with 15% probability
            if prob < 0.15:
                prob /= 0.15

                # 90%: zero out the region feature
                if prob < 0.9:
                    image_feat[i] = 0

                # remaining 10%: keep the current feature
                # mark this region as a prediction target
                output_label.append(1)
            else:
                # no masking; the region will be ignored by the loss function (-1)
                output_label.append(-1)

        return image_feat, image_loc, output_label

    def random_action(self, action_feat, action_target, num_actions):
        output_label = []

        for i in range(num_actions):
            prob = random.random()
            # mask action with 15% probability
            if prob < 0.15:
                prob /= 0.15

                # 90%: zero out the action feature
                if prob < 0.9:
                    action_feat[i] = 0

                # remaining 10%: keep the current feature
                # record the action label as the prediction target
                output_label.append(action_target[i])
            else:
                # no masking; the action will be ignored by the loss function (-1)
                output_label.append(-1)

        return action_feat, output_label

    def __call__(self, results):
        caption = results['caption']
        tokenizer = results['tokenizer']
        image_feat = results['image_feat']
        image_loc = results['image_loc']
        num_boxes = results['num_boxes']
        action_feat = results['action_feat']
        action_target = results['action_target']
        num_actions = results['num_actions']
        is_next = results['is_next']
        image_target = results['image_target']

        self._truncate_seq_pair(caption, self.max_seq_length - 2)
        caption, caption_label = self.random_word(caption, tokenizer)

        image_feat, image_loc, image_label = self.random_region(
            image_feat, image_loc, num_boxes)
        action_feat, action_label = self.random_action(action_feat,
                                                       action_target,
                                                       num_actions)

        # concatenate lm labels and account for [CLS] and [SEP]
        lm_label_ids = [-1] + caption_label + [-1]

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids:   0   0    0    0    0      0    0   0   1  1  1   1  1   1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids:   0    0   0  0    0   0   0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.

        tokens = []
        segment_ids = []

        tokens.append("[CLS]")
        segment_ids.append(0)

        for token in caption:
            tokens.append(token)
            segment_ids.append(0)
        tokens.append("[SEP]")
        segment_ids.append(0)
        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * (len(input_ids))
        image_mask = [1] * (num_boxes)
        action_mask = [1] * (num_actions)

        # Zero-pad up to the visual sequence length.
        while len(image_mask) < self.max_region_length:
            image_mask.append(0)
            image_label.append(-1)
        while len(action_mask) < self.max_action_length:
            action_mask.append(0)
            action_label.append(-1)

        # Zero-pad up to the text sequence length.
        while len(input_ids) < self.max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)
            lm_label_ids.append(-1)

        assert len(input_ids) == self.max_seq_length
        assert len(input_mask) == self.max_seq_length
        assert len(segment_ids) == self.max_seq_length
        assert len(lm_label_ids) == self.max_seq_length
        assert len(image_mask) == self.max_region_length
        assert len(image_label) == self.max_region_length
        assert len(action_mask) == self.max_action_length
        assert len(action_label) == self.max_action_length

        image_feat, image_loc, image_mask = self.get_image_global_feature(
            image_feat, image_loc, np.array(image_mask))
        features = [
            np.array(input_ids),
            action_feat,
            image_feat,
            image_loc,
            np.array(segment_ids),
            np.array(input_mask),
            image_mask,
            np.array(action_mask),
            np.array(lm_label_ids),
            np.array(action_label),
            np.array(is_next),
            np.array(image_label),
            image_target,
        ]
        results['features'] = features
        return results
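

# End-to-end sketch (illustrative; the actual order comes from the pipeline
# config, and 'captions.json' is a placeholder path): these transforms are
# applied in sequence to a shared `results` dict, roughly:
#
#   for op in (FeaturePadding(), RandomCap('captions.json'), Tokenize(),
#              RandomMask()):
#       results = op(results)
#   input_ids, action_feat, image_feat, *rest = results['features']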