# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
import numpy as np
from PIL import Image
# import decord as de
import copy
import json
from ..registry import PIPELINES
try:
    from paddlenlp.transformers import BertTokenizer
except ImportError as e:
    print(
        f"Warning! {e}, the [paddlenlp] package and its dependencies are required for ActBERT."
    )
@PIPELINES.register()
class FeaturePadding(object):
    """
    Padding feature to target shape.
    """
    def __init__(self, max_region_num=36, max_action_num=5):
        self.max_region_num = max_region_num
        self.max_action_num = max_action_num

    def __call__(self, results):
        """
        Padding feature.
        """
        pack_feature = results['feature']
        tokenizer = results['tokenizer']
        image_feature_wp, image_target_wp, image_location_wp, \
            num_boxes, image_h, image_w, image_id, caption, \
            action_feature_wp, action_target_wp, num_actions = pack_feature

        image_feature = np.zeros((self.max_region_num, 2048), dtype=np.float32)
        image_target = np.zeros((self.max_region_num, 1601), dtype=np.float32)
        image_location = np.zeros((self.max_region_num, 5), dtype=np.float32)
        action_feature = np.zeros((self.max_action_num, 2048), dtype=np.float32)
        action_target = np.zeros((self.max_action_num, ), dtype=np.int64)

        # Copy the real boxes; the remainder stays zero-padded.
        num_boxes = int(num_boxes)
        image_feature[:num_boxes] = image_feature_wp
        image_target[:num_boxes] = image_target_wp
        image_location[:num_boxes, :4] = image_location_wp

        # The fifth location channel is the box area, normalized by image size.
        image_location[:, 4] = (image_location[:, 3] - image_location[:, 1]) * (
            image_location[:, 2] - image_location[:, 0]) / (float(image_w) *
                                                            float(image_h))
        # Normalize the box coordinates to [0, 1].
        image_location[:, 0] = image_location[:, 0] / float(image_w)
        image_location[:, 1] = image_location[:, 1] / float(image_h)
        image_location[:, 2] = image_location[:, 2] / float(image_w)
        image_location[:, 3] = image_location[:, 3] / float(image_h)

        image_feature = copy.deepcopy(image_feature)
        image_target = copy.deepcopy(image_target)

        num_actions = int(num_actions)
        action_feature[:num_actions] = action_feature_wp
        action_target[:num_actions] = action_target_wp
        action_feature = copy.deepcopy(action_feature)
        action_target = copy.deepcopy(action_target)

        results = dict(image_feat=image_feature,
                       image_target=image_target,
                       caption=caption,
                       image_loc=image_location,
                       num_boxes=int(num_boxes),
                       action_feat=action_feature,
                       action_target=action_target,
                       num_actions=int(num_actions),
                       tokenizer=tokenizer)
        return results
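
# A minimal usage sketch for FeaturePadding (illustrative only; `pack_feature`
# stands for a hypothetical pre-extracted feature tuple in the order unpacked
# above, and `tokenizer` for a BertTokenizer instance):
#
#   padder = FeaturePadding(max_region_num=36, max_action_num=5)
#   out = padder({'feature': pack_feature, 'tokenizer': tokenizer})
#   out['image_feat'].shape  # (36, 2048), zero-padded past num_boxes
#   out['image_loc'].shape   # (36, 5): normalized x1, y1, x2, y2, area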
@PIPELINES.register()
class RandomCap(object):
    def __init__(self, caption_path):
        """
        Random Caption for NSP task
        """
        self.caption_path = caption_path

    def select_caption(self, caption):
        # Multiple candidate captions are packed into one string, separated by '!'.
        captions = caption.split('!')
        rind = random.randint(0, len(captions) - 1)
        caption = captions[rind]
        return caption

    def get_random_caption(self, all_captions):
        num_caps = len(all_captions)
        rand_doc_idx = random.randint(0, num_caps - 1)
        caption = all_captions[rand_doc_idx]
        caption = self.select_caption(caption)
        return caption

    def random_cap(self, caption, all_captions):
        # With probability 0.5 keep the true caption (label 0); otherwise
        # substitute a random one (label 1) for the next-sentence task.
        if random.random() > 0.5:
            label = 0
        else:
            caption = self.get_random_caption(all_captions)
            label = 1
        return caption, label

    def __call__(self, results):
        caption = results['caption']
        with open(self.caption_path, 'r') as f:
            all_captions = list(json.load(f))
        caption = self.select_caption(caption)
        caption, label = self.random_cap(caption, all_captions)
        results['caption'] = caption
        results['is_next'] = label
        return results
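
# A sketch of the NSP-style labeling done by RandomCap (illustrative only;
# 'captions.json' is a hypothetical path to a JSON list of caption strings):
#
#   rc = RandomCap(caption_path='captions.json')
#   out = rc({'caption': 'a man chops an onion!a man dices an onion'})
#   out['is_next']  # 0 if the original caption was kept, 1 if replaced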
@PIPELINES.register()
class Tokenize(object):
    def __init__(self):
        """
        Tokenize caption
        """
        pass

    def __call__(self, results):
        caption = results['caption']
        tokenizer = results['tokenizer']
        tokens_caption = tokenizer.tokenize(caption)
        results['caption'] = tokens_caption
        return results
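
# Tokenize replaces the caption string with its wordpiece tokens, e.g.
# (illustrative; the exact pieces depend on the tokenizer's vocab):
#
#   out = Tokenize()({'caption': 'chopping onions', 'tokenizer': tokenizer})
#   out['caption']  # e.g. ['chop', '##ping', 'onion', '##s']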
@PIPELINES.register()
class RandomMask(object):
    def __init__(self,
                 max_seq_length=36,
                 max_action_length=5,
                 max_region_length=36):
        self.max_seq_length = max_seq_length
        self.max_action_length = max_action_length
        self.max_region_length = max_region_length

    def get_image_global_feature(self, image_feat, image_loc, image_mask):
        # Prepend a global feature: the mean of the real region features, with
        # a location box covering the whole image and a mask entry of 1.
        g_image_feat = np.sum(image_feat, axis=0) / np.sum(
            image_mask, axis=0, keepdims=True)
        image_feat = np.concatenate(
            [np.expand_dims(g_image_feat, axis=0), image_feat],
            axis=0).astype("float32")

        g_image_loc = np.array([0, 0, 1, 1, 1]).astype("float32")
        image_loc = np.concatenate(
            [np.expand_dims(g_image_loc, axis=0), image_loc], axis=0)

        g_image_mask = np.array([1])
        image_mask = np.concatenate([g_image_mask, image_mask], axis=0)
        return image_feat, image_loc, image_mask
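
    # Shape sketch: prepending the global feature grows the visual sequence by
    # one, e.g. with max_region_length = 36 regions (illustrative only):
    #
    #   image_feat: (36, 2048) -> (37, 2048)   row 0 = mean of real regions
    #   image_loc:  (36, 5)    -> (37, 5)      row 0 = [0, 0, 1, 1, 1]
    #   image_mask: (36,)      -> (37,)        entry 0 = 1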
    def _truncate_seq_pair(self, tokens_b, max_length):
        """Truncates a token sequence in place to the maximum length.

        This is a simple heuristic that always drops tokens from the end,
        one at a time, until the sequence fits.
        """
        while True:
            total_length = len(tokens_b)
            if total_length <= max_length:
                break
            tokens_b.pop()
    def random_word(self, tokens, tokenizer):
        """
        Masking some random tokens for Language Model task with probabilities as in the original BERT paper.
        Args:
            tokens: list of str, tokenized sentence.
            tokenizer: Tokenizer, object used for tokenization (we need its vocab here).
        Return:
            (list of str, list of int), masked tokens and related labels for LM prediction
        """
        output_label = []

        for i, token in enumerate(tokens):
            prob = random.random()
            # mask token with 15% probability
            if prob < 0.15:
                prob /= 0.15

                # 80%: replace the token with [MASK]
                if prob < 0.8:
                    tokens[i] = "[MASK]"

                # 10%: replace the token with a random token
                elif prob < 0.9:
                    # random.randint is inclusive on both ends, so the upper
                    # bound must be vocab_size - 1.
                    tok = tokenizer.vocab.idx_to_token[random.randint(
                        0, tokenizer.vocab_size - 1)]
                    tokens[i] = tok

                # remaining 10%: keep the current token unchanged

                # record the original token id (these positions are predicted later)
                try:
                    output_label.append(tokenizer.vocab[token])
                except KeyError:
                    # For unknown words (should not occur with BPE vocab)
                    output_label.append(tokenizer.vocab["[UNK]"])
                    print("Cannot find token '{}' in vocab. Using [UNK] instead".
                          format(token))
            else:
                # not masked; -1 is ignored by the loss function later
                output_label.append(-1)

        return tokens, output_label
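
    # Worked example of the probabilities above: for a token drawn with
    # prob < 0.15, the renormalized prob / 0.15 is uniform on [0, 1), so each
    # token overall is [MASK]-ed with p = 0.15 * 0.8 = 0.12, replaced by a
    # random token with p = 0.15 * 0.1 = 0.015, and kept unchanged (but still
    # predicted) with p = 0.015. All other tokens get label -1.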
    def random_region(self, image_feat, image_loc, num_boxes):
        output_label = []

        for i in range(num_boxes):
            prob = random.random()
            # mask region with 15% probability
            if prob < 0.15:
                prob /= 0.15

                # 90%: zero out the region feature
                if prob < 0.9:
                    image_feat[i] = 0
                # remaining 10%: keep the current feature

                # mark this region for prediction later
                output_label.append(1)
            else:
                # not masked; -1 is ignored by the loss function later
                output_label.append(-1)

        return image_feat, image_loc, output_label
    def random_action(self, action_feat, action_target, num_actions):
        output_label = []

        for i in range(num_actions):
            prob = random.random()
            # mask action with 15% probability
            if prob < 0.15:
                prob /= 0.15

                # 90%: zero out the action feature
                if prob < 0.9:
                    action_feat[i] = 0
                # remaining 10%: keep the current feature

                # record the action label for prediction later
                output_label.append(action_target[i])
            else:
                # not masked; -1 is ignored by the loss function later
                output_label.append(-1)

        return action_feat, output_label
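
    # Label semantics across the three masking routines (summary):
    #   random_word   -> original token id at masked positions, else -1
    #   random_region -> 1 at masked regions (feature prediction), else -1
    #   random_action -> action class id at masked actions, else -1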
    def __call__(self, results):
        caption = results['caption']
        tokenizer = results['tokenizer']
        image_feat = results['image_feat']
        image_loc = results['image_loc']
        num_boxes = results['num_boxes']
        action_feat = results['action_feat']
        action_target = results['action_target']
        num_actions = results['num_actions']
        is_next = results['is_next']
        image_target = results['image_target']

        self._truncate_seq_pair(caption, self.max_seq_length - 2)

        caption, caption_label = self.random_word(caption, tokenizer)
        image_feat, image_loc, image_label = self.random_region(
            image_feat, image_loc, num_boxes)
        action_feat, action_label = self.random_action(action_feat,
                                                       action_target,
                                                       num_actions)

        # concatenate lm labels and account for [CLS] and [SEP]
        lm_label_ids = [-1] + caption_label + [-1]

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids: 0     0   0   0  0     0 0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0`
        # and `type=1` were learned during pre-training and are added to the
        # wordpiece embedding vector (and position vector). This is not
        # *strictly* necessary since the [SEP] token unambiguously separates
        # the sequences, but it makes it easier for the model to learn the
        # concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS])
        # is used as the "sentence vector". Note that this only makes sense
        # because the entire model is fine-tuned.
        tokens = []
        segment_ids = []
        tokens.append("[CLS]")
        segment_ids.append(0)

        for token in caption:
            tokens.append(token)
            segment_ids.append(0)
        tokens.append("[SEP]")
        segment_ids.append(0)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)
        image_mask = [1] * num_boxes
        action_mask = [1] * num_actions

        # Zero-pad up to the visual sequence length.
        while len(image_mask) < self.max_region_length:
            image_mask.append(0)
            image_label.append(-1)
        while len(action_mask) < self.max_action_length:
            action_mask.append(0)
            action_label.append(-1)

        # Zero-pad up to the sequence length.
        while len(input_ids) < self.max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)
            lm_label_ids.append(-1)

        assert len(input_ids) == self.max_seq_length
        assert len(input_mask) == self.max_seq_length
        assert len(segment_ids) == self.max_seq_length
        assert len(lm_label_ids) == self.max_seq_length
        assert len(image_mask) == self.max_region_length
        assert len(image_label) == self.max_region_length
        assert len(action_mask) == self.max_action_length
        assert len(action_label) == self.max_action_length

        image_feat, image_loc, image_mask = self.get_image_global_feature(
            image_feat, image_loc, np.array(image_mask))

        features = [
            np.array(input_ids),
            action_feat,
            image_feat,
            image_loc,
            np.array(segment_ids),
            np.array(input_mask),
            image_mask,
            np.array(action_mask),
            np.array(lm_label_ids),
            np.array(action_label),
            np.array(is_next),
            np.array(image_label),
            image_target,
        ]
        results['features'] = features
        return results
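
# End-to-end sketch of how these pipeline stages compose (illustrative only;
# `pack_feature`, `tokenizer`, and 'captions.json' are hypothetical stand-ins
# for values supplied by the dataset loader and config):
#
#   pipeline = [FeaturePadding(), RandomCap('captions.json'),
#               Tokenize(), RandomMask()]
#   sample = {'feature': pack_feature, 'tokenizer': tokenizer}
#   for stage in pipeline:
#       sample = stage(sample)
#   sample['features']  # list of padded/masked arrays fed to ActBERT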