# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import copy
import json
import random

import numpy as np
from PIL import Image

from ..registry import PIPELINES

try:
    from paddlenlp.transformers import BertTokenizer
except ImportError as e:
    print(f"Warning! {e}. The [paddlenlp] package and its dependencies "
          "are required for ActBERT.")


@PIPELINES.register()
class FeaturePadding(object):
    """
    Pad features to a fixed target shape.
    """
    def __init__(self, max_region_num=36, max_action_num=5):
        self.max_region_num = max_region_num
        self.max_action_num = max_action_num

    def __call__(self, results):
        """
        Pad image-region and action features.
        """
        pack_feature = results['feature']
        tokenizer = results['tokenizer']
        image_feature_wp, image_target_wp, image_location_wp, \
            num_boxes, image_h, image_w, image_id, caption, \
            action_feature_wp, action_target_wp, num_actions = pack_feature

        image_feature = np.zeros((self.max_region_num, 2048),
                                 dtype=np.float32)
        image_target = np.zeros((self.max_region_num, 1601), dtype=np.float32)
        image_location = np.zeros((self.max_region_num, 5), dtype=np.float32)

        action_feature = np.zeros((self.max_action_num, 2048),
                                  dtype=np.float32)
        action_target = np.zeros((self.max_action_num, ), dtype=np.int64)

        num_boxes = int(num_boxes)
        image_feature[:num_boxes] = image_feature_wp
        image_target[:num_boxes] = image_target_wp
        image_location[:num_boxes, :4] = image_location_wp

        # The 5th location channel is the box area relative to the image area.
        image_location[:, 4] = (
            image_location[:, 3] - image_location[:, 1]) * (
                image_location[:, 2] -
                image_location[:, 0]) / (float(image_w) * float(image_h))

        # Normalize box coordinates to [0, 1].
        image_location[:, 0] = image_location[:, 0] / float(image_w)
        image_location[:, 1] = image_location[:, 1] / float(image_h)
        image_location[:, 2] = image_location[:, 2] / float(image_w)
        image_location[:, 3] = image_location[:, 3] / float(image_h)

        image_feature = copy.deepcopy(image_feature)
        image_target = copy.deepcopy(image_target)

        num_actions = int(num_actions)
        action_feature[:num_actions] = action_feature_wp
        action_target[:num_actions] = action_target_wp
        action_feature = copy.deepcopy(action_feature)
        action_target = copy.deepcopy(action_target)

        results = dict(image_feat=image_feature,
                       image_target=image_target,
                       caption=caption,
                       image_loc=image_location,
                       num_boxes=int(num_boxes),
                       action_feat=action_feature,
                       action_target=action_target,
                       num_actions=int(num_actions),
                       tokenizer=tokenizer)
        return results
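
# NOTE: Minimal sketch (not used by the pipeline) of the 5-dim box encoding
# built in FeaturePadding above: coordinates normalized to [0, 1] plus the
# box area as a fraction of the image area. The helper name
# `_encode_box_location` is hypothetical.
def _encode_box_location(box, image_w, image_h):
    """Return [x1/W, y1/H, x2/W, y2/H, area / (W * H)] for one box."""
    x1, y1, x2, y2 = box
    area_ratio = (x2 - x1) * (y2 - y1) / (float(image_w) * float(image_h))
    return [
        x1 / float(image_w), y1 / float(image_h), x2 / float(image_w),
        y2 / float(image_h), area_ratio
    ]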

@PIPELINES.register()
class RandomCap(object):
    def __init__(self, caption_path):
        """
        Random caption for the NSP task.
        """
        self.caption_path = caption_path

    def select_caption(self, caption):
        captions = caption.split('!')
        rind = random.randint(0, len(captions) - 1)
        caption = captions[rind]
        return caption

    def get_random_caption(self, all_captions):
        num_caps = len(all_captions)
        rand_doc_idx = random.randint(0, num_caps - 1)
        caption = all_captions[rand_doc_idx]
        caption = self.select_caption(caption)
        return caption

    def random_cap(self, caption, all_captions):
        if random.random() > 0.5:
            label = 0
        else:
            caption = self.get_random_caption(all_captions)
            label = 1
        return caption, label

    def __call__(self, results):
        caption = results['caption']
        all_captions = list(json.load(open(self.caption_path, 'r')))
        caption = self.select_caption(caption)
        caption, label = self.random_cap(caption, all_captions)
        results['caption'] = caption
        results['is_next'] = label
        return results


@PIPELINES.register()
class Tokenize(object):
    def __init__(self, ):
        """
        Tokenize the caption.
        """
        pass

    def __call__(self, results):
        caption = results['caption']
        tokenizer = results['tokenizer']
        tokens_caption = tokenizer.tokenize(caption)
        results['caption'] = tokens_caption
        return results
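
# NOTE: Minimal sketch (not used by the pipeline) of the BERT masking scheme
# applied by RandomMask.random_word below: each token is selected with
# probability 0.15; a selected token is replaced by "[MASK]" 80% of the
# time, by a random vocabulary token 10% of the time, and kept unchanged
# 10% of the time. `vocab` here is an assumed plain list of token strings;
# the real pipeline stores vocabulary ids and uses -1 for ignored positions.
def _mask_tokens(tokens, vocab, mask_prob=0.15):
    labels = []
    for i, token in enumerate(tokens):
        prob = random.random()
        if prob < mask_prob:
            prob /= mask_prob  # rescale to [0, 1) within the selected 15%
            if prob < 0.8:
                tokens[i] = "[MASK]"
            elif prob < 0.9:
                tokens[i] = random.choice(vocab)
            # else: keep the original token
            labels.append(token)  # predict the original token here
        else:
            labels.append(None)  # ignored by the loss
    return tokens, labels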

@PIPELINES.register()
class RandomMask(object):
    def __init__(self,
                 max_seq_length=36,
                 max_action_length=5,
                 max_region_length=36):
        self.max_seq_length = max_seq_length
        self.max_action_length = max_action_length
        self.max_region_length = max_region_length

    def get_image_global_feature(self, image_feat, image_loc, image_mask):
        g_image_feat = np.sum(image_feat, axis=0) / np.sum(
            image_mask, axis=0, keepdims=True)
        image_feat = np.concatenate(
            [np.expand_dims(g_image_feat, axis=0), image_feat],
            axis=0).astype("float32")

        g_image_loc = np.array([0, 0, 1, 1, 1]).astype("float32")
        image_loc = np.concatenate(
            [np.expand_dims(g_image_loc, axis=0), image_loc], axis=0)

        g_image_mask = np.array([1])
        image_mask = np.concatenate([g_image_mask, image_mask], axis=0)

        return image_feat, image_loc, image_mask

    def _truncate_seq_pair(self, tokens_b, max_length):
        """Truncates a sequence in place to the maximum length.

        This is a simple heuristic that always truncates the longer sequence
        one token at a time. This makes more sense than truncating an equal
        percent of tokens from each, since if one sequence is very short then
        each token that's truncated likely contains more information than a
        longer sequence.
        """
        while True:
            total_length = len(tokens_b)
            if total_length <= max_length:
                break
            tokens_b.pop()

    def random_word(self, tokens, tokenizer):
        """
        Mask some random tokens for the language model task with
        the probabilities used in the original BERT paper.
        Args:
            tokens: list of str, tokenized sentence.
            tokenizer: Tokenizer, object used for tokenization (we need its vocab here).
        Return:
            (list of str, list of int), masked tokens and related labels for LM prediction.
        """
        output_label = []

        for i, token in enumerate(tokens):
            prob = random.random()
            # mask token with 15% probability
            if prob < 0.15:
                prob /= 0.15

                # 80%: replace token with the mask token
                if prob < 0.8:
                    tokens[i] = "[MASK]"

                # 10%: replace token with a random token
                elif prob < 0.9:
                    tok = tokenizer.vocab.idx_to_token[random.randint(
                        0, tokenizer.vocab_size - 1)]
                    tokens[i] = tok

                # remaining 10%: keep the current token

                # append the original token id to the output
                # (we will predict these later)
                try:
                    output_label.append(tokenizer.vocab[token])
                except KeyError:
                    # For unknown words (should not occur with BPE vocab)
                    output_label.append(tokenizer.vocab["[UNK]"])
                    print("Cannot find token '{}' in vocab. Using [UNK] "
                          "instead.".format(token))
            else:
                # no masking (will be ignored by the loss function later)
                output_label.append(-1)

        return tokens, output_label

    def random_region(self, image_feat, image_loc, num_boxes):
        output_label = []

        for i in range(num_boxes):
            prob = random.random()
            # mask region with 15% probability
            if prob < 0.15:
                prob /= 0.15

                # 90%: zero out the region feature
                if prob < 0.9:
                    image_feat[i] = 0
                # remaining 10%: keep the current feature

                # append label 1 to the output (we will predict these later)
                output_label.append(1)
            else:
                # no masking (will be ignored by the loss function later)
                output_label.append(-1)

        return image_feat, image_loc, output_label

    def random_action(self, action_feat, action_target, num_actions):
        output_label = []

        for i in range(num_actions):
            prob = random.random()
            # mask action with 15% probability
            if prob < 0.15:
                prob /= 0.15

                # 90%: zero out the action feature
                if prob < 0.9:
                    action_feat[i] = 0
                # remaining 10%: keep the current feature

                # append the action target to the output
                # (we will predict these later)
                output_label.append(action_target[i])
            else:
                # no masking (will be ignored by the loss function later)
                output_label.append(-1)

        return action_feat, output_label
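
    # A worked example of the label conventions above (illustrative values):
    # for the caption ["a", "dog", "runs"] with "dog" selected for masking,
    # random_word may return tokens ["a", "[MASK]", "runs"] and labels
    # [-1, vocab["dog"], -1]; positions labeled -1 are ignored by the
    # masked-LM loss, so only the selected positions contribute gradient.
    # random_region and random_action follow the same pattern, labeling
    # unselected positions -1.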

    def __call__(self, results):
        caption = results['caption']
        tokenizer = results['tokenizer']
        image_feat = results['image_feat']
        image_loc = results['image_loc']
        num_boxes = results['num_boxes']
        action_feat = results['action_feat']
        action_target = results['action_target']
        num_actions = results['num_actions']
        is_next = results['is_next']
        image_target = results['image_target']

        self._truncate_seq_pair(caption, self.max_seq_length - 2)

        caption, caption_label = self.random_word(caption, tokenizer)
        image_feat, image_loc, image_label = self.random_region(
            image_feat, image_loc, num_boxes)
        action_feat, action_label = self.random_action(action_feat,
                                                       action_target,
                                                       num_actions)

        # concatenate lm labels and account for [CLS] and [SEP]
        lm_label_ids = [-1] + caption_label + [-1]

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids: 0     0   0   0  0     0 0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0`
        # and `type=1` were learned during pre-training and are added to the
        # wordpiece embedding vector (and position vector). This is not
        # *strictly* necessary since the [SEP] token unambiguously separates
        # the sequences, but it makes it easier for the model to learn the
        # concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS])
        # is used as the "sentence vector". Note that this only makes sense
        # because the entire model is fine-tuned.
        tokens = []
        segment_ids = []

        tokens.append("[CLS]")
        segment_ids.append(0)
        for token in caption:
            tokens.append(token)
            segment_ids.append(0)
        tokens.append("[SEP]")
        segment_ids.append(0)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens.
        # Only real tokens are attended to.
        input_mask = [1] * (len(input_ids))
        image_mask = [1] * (num_boxes)
        action_mask = [1] * (num_actions)

        # Zero-pad up to the visual sequence length.
        while len(image_mask) < self.max_region_length:
            image_mask.append(0)
            image_label.append(-1)
        while len(action_mask) < self.max_action_length:
            action_mask.append(0)
            action_label.append(-1)

        # Zero-pad up to the sequence length.
        while len(input_ids) < self.max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)
            lm_label_ids.append(-1)

        assert len(input_ids) == self.max_seq_length
        assert len(input_mask) == self.max_seq_length
        assert len(segment_ids) == self.max_seq_length
        assert len(lm_label_ids) == self.max_seq_length
        assert len(image_mask) == self.max_region_length
        assert len(image_label) == self.max_region_length
        assert len(action_mask) == self.max_action_length
        assert len(action_label) == self.max_action_length

        image_feat, image_loc, image_mask = self.get_image_global_feature(
            image_feat, image_loc, np.array(image_mask))

        features = [
            np.array(input_ids),
            action_feat,
            image_feat,
            image_loc,
            np.array(segment_ids),
            np.array(input_mask),
            image_mask,
            np.array(action_mask),
            np.array(lm_label_ids),
            np.array(action_label),
            np.array(is_next),
            np.array(image_label),
            image_target,
        ]
        results['features'] = features
        return results
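
# NOTE: The sketch below is illustrative only and not part of the pipeline.
# It mirrors the arithmetic in RandomMask.get_image_global_feature: a
# mean-pooled "global" region feature is prepended to the region features,
# with location [0, 0, 1, 1, 1] (the whole image) and mask value 1. The
# tiny shapes here are assumptions chosen for readability.
if __name__ == "__main__":
    feat = np.array([[1., 3.], [5., 7.]], dtype="float32")  # 2 regions, 2-dim
    mask = np.array([1, 1])  # both regions are real (no padding)
    g_feat = np.sum(feat, axis=0) / np.sum(mask, axis=0, keepdims=True)
    feat = np.concatenate([np.expand_dims(g_feat, axis=0), feat], axis=0)
    loc = np.concatenate(
        [np.array([[0, 0, 1, 1, 1]], dtype="float32"),
         np.zeros((2, 5), dtype="float32")], axis=0)
    mask = np.concatenate([np.array([1]), mask], axis=0)
    print(feat)  # first row is the mean of the two region features
    print(feat.shape, loc.shape, mask.shape)  # (3, 2) (3, 5) (3,)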