# Copyright (c) 2020  PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
try:
    import av
except ImportError as e:
    print(
        f"Warning! {e}, [av] package and it's dependencies is required for TimeSformer and other models."
    )
import cv2
import pickle
import decord as de
import math
import random
from ..registry import PIPELINES


def get_start_end_idx(video_size, clip_size, clip_idx, num_clips):
    """Compute the start and end index of a clip inside a video.

    Args:
        video_size: number of frames available.
        clip_size: number of frames spanned by one clip (may be fractional).
        clip_idx: -1 selects a uniformly random start position; any other
            value selects the start at ``delta * clip_idx / num_clips``.
        num_clips: divisor used when ``clip_idx`` is not -1.

    Returns:
        Tuple ``(start_idx, end_idx)`` where
        ``end_idx = start_idx + clip_size - 1``. Both may be floats.
    """
    # Largest valid start offset; 0 when the clip is longer than the video.
    delta = max(video_size - clip_size, 0)
    if clip_idx == -1:
        # Random temporal sampling.
        start_idx = random.uniform(0, delta)
    else:
        # Uniformly sample the clip with the given index.
        start_idx = delta * clip_idx / num_clips
    end_idx = start_idx + clip_size - 1
    return start_idx, end_idx


def _pad_features(results, prefix, feat, max_len):
    """Zero-pad ``feat`` to ``max_len`` rows and store it in ``results``.

    Writes three entries: ``prefix + 'len'`` (original row count),
    ``prefix + 'data'`` (padded float32 features) and ``prefix + 'mask'``
    (float32 array of ones over the real rows and zeros over the padding).

    Raises:
        ValueError: if ``feat`` has more than ``max_len`` rows.
    """
    num_rows = feat.shape[0]
    results[prefix + 'len'] = num_rows

    pad_rows = max_len - num_rows
    if pad_rows < 0:
        # np.zeros would raise a less helpful ValueError about negative
        # dimensions; fail with the same type but an actionable message.
        raise ValueError(
            f"feature length {num_rows} exceeds max_len {max_len}")

    # Step 1: pad the features with zero rows.
    padding = np.zeros((pad_rows, feat.shape[1]), dtype=np.float32)
    padded = np.concatenate((feat, padding), axis=0)
    results[prefix + 'data'] = padded.astype("float32")

    # Step 2: mask is 1 on real rows and 0 on padded rows.
    ones = np.ones(feat.shape, dtype=np.float32)
    mask = np.concatenate((ones, padding), axis=0)
    results[prefix + 'mask'] = mask.astype("float32")


def _load_pickle(filepath):
    """Load a pickle file with ``encoding='bytes'`` and close the handle.

    NOTE: unpickling can execute arbitrary code; only load trusted files.
    """
    with open(filepath, 'rb') as f:
        return pickle.load(f, encoding='bytes')


@PIPELINES.register()
class VideoDecoder(object):
    """
    Decode a video file into frames.

    Args:
        backend: decoding backend, one of 'cv2', 'decord' or 'pyav'.
        mode: 'train' / 'valid' (random clip) or 'test' (first clip);
            only used by the 'pyav' backend.
        sampling_rate, num_seg, target_fps: only used by the 'pyav' backend,
            where the clip size in frames is
            ``sampling_rate * num_seg / target_fps * fps``.
        num_clips: stored on the instance; the 'pyav' path always uses 1.
    """
    def __init__(self,
                 backend='cv2',
                 mode='train',
                 sampling_rate=32,
                 num_seg=8,
                 num_clips=1,
                 target_fps=30):

        self.backend = backend
        # params below only for TimeSformer
        self.mode = mode
        self.sampling_rate = sampling_rate
        self.num_seg = num_seg
        self.num_clips = num_clips
        self.target_fps = target_fps

    def __call__(self, results):
        """
        Perform video decode operations.

        Reads ``results['filename']`` and fills ``results['format']``,
        ``results['backend']``, ``results['frames']`` and
        ``results['frames_len']`` (plus ``start_idx`` / ``end_idx`` for the
        'pyav' backend).

        return:
            The updated ``results`` dict.

        Raises:
            NotImplementedError: for an unknown backend, or an unknown mode
                with the 'pyav' backend.
        """
        file_path = results['filename']
        results['format'] = 'video'
        results['backend'] = self.backend

        if self.backend == 'cv2':
            self._decode_cv2(file_path, results)
        elif self.backend == 'decord':
            self._decode_decord(file_path, results)
        elif self.backend == 'pyav':  # for TimeSformer
            self._decode_pyav(file_path, results)
        else:
            raise NotImplementedError
        return results

    def _decode_cv2(self, file_path, results):
        """Read every frame with OpenCV and store them in RGB channel order."""
        cap = cv2.VideoCapture(file_path)
        try:
            videolen = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            sampled_frames = []
            for _ in range(videolen):
                ret, frame = cap.read()
                # maybe first frame is empty
                if not ret:
                    continue
                # Reverse the channel axis (OpenCV decodes as BGR).
                sampled_frames.append(frame[:, :, ::-1])
        finally:
            # Release the capture handle even if reading fails.
            cap.release()
        results['frames'] = sampled_frames
        results['frames_len'] = len(sampled_frames)

    def _decode_decord(self, file_path, results):
        """Store a decord VideoReader as the (lazy) frame container."""
        container = de.VideoReader(file_path)
        results['frames'] = container
        results['frames_len'] = len(container)

    def _decode_pyav(self, file_path, results):
        """Decode one clip-sized window (or the whole video) with PyAV."""
        if self.mode in ["train", "valid"]:
            clip_idx = -1
        elif self.mode in ["test"]:
            clip_idx = 0
        else:
            raise NotImplementedError

        num_clips = 1  # always be 1
        container = av.open(file_path)
        try:
            # Raises IndexError when the file has no video stream.
            video_stream = container.streams.video[0]
            fps = float(video_stream.average_rate)
            frames_length = video_stream.frames
            duration = video_stream.duration

            # Clip length expressed in frames of the source video.
            clip_sz = (self.sampling_rate * self.num_seg / self.target_fps *
                       fps)

            if duration is None or not frames_length:
                # If failed to fetch the decoding information (unknown
                # duration, or frame count reported as 0, which would make
                # the timebase division fail), decode the entire video.
                decode_all_video = True
                video_start_pts, video_end_pts = 0, math.inf
            else:
                decode_all_video = False
                start_idx, end_idx = get_start_end_idx(
                    frames_length, clip_sz, clip_idx, num_clips)
                timebase = duration / frames_length
                video_start_pts = int(start_idx * timebase)
                video_end_pts = int(end_idx * timebase)

            # Seek slightly before the window start.
            margin = 1024
            seek_offset = max(video_start_pts - margin, 0)
            container.seek(seek_offset,
                           any_frame=False,
                           backward=True,
                           stream=video_stream)

            tmp_frames = {}
            for frame in container.decode(video=0):
                if frame.pts < video_start_pts:
                    continue
                tmp_frames[frame.pts] = frame
                if frame.pts > video_end_pts:
                    # Keep the first frame past the window end, then stop.
                    break
            video_frames = [tmp_frames[pts] for pts in sorted(tmp_frames)]
        finally:
            # Close the container even if probing or decoding fails.
            container.close()

        frames = [frame.to_rgb().to_ndarray() for frame in video_frames]

        # If decode all video, -1 in train and valid, 0 in test;
        # else, always 0 in train, valid and test, as we have already
        # selected clip-size frames when decoding.
        start_idx, end_idx = get_start_end_idx(
            len(frames), clip_sz, clip_idx if decode_all_video else 0, 1)

        results['frames'] = frames
        results['frames_len'] = len(frames)
        results['start_idx'] = start_idx
        results['end_idx'] = end_idx


@PIPELINES.register()
class FrameDecoder(object):
    """Mark results as pre-extracted frames; no decoding is performed."""
    def __init__(self):
        pass

    def __call__(self, results):
        """Set ``results['format']`` to 'frame' and return ``results``."""
        results['format'] = 'frame'
        return results


@PIPELINES.register()
class MRIDecoder(object):
    """Mark results as MRI data; no decoding is performed."""
    def __init__(self):
        pass

    def __call__(self, results):
        """Set ``results['format']`` to 'MRI' and return ``results``."""
        results['format'] = 'MRI'
        return results


@PIPELINES.register()
class FeatureDecoder(object):
    """
    Perform feature decode operations, e.g. youtube8m.

    Args:
        num_classes: length of the one-hot label vector.
        max_len: number of rows features are zero-padded to.
        has_label: whether the record carries a 'label' entry.
    """
    def __init__(self, num_classes, max_len=512, has_label=True):
        self.max_len = max_len
        self.num_classes = num_classes
        self.has_label = has_label

    @staticmethod
    def _field(record, key):
        """Fetch ``key`` from record, falling back to its bytes form."""
        return record[key] if key in record else record[key.encode()]

    def __call__(self, results):
        """
        Perform feature decode operations.

        Steps: load the pickle at ``results['filename']``, extract the rgb
        and audio features, dequantize them, then pad each to ``max_len``.

        return:
            The updated ``results`` dict with ``rgb_*`` / ``audio_*`` entries
            (``len``, ``data``, ``mask``) and, if ``has_label``, ``labels``.
        """
        record = _load_pickle(results['filename'])

        nframes = self._field(record, 'nframes')
        rgb = self._field(record, 'feature').astype(float)
        audio = self._field(record, 'audio').astype(float)
        if self.has_label:
            label = self._field(record, 'label')
            one_hot_label = self.make_one_hot(label, self.num_classes)

        # Keep only the first nframes rows before dequantizing.
        rgb = self.dequantize(rgb[0:nframes, :],
                              max_quantized_value=2.,
                              min_quantized_value=-2.)
        audio = self.dequantize(audio[0:nframes, :],
                                max_quantized_value=2,
                                min_quantized_value=-2)

        if self.has_label:
            results['labels'] = one_hot_label.astype("float32")

        for prefix, feat in (("rgb_", rgb), ("audio_", audio)):
            _pad_features(results, prefix, feat, self.max_len)

        return results

    def dequantize(self,
                   feat_vector,
                   max_quantized_value=2.,
                   min_quantized_value=-2.):
        """
        Dequantize the feature from the byte format to the float format.

        Returns ``feat_vector * scalar + bias`` with
        ``scalar = range / 255`` and ``bias = range / 512 + min``.
        """
        assert max_quantized_value > min_quantized_value
        quantized_range = max_quantized_value - min_quantized_value
        scalar = quantized_range / 255.0
        bias = (quantized_range / 512.0) + min_quantized_value

        return feat_vector * scalar + bias

    def make_one_hot(self, label, dim=3862):
        """Return a float vector of length ``dim`` with 1 at each label index."""
        one_hot_label = np.zeros(dim, dtype=float)
        for ind in label:
            one_hot_label[int(ind)] = 1
        return one_hot_label


@PIPELINES.register()
class ActionFeatureDecoder(object):
    """
    Perform feature decode operations on footballaction.

    Args:
        num_classes: stored on the instance (not read by ``__call__``).
        max_len: number of rows features are zero-padded to.
        has_label: stored on the instance (not read by ``__call__``).
    """
    def __init__(self, num_classes, max_len=512, has_label=True):
        self.max_len = max_len
        self.num_classes = num_classes
        self.has_label = has_label

    def __call__(self, results):
        """
        Perform feature decode operations.

        Steps: load the pickle at ``results['filename']``, read the image and
        audio features plus label info, then pad each feature to ``max_len``.

        return:
            The updated ``results`` dict with ``labels``, ``iou_norm`` and
            ``rgb_*`` / ``audio_*`` entries (``len``, ``data``, ``mask``).
        """
        pkl_data = _load_pickle(results['filename'])

        rgb = pkl_data['image_feature'].astype(float)
        audio = pkl_data['audio_feature'].astype(float)
        label_id_info = pkl_data['label_info']

        # The record carries a single label; the former random pick between
        # two labels could never trigger on a one-element list.
        label_one = int(label_id_info['label'])
        iou_norm = float(label_id_info['norm_iou'])
        results['labels'] = np.array([label_one])
        results['iou_norm'] = float(iou_norm)

        for prefix, feat in (("rgb_", rgb), ("audio_", audio)):
            _pad_features(results, prefix, feat, self.max_len)

        return results