You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

349 lines
12 KiB
Python

# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
try:
import av
except ImportError as e:
print(
f"Warning! {e}, [av] package and it's dependencies is required for TimeSformer and other models."
)
import cv2
import pickle
import decord as de
import math
import random
from ..registry import PIPELINES
def get_start_end_idx(video_size, clip_size, clip_idx, num_clips):
    """
    Compute the start and end index of a clip inside a video.
    Args:
        video_size: number of frames in the video.
        clip_size: number of frames spanned by one clip.
        clip_idx: -1 selects a random start position; any other value selects
            the start position `clip_idx / num_clips` of the way through the
            available range.
        num_clips: number of clips the video is uniformly divided into
            (only used when clip_idx is not -1).
    return:
        Tuple (start_idx, end_idx) with end_idx = start_idx + clip_size - 1.
        Both values may be fractional.
    """
    # Room left for shifting the clip; zero when the clip is longer than the video.
    slack = max(video_size - clip_size, 0)
    if clip_idx != -1:
        # Uniform temporal sampling: place the clip at its indexed position.
        first = slack * clip_idx / num_clips
    else:
        # Random temporal sampling.
        first = random.uniform(0, slack)
    return first, first + clip_size - 1
@PIPELINES.register()
class VideoDecoder(object):
    """
    Decode a video file to frames.

    The path is read from results['filename']; the decoded frames and their
    count are written back to results['frames'] and results['frames_len'].
    Args:
        backend: decoding library to use: 'cv2', 'decord' or 'pyav'.
        mode: 'train', 'valid' or 'test'; only read by the 'pyav' backend.
        sampling_rate: used with num_seg and target_fps to compute the clip
            length for the 'pyav' backend.
        num_seg: see sampling_rate.
        num_clips: stored on the instance; the 'pyav' backend always uses 1.
        target_fps: see sampling_rate.
    """

    def __init__(self,
                 backend='cv2',
                 mode='train',
                 sampling_rate=32,
                 num_seg=8,
                 num_clips=1,
                 target_fps=30):
        self.backend = backend
        # params below only for TimeSformer
        self.mode = mode
        self.sampling_rate = sampling_rate
        self.num_seg = num_seg
        self.num_clips = num_clips
        self.target_fps = target_fps

    def __call__(self, results):
        """
        Perform mp4 decode operations.
        return:
            The same results object, with 'format', 'backend', 'frames' and
            'frames_len' set ('start_idx' and 'end_idx' too for 'pyav').
        raises:
            NotImplementedError: unknown backend, or unknown mode with 'pyav'.
        """
        file_path = results['filename']
        results['format'] = 'video'
        results['backend'] = self.backend

        if self.backend == 'cv2':
            self._decode_cv2(file_path, results)
        elif self.backend == 'decord':
            self._decode_decord(file_path, results)
        elif self.backend == 'pyav':  # for TimeSformer
            self._decode_pyav(file_path, results)
        else:
            raise NotImplementedError(
                f"Unsupported backend: {self.backend!r}")
        return results

    def _decode_cv2(self, file_path, results):
        """Read every frame with OpenCV and store them with reversed channel order."""
        cap = cv2.VideoCapture(file_path)
        try:
            videolen = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            sampled_frames = []
            for _ in range(videolen):
                ret, frame = cap.read()
                # maybe first frame is empty
                if not ret:
                    continue
                # Reverse the channel axis (OpenCV decodes frames as BGR).
                sampled_frames.append(frame[:, :, ::-1])
        finally:
            # Always free the capture handle, even if reading fails.
            cap.release()
        results['frames'] = sampled_frames
        results['frames_len'] = len(sampled_frames)

    def _decode_decord(self, file_path, results):
        """Store the decord reader itself as the frame source."""
        container = de.VideoReader(file_path)
        results['frames'] = container
        results['frames_len'] = len(container)

    def _decode_pyav(self, file_path, results):
        """Decode one clip (or the whole video) with PyAV."""
        if self.mode in ["train", "valid"]:
            clip_idx = -1  # random temporal position
        elif self.mode in ["test"]:
            clip_idx = 0
        else:
            raise NotImplementedError(f"Unsupported mode: {self.mode!r}")

        num_clips = 1  # always be 1

        container = av.open(file_path)
        try:
            # Raises IndexError when the file has no video stream.
            video_stream = container.streams.video[0]
            fps = float(video_stream.average_rate)
            frames_length = video_stream.frames
            duration = video_stream.duration
            # Clip length expressed in frames of the source video.
            clip_sz = self.sampling_rate * self.num_seg / self.target_fps * fps

            if duration is None or not frames_length:
                # If failed to fetch the decoding information (no duration, or a
                # frame count of 0), decode the entire video.
                decode_all_video = True
                video_start_pts, video_end_pts = 0, math.inf
            else:
                decode_all_video = False
                start_idx, end_idx = get_start_end_idx(frames_length, clip_sz,
                                                       clip_idx, num_clips)
                # Stream time units per frame, used to turn indices into pts.
                timebase = duration / frames_length
                video_start_pts = int(start_idx * timebase)
                video_end_pts = int(end_idx * timebase)

            # Seek a little before the clip start and decode forward from there.
            margin = 1024
            seek_offset = max(video_start_pts - margin, 0)
            container.seek(seek_offset,
                           any_frame=False,
                           backward=True,
                           stream=video_stream)

            tmp_frames = {}
            for frame in container.decode(video=0):
                if frame.pts < video_start_pts:
                    continue
                tmp_frames[frame.pts] = frame
                if frame.pts > video_end_pts:
                    # Keep the first frame past the clip end, then stop.
                    break
            # Order frames by presentation timestamp.
            video_frames = [tmp_frames[pts] for pts in sorted(tmp_frames)]
        finally:
            # Always close the container, even if decoding fails.
            container.close()

        frames = [frame.to_rgb().to_ndarray() for frame in video_frames]
        start_idx, end_idx = get_start_end_idx(
            len(frames),  # frame_len
            clip_sz,
            # If the whole video was decoded: -1 in train and valid, 0 in test;
            # otherwise always 0, as only the clip's frames were decoded.
            clip_idx if decode_all_video else 0,
            1)
        results['frames'] = frames
        results['frames_len'] = len(frames)
        results['start_idx'] = start_idx
        results['end_idx'] = end_idx
@PIPELINES.register()
class FrameDecoder(object):
    """
    Tag a sample with the 'frame' format; nothing is decoded.
    """

    def __init__(self):
        """No configuration is needed."""

    def __call__(self, results):
        """
        Set results['format'] to 'frame'.
        return:
            The same results object that was passed in.
        """
        results['format'] = 'frame'
        return results
@PIPELINES.register()
class MRIDecoder(object):
    """
    Tag a sample with the 'MRI' format; nothing is decoded.
    """

    def __init__(self):
        """No configuration is needed."""

    def __call__(self, results):
        """
        Set results['format'] to 'MRI'.
        return:
            The same results object that was passed in.
        """
        results['format'] = 'MRI'
        return results
@PIPELINES.register()
class FeatureDecoder(object):
    """
    Perform feature decode operations, e.g. youtube8m.
    Args:
        num_classes: length of the one-hot label vector.
        max_len: number of rows every feature array is zero-padded to.
        has_label: whether the pickled record carries a 'label' entry.
    """

    def __init__(self, num_classes, max_len=512, has_label=True):
        self.max_len = max_len
        self.num_classes = num_classes
        self.has_label = has_label

    def __call__(self, results):
        """
        Perform feature decode operations.

        Loads the pickle at results['filename'], dequantizes its rgb and audio
        features, and writes padded data, masks and lengths into results.
        return:
            The same results object with 'rgb_len', 'rgb_data', 'rgb_mask',
            'audio_len', 'audio_data', 'audio_mask' (and 'labels' when
            has_label is True) set.
        raises:
            ValueError: a feature array has more rows than max_len.
        """
        #1. load pkl
        #2. parse to rgb/audio/
        #3. padding
        filepath = results['filename']
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        with open(filepath, 'rb') as pkl_file:
            record = pickle.load(pkl_file, encoding='bytes')

        nframes = self._get_field(record, 'nframes')
        rgb = self._get_field(record, 'feature').astype(float)
        audio = self._get_field(record, 'audio').astype(float)
        if self.has_label:
            label = self._get_field(record, 'label')
            one_hot_label = self.make_one_hot(label, self.num_classes)

        # Keep only the first nframes rows before dequantizing.
        rgb = self.dequantize(rgb[0:nframes, :],
                              max_quantized_value=2.,
                              min_quantized_value=-2.)
        audio = self.dequantize(audio[0:nframes, :],
                                max_quantized_value=2,
                                min_quantized_value=-2)

        if self.has_label:
            results['labels'] = one_hot_label.astype("float32")

        for prefix, feat in (("rgb_", rgb), ("audio_", audio)):
            num_rows = feat.shape[0]
            if num_rows > self.max_len:
                raise ValueError(
                    f"{prefix}feature has {num_rows} rows, "
                    f"which exceeds max_len={self.max_len}")
            results[prefix + 'len'] = num_rows

            #feat pad step 1. padding: zero rows up to max_len
            feat_pad = np.zeros((self.max_len, feat.shape[1]),
                                dtype=np.float32)
            feat_pad[:num_rows] = feat
            results[prefix + 'data'] = feat_pad

            #feat pad step 2. mask: 1 on real rows, 0 on padded rows
            feat_mask = np.zeros((self.max_len, feat.shape[1]),
                                 dtype=np.float32)
            feat_mask[:num_rows] = 1
            results[prefix + 'mask'] = feat_mask

        return results

    @staticmethod
    def _get_field(record, key):
        """Return record[key], falling back to the bytes form of the key."""
        # With encoding='bytes', keys may come back as bytes instead of str.
        return record[key] if key in record else record[key.encode()]

    def dequantize(self,
                   feat_vector,
                   max_quantized_value=2.,
                   min_quantized_value=-2.):
        """
        Dequantize the feature from the byte format to the float format
        Args:
            feat_vector: array of quantized values.
            max_quantized_value: upper end of the dequantized range.
            min_quantized_value: lower end of the dequantized range.
        return:
            feat_vector * scalar + bias, with scalar = range / 255 and
            bias = range / 512 + min_quantized_value.
        """
        assert max_quantized_value > min_quantized_value
        quantized_range = max_quantized_value - min_quantized_value
        scalar = quantized_range / 255.0
        bias = (quantized_range / 512.0) + min_quantized_value

        return feat_vector * scalar + bias

    def make_one_hot(self, label, dim=3862):
        """
        Build a float vector of length dim with 1 at every index in label.
        Args:
            label: iterable of class indices (each converted with int()).
            dim: length of the returned vector.
        """
        one_hot_label = np.zeros(dim, dtype=float)
        for ind in label:
            one_hot_label[int(ind)] = 1
        return one_hot_label
@PIPELINES.register()
class ActionFeatureDecoder(object):
    """
    Perform feature decode operations on footballaction
    Args:
        num_classes: stored on the instance; not read while decoding.
        max_len: number of rows every feature array is zero-padded to.
        has_label: stored on the instance; not read while decoding.
    """

    def __init__(self, num_classes, max_len=512, has_label=True):
        self.max_len = max_len
        self.num_classes = num_classes
        self.has_label = has_label

    def __call__(self, results):
        """
        Perform feature decode operations.

        Loads the pickle at results['filename'] and writes the label, the
        normalized IoU and the padded rgb/audio features into results.
        return:
            The same results object with 'labels', 'iou_norm', 'rgb_len',
            'rgb_data', 'rgb_mask', 'audio_len', 'audio_data' and
            'audio_mask' set.
        raises:
            ValueError: a feature array has more rows than max_len.
        """
        #1. load pkl
        #2. parse to rgb/audio/
        #3. padding
        filepath = results['filename']
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        with open(filepath, 'rb') as pkl_file:
            pkl_data = pickle.load(pkl_file, encoding='bytes')

        rgb = pkl_data['image_feature'].astype(float)
        audio = pkl_data['audio_feature'].astype(float)
        label_id_info = pkl_data['label_info']
        # The record holds a single label, so no random choice between labels
        # is needed.
        label_one = int(label_id_info['label'])
        iou_norm = float(label_id_info['norm_iou'])
        results['labels'] = np.array([label_one])
        results['iou_norm'] = iou_norm

        for prefix, feat in (("rgb_", rgb), ("audio_", audio)):
            num_rows = feat.shape[0]
            if num_rows > self.max_len:
                raise ValueError(
                    f"{prefix}feature has {num_rows} rows, "
                    f"which exceeds max_len={self.max_len}")
            results[prefix + 'len'] = num_rows

            #feat pad step 1. padding: zero rows up to max_len
            feat_pad = np.zeros((self.max_len, feat.shape[1]),
                                dtype=np.float32)
            feat_pad[:num_rows] = feat
            results[prefix + 'data'] = feat_pad

            #feat pad step 2. mask: 1 on real rows, 0 on padded rows
            feat_mask = np.zeros((self.max_len, feat.shape[1]),
                                 dtype=np.float32)
            feat_mask[:num_rows] = 1
            results[prefix + 'mask'] = feat_mask

        return results