0808: update project code

V0.1.0
王莹 2 years ago
parent 65cca19d91
commit 530524ff53

@ -0,0 +1,164 @@
import os
import os.path as osp
from paddlevideo.utils.config import get_config
from paddle.inference import Config, create_predictor
from tools.utils import build_inference_helper
class PP_TSMv2_predict(object):
    """Common parameter initialization for the PP-TSMv2 model."""
    def __init__(self, use_gpu=True, ir_optim=True,
                 disable_glog=False, save_name=None, enable_mkldnn=False,
                 precision="fp32", gpu_mem=8000, cpu_threads=None):
        self.use_gpu = use_gpu              # whether to run on GPU
        self.cpu_threads = cpu_threads      # number of CPU threads
        self.ir_optim = ir_optim            # whether to enable IR optimization
        self.disable_glog = disable_glog    # whether to silence glog output
        self.gpu_mem = gpu_mem              # initial GPU memory pool size (MB)
        self.enable_mkldnn = enable_mkldnn  # whether to enable MKL-DNN
        self.precision = precision          # MKL-DNN precision ("fp32"/"fp16")
        self.save_name = save_name          # name of the exported inference model
    def parse_file_paths(self, input_path: str) -> list:
        """
        Collect the input files for the model.
        input_path: a single video file, or a directory of .avi/.mp4 videos.
        """
        if osp.isfile(input_path):
            files = [input_path]
        else:
            files = os.listdir(input_path)
            files = [
                file for file in files
                if (file.endswith(".avi") or file.endswith(".mp4"))
            ]
            files = [osp.join(input_path, file) for file in files]
        return files
    def create_paddle_predictor(self, model_f, pretr_p, cfg):
        """
        Create the inference engine.
        model_f: path to the exported inference model file.
        pretr_p: path to the trained parameter file.
        cfg: model config.
        """
        config = Config(model_f, pretr_p)
        if self.use_gpu:
            config.enable_use_gpu(self.gpu_mem, 0)
        else:
            config.disable_gpu()
            if self.cpu_threads:
                config.set_cpu_math_library_num_threads(self.cpu_threads)
            if self.enable_mkldnn:
                # cache 10 different input shapes to avoid memory leaks
                config.set_mkldnn_cache_capacity(10)
                config.enable_mkldnn()
                if self.precision == "fp16":
                    config.enable_mkldnn_bfloat16()
        config.switch_ir_optim(self.ir_optim)
        config.enable_memory_optim()
        # use zero-copy tensors instead of feed/fetch ops
        config.switch_use_feed_fetch_ops(False)
        if self.disable_glog:
            config.disable_glog_info()
        predictor = create_predictor(config)
        return config, predictor
    def create_inference_model(self, config, model_f, params_f):
        """
        Create the inference helper and engine.
        config: path to the model config file.
        model_f: path to the exported inference model.
        params_f: path to the inference model parameters.
        """
cfg = get_config(config, overrides=None, show=False)
InferenceHelper = build_inference_helper(cfg.INFERENCE)
_, predictor = self.create_paddle_predictor(model_f, params_f, cfg)
return InferenceHelper,predictor
    def predict(self, input_f, batch_size, predictor, InferenceHelper):
        """
        Run inference over the input data.
        input_f: path to the data to be predicted (file or directory).
        batch_size: number of samples per inference batch, default 1.
        predictor: the inference engine.
        InferenceHelper: the inference helper (pre/post-processing).
        """
        result = {}
        # get input and output tensors
input_names = predictor.get_input_names()
output_names = predictor.get_output_names()
input_tensor_list = []
output_tensor_list = []
for item in input_names:
input_tensor_list.append(predictor.get_input_handle(item))
for item in output_names:
output_tensor_list.append(predictor.get_output_handle(item))
        files = self.parse_file_paths(input_f)
        batch_num = batch_size
        for st_idx in range(0, len(files), batch_num):
            ed_idx = min(st_idx + batch_num, len(files))
            # preprocess the input batch
            batched_inputs = InferenceHelper.preprocess_batch(files[st_idx:ed_idx])
            for i in range(len(input_tensor_list)):
                input_tensor_list[i].copy_from_cpu(batched_inputs[i])
            # run the inference engine
            predictor.run()
            batched_outputs = []
            for j in range(len(output_tensor_list)):
                batched_outputs.append(output_tensor_list[j].copy_to_cpu())
            # postprocess the inference outputs; note that `result` keeps only
            # the result of the last processed batch
            res = InferenceHelper.postprocess(batched_outputs, False, True)
            result["video_id"] = res[0]["video_id"]
            result["topk_class"] = res[0]["topk_class"].tolist()[0]
            result["topk_scores"] = res[0]["topk_scores"].tolist()[0]
        return result
# def main():
#     config = 'D:/download/PaddleVideo1/output/output/pptsm_lcnet_k400_16frames_uniform.yaml'  # config file path
#     input_file = 'C:/Users/Administrator/Pictures/video_seg_re_hand/test01_3.avi'  # data to be predicted
#     model_file = 'D:/download/PaddleVideo1/output/output/ppTSMv2.pdmodel'  # inference model path
#     params_file = 'D:/download/PaddleVideo1/output/output/ppTSMv2.pdiparams'  # inference params path
#     batch_size = 1
#     infer, predictor = PP_TSMv2_predict().create_inference_model(config, model_file, params_file)
#     # note: predict() takes (input_f, batch_size, predictor, InferenceHelper)
#     PP_TSMv2_predict().predict(input_file, batch_size, predictor, infer)
# if __name__ == "__main__":
#     main()
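# --- Hedged usage sketch (added for illustration; not in the original commit).
# predict() as written keeps only the result of the last processed batch, so a
# sketch that wants one result per video calls it file-by-file. All paths and
# the helper name `predict_directory` are hypothetical.
def predict_directory(config_path, model_path, params_path, video_dir):
    helper = PP_TSMv2_predict(use_gpu=False)
    infer, predictor = helper.create_inference_model(config_path, model_path,
                                                     params_path)
    results = []
    for video in helper.parse_file_paths(video_dir):
        # one dict per video: {'video_id', 'topk_class', 'topk_scores'}
        results.append(helper.predict(video, 1, predictor, infer))
    return results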

@ -0,0 +1,152 @@
# Copyright 2020 The MediaPipe Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""MediaPipe solution drawing utils."""
import math
from typing import List, Mapping, Optional, Tuple, Union
import cv2
import dataclasses
import matplotlib.pyplot as plt
import numpy as np
from mediapipe.framework.formats import detection_pb2
from mediapipe.framework.formats import location_data_pb2
from mediapipe.framework.formats import landmark_pb2
_PRESENCE_THRESHOLD = 0.5
_VISIBILITY_THRESHOLD = 0.5
_BGR_CHANNELS = 3
WHITE_COLOR = (224, 224, 224)
BLACK_COLOR = (0, 0, 0)
RED_COLOR = (0, 0, 255)
GREEN_COLOR = (0, 128, 0)
BLUE_COLOR = (255, 0, 0)
@dataclasses.dataclass
class DrawingSpec:
# Color for drawing the annotation. Default to the white color.
color: Tuple[int, int, int] = WHITE_COLOR
# Thickness for drawing the annotation. Default to 2 pixels.
thickness: int = 2
# Circle radius. Default to 2 pixels.
circle_radius: int = 2
def _normalized_to_pixel_coordinates(
normalized_x: float, normalized_y: float, image_width: int,
image_height: int) -> Union[None, Tuple[int, int]]:
"""Converts normalized value pair to pixel coordinates."""
# Checks if the float value is between 0 and 1.
def is_valid_normalized_value(value: float) -> bool:
return (value > 0 or math.isclose(0, value)) and (value < 1 or
math.isclose(1, value))
if not (is_valid_normalized_value(normalized_x) and
is_valid_normalized_value(normalized_y)):
# TODO: Draw coordinates even if it's outside of the image bounds.
return None
x_px = min(math.floor(normalized_x * image_width), image_width - 1)
y_px = min(math.floor(normalized_y * image_height), image_height - 1)
return x_px, y_px
def draw_landmarks(
image: np.ndarray,
landmark_list: landmark_pb2.NormalizedLandmarkList,
connections: Optional[List[Tuple[int, int]]] = None):
"""Draws the landmarks and the connections on the image.
Args:
image: A three channel BGR image represented as numpy ndarray.
landmark_list: A normalized landmark list proto message to be annotated on
the image.
connections: A list of landmark index tuples that specifies how landmarks to
be connected in the drawing.
landmark_drawing_spec: Either a DrawingSpec object or a mapping from hand
landmarks to the DrawingSpecs that specifies the landmarks' drawing
settings such as color, line thickness, and circle radius. If this
argument is explicitly set to None, no landmarks will be drawn.
connection_drawing_spec: Either a DrawingSpec object or a mapping from hand
connections to the DrawingSpecs that specifies the connections' drawing
settings such as color and line thickness. If this argument is explicitly
set to None, no landmark connections will be drawn.
Raises:
ValueError: If one of the followings:
a) If the input image is not three channel BGR.
b) If any connetions contain invalid landmark index.
"""
if not landmark_list:
return
if image.shape[2] != _BGR_CHANNELS:
raise ValueError('Input image must contain three channel bgr data.')
image_rows, image_cols, _ = image.shape
  # Map every visible landmark index to its pixel coordinates.
  idx_to_coordinates = {}
  for idx, landmark in enumerate(landmark_list.landmark):
    if ((landmark.HasField('visibility') and
         landmark.visibility < _VISIBILITY_THRESHOLD) or
        (landmark.HasField('presence') and
         landmark.presence < _PRESENCE_THRESHOLD)):
      continue
    landmark_px = _normalized_to_pixel_coordinates(landmark.x, landmark.y,
                                                   image_cols, image_rows)
    if landmark_px:
      idx_to_coordinates[idx] = landmark_px
  if connections:
    # Keep the end landmark of each connection and return the pixel
    # coordinates of those endpoints that are visible.
    end_list = [connection[1] for connection in connections]
    point_axis_list = []
    for point in end_list:
      if point in idx_to_coordinates:
        point_axis_list.append(idx_to_coordinates[point])
    return point_axis_list
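# --- Hedged usage sketch (added for illustration; not in the original file).
# draw_landmarks above returns pixel coordinates rather than drawing. A
# minimal call with MediaPipe Holistic might look like this; the image path
# is a hypothetical placeholder.
if __name__ == '__main__':
  import mediapipe as mp
  holistic = mp.solutions.holistic.Holistic(static_image_mode=True)
  image = cv2.imread('frame.jpg')  # hypothetical input image
  results = holistic.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
  points = draw_landmarks(image, results.right_hand_landmarks,
                          mp.solutions.holistic.HAND_CONNECTIONS)
  print(points)  # list of (x_px, y_px) tuples, or None if nothing detected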

@ -0,0 +1,104 @@
import cv2
import mediapipe as mp
import analysisPoint as mp_drawing
mp_holistic = mp.solutions.holistic
import numpy as np
class MediapipeProcess:

    @staticmethod
    def mediapipe_det(image, holistic):
        '''
        Run the holistic model on one frame and return the detection results.
        '''
        image.flags.writeable = False
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        results = holistic.process(image)
        return results
    @staticmethod
    def get_analysis_result(image, results):
        '''
        image: the frame that was detected.
        results: the detection results for that frame.
        Turn the results into face / hand bounding boxes.
        '''
        image.flags.writeable = True
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
face_result = mp_drawing.draw_landmarks(
image,
results.face_landmarks,
mp_holistic.FACEMESH_CONTOURS)
right_hand_result = mp_drawing.draw_landmarks(
image,
results.right_hand_landmarks,
mp_holistic.HAND_CONNECTIONS)
left_hand_result = mp_drawing.draw_landmarks(
image,
results.left_hand_landmarks,
mp_holistic.HAND_CONNECTIONS)
face_bbox = MediapipeProcess.point_to_bbox(face_result)
right_hand_bbox = MediapipeProcess.point_to_bbox(right_hand_result)
left_hand_bbox = MediapipeProcess.point_to_bbox(left_hand_result)
result_dict = {'face_bbox':[face_bbox],'hand_bbox':[right_hand_bbox,left_hand_bbox]}
return result_dict
    @staticmethod
    def point_to_bbox(result_list):
        '''
        Compute the axis-aligned bounding box of a list of keypoints.
        Returns [x_min, y_min, x_max, y_max], or None if no points were given.
        '''
        if not result_list:  # guards against None / empty keypoint lists
            return None
        # minAreaRect expects float32/int32 points and returns the rotated
        # rectangle as ((cx, cy), (w, h), angle); boxPoints yields its 4 corners
        result_array = np.array(result_list, dtype=np.float32)
        rect = cv2.minAreaRect(result_array)
        bbox = cv2.boxPoints(rect)
        bbox = bbox.astype(np.int32).tolist()
        left_top = [min(bbox, key=lambda p: p[0])[0], min(bbox, key=lambda p: p[1])[1]]
        right_bottom = [max(bbox, key=lambda p: p[0])[0], max(bbox, key=lambda p: p[1])[1]]
        bbox_list = left_top + right_bottom
        return bbox_list
# if __name__ == '__main__':
#     media_holistic(video_file='E:/Bank_files/Bank_02/dataset/video_person/after_1/0711-1_199_0.avi',
#                    video_save_path='E:/Bank_files/Bank_02/videos_mediapipe/test_data/0725_test')
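# --- Hedged usage sketch (added for illustration; not in the original file).
# Detect one frame and print the face / hand bounding boxes; the video path
# is a hypothetical placeholder.
if __name__ == '__main__':
    cap = cv2.VideoCapture('test.avi')
    ok, frame = cap.read()
    cap.release()
    with mp_holistic.Holistic(min_detection_confidence=0.5,
                              min_tracking_confidence=0.5) as holistic:
        if ok:
            results = MediapipeProcess.mediapipe_det(frame, holistic)
            bboxes = MediapipeProcess.get_analysis_result(frame, results)
            print(bboxes)  # {'face_bbox': [...], 'hand_bbox': [..., ...]}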

@ -0,0 +1,15 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .version import paddlevideo_version

@ -0,0 +1,22 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .builder import build_dataset, build_dataloader, build_batch_pipeline
from .dataset import VideoDataset
from .dali_loader import TSN_Dali_loader, get_input_data
__all__ = [
'build_dataset', 'build_dataloader', 'build_batch_pipeline', 'VideoDataset',
'TSN_Dali_loader', 'get_input_data'
]

@ -0,0 +1,132 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import signal
import os
import paddle
from paddle.io import DataLoader, DistributedBatchSampler
from .registry import DATASETS, PIPELINES
from ..utils.build_utils import build
from .pipelines.compose import Compose
from paddlevideo.utils import get_logger
from paddlevideo.utils.multigrid import DistributedShortSampler
import numpy as np
logger = get_logger("paddlevideo")
def build_pipeline(cfg):
    """Build pipeline.
    Args:
        cfg (dict): root config dict.
    """
    if cfg is None:
        return
    return Compose(cfg)
def build_dataset(cfg):
"""Build dataset.
Args:
cfg (dict): root config dict.
Returns:
dataset: dataset.
"""
#XXX: ugly code here!
cfg_dataset, cfg_pipeline = cfg
cfg_dataset.pipeline = build_pipeline(cfg_pipeline)
dataset = build(cfg_dataset, DATASETS, key="format")
return dataset
def build_batch_pipeline(cfg):
batch_pipeline = build(cfg, PIPELINES)
return batch_pipeline
def build_dataloader(dataset,
batch_size,
num_workers,
places,
shuffle=True,
drop_last=True,
multigrid=False,
collate_fn_cfg=None,
**kwargs):
"""Build Paddle Dataloader.
XXX explain how the dataloader work!
Args:
dataset (paddle.dataset): A PaddlePaddle dataset object.
batch_size (int): batch size on single card.
num_worker (int): num_worker
shuffle(bool): whether to shuffle the data at every epoch.
"""
if multigrid:
sampler = DistributedShortSampler(dataset,
batch_sizes=batch_size,
shuffle=True,
drop_last=True)
else:
sampler = DistributedBatchSampler(dataset,
batch_size=batch_size,
shuffle=shuffle,
drop_last=drop_last)
    #NOTE(shipping): when a mix operator such as mixup or cutmix is switched on,
    # a batch like [[img, label, attribute, ...], [img, label, attribute, ...], ...] is recollated to
    # [[img, img, ...], [label, label, ...], [attribute, attribute, ...], ...], like a numpy transpose.
def mix_collate_fn(batch):
pipeline = build_batch_pipeline(collate_fn_cfg)
batch = pipeline(batch)
slots = []
for items in batch:
for i, item in enumerate(items):
if len(slots) < len(items):
slots.append([item])
else:
slots[i].append(item)
return [np.stack(slot, axis=0) for slot in slots]
#if collate_fn_cfg is not None:
#ugly code here. collate_fn is mix op config
# collate_fn = mix_collate_fn(collate_fn_cfg)
data_loader = DataLoader(
dataset,
batch_sampler=sampler,
places=places,
num_workers=num_workers,
collate_fn=mix_collate_fn if collate_fn_cfg is not None else None,
return_list=True,
**kwargs)
return data_loader
def term_mp(sig_num, frame):
""" kill all child processes
"""
pid = os.getpid()
pgid = os.getpgid(os.getpid())
logger.info("main proc {} exit, kill process group " "{}".format(pid, pgid))
os.killpg(pgid, signal.SIGKILL)
return
signal.signal(signal.SIGINT, term_mp)
signal.signal(signal.SIGTERM, term_mp)
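# --- Hedged illustration (added; not in the original file). The mix-style
# recollation inside build_dataloader regroups a batch of per-sample tuples
# into per-field arrays, i.e. a transpose over the batch dimension. The same
# slot logic as a standalone sketch:
def _recollate(batch):
    """[[img0, lbl0], [img1, lbl1]] -> [stack(img0, img1), stack(lbl0, lbl1)]"""
    slots = []
    for items in batch:
        for i, item in enumerate(items):
            if len(slots) < len(items):
                slots.append([item])
            else:
                slots[i].append(item)
    return [np.stack(slot, axis=0) for slot in slots]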

@ -0,0 +1,206 @@
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
import math
import paddle
from paddle.distributed import ParallelEnv
import paddle.distributed as dist
from paddlevideo.utils import get_logger
logger = get_logger("paddlevideo")
try:
    from nvidia.dali.pipeline import Pipeline
    import nvidia.dali.ops as ops
    import nvidia.dali.types as types
    import tempfile
    from nvidia.dali.plugin.paddle import DALIGenericIterator
except ImportError:
    # allow importing this module without DALI installed
    Pipeline = object
def get_input_data(data):
return paddle.to_tensor(data[0]['image']), paddle.to_tensor(
data[0]['label'])
class TSN_Dali_loader(object):
def __init__(self, cfg):
self.batch_size = cfg.batch_size
self.file_path = cfg.file_path
self.num_seg = cfg.num_seg
self.seglen = cfg.seglen
self.short_size = cfg.short_size
self.target_size = cfg.target_size
# set num_shards and shard_id when distributed training is implemented
self.num_shards = dist.get_world_size()
self.shard_id = ParallelEnv().local_rank
self.dali_mean = cfg.mean * (self.num_seg * self.seglen)
self.dali_std = cfg.std * (self.num_seg * self.seglen)
def build_dali_reader(self):
"""
build dali training reader
"""
def reader_():
with open(self.file_path) as flist:
full_lines = [line for line in flist]
if (not hasattr(reader_, 'seed')):
reader_.seed = 0
random.Random(reader_.seed).shuffle(full_lines)
logger.info(f"reader shuffle seed: {reader_.seed}.")
if reader_.seed is not None:
reader_.seed += 1
per_node_lines = int(
math.ceil(len(full_lines) * 1.0 / self.num_shards))
total_lines = per_node_lines * self.num_shards
                # pad full_lines so that it divides evenly across the shards
full_lines += full_lines[:(total_lines - len(full_lines))]
assert len(full_lines) == total_lines
# trainer get own sample
lines = full_lines[self.shard_id:total_lines:self.num_shards]
assert len(lines) == per_node_lines
logger.info(
f"shard_id: {self.shard_id}, trainer_count: {self.num_shards}"
)
logger.info(
f"read videos from {self.shard_id * per_node_lines}, "
f"length: {per_node_lines}, "
f"lines length: {len(lines)}, "
f"total: {len(full_lines)}")
video_files = ''.join([item for item in lines])
tf = tempfile.NamedTemporaryFile()
tf.write(str.encode(video_files))
tf.flush()
video_files = tf.name
device_id = ParallelEnv().local_rank
logger.info(f'---------- device_id: {device_id} -----------')
pipe = VideoPipe(batch_size=self.batch_size,
num_threads=1,
device_id=device_id,
file_list=video_files,
sequence_length=self.num_seg * self.seglen,
num_seg=self.num_seg,
seg_length=self.seglen,
resize_shorter_scale=self.short_size,
crop_target_size=self.target_size,
is_training=True,
num_shards=self.num_shards,
shard_id=self.shard_id,
dali_mean=self.dali_mean,
dali_std=self.dali_std)
            logger.info(
                'initializing dataset, this may take several minutes if the dataset is large ....'
            )
video_loader = DALIGenericIterator([pipe], ['image', 'label'],
len(lines),
dynamic_shape=True,
auto_reset=True)
return video_loader
dali_reader = reader_()
return dali_reader
class VideoPipe(Pipeline):
def __init__(self,
batch_size,
num_threads,
device_id,
file_list,
sequence_length,
num_seg,
seg_length,
resize_shorter_scale,
crop_target_size,
is_training=False,
initial_prefetch_size=20,
num_shards=1,
shard_id=0,
dali_mean=0.,
dali_std=1.0):
super(VideoPipe, self).__init__(batch_size, num_threads, device_id)
self.input = ops.VideoReader(device="gpu",
file_list=file_list,
sequence_length=sequence_length,
num_seg=num_seg,
seg_length=seg_length,
is_training=is_training,
num_shards=num_shards,
shard_id=shard_id,
random_shuffle=is_training,
initial_fill=initial_prefetch_size)
        # the sequence data read by ops.VideoReader is of shape [F, H, W, C].
        # Because ops.Resize does not support sequence data, it is transposed
        # to [H, W, F, C], reshaped to [H, W, F*C], and then resized like a
        # 2-D image.
self.transpose = ops.Transpose(device="gpu", perm=[1, 2, 0, 3])
self.reshape = ops.Reshape(device="gpu",
rel_shape=[1.0, 1.0, -1],
layout='HWC')
self.resize = ops.Resize(device="gpu",
resize_shorter=resize_shorter_scale)
        # crop and mirror are applied by ops.CropMirrorNormalize.
        # Normalization is left to Paddle because of the difficulty of
        # dimension broadcasting; it is unclear whether DALI broadcasts
        # correctly here, so the Paddle op is used instead.
self.pos_rng_x = ops.Uniform(range=(0.0, 1.0))
self.pos_rng_y = ops.Uniform(range=(0.0, 1.0))
self.mirror_generator = ops.Uniform(range=(0.0, 1.0))
self.cast_mirror = ops.Cast(dtype=types.DALIDataType.INT32)
self.crop_mirror_norm = ops.CropMirrorNormalize(
device="gpu",
crop=[crop_target_size, crop_target_size],
mean=dali_mean,
std=dali_std)
self.reshape_back = ops.Reshape(
device="gpu",
shape=[num_seg, seg_length * 3, crop_target_size, crop_target_size],
layout='FCHW')
self.cast_label = ops.Cast(device="gpu", dtype=types.DALIDataType.INT64)
def define_graph(self):
output, label = self.input(name="Reader")
output = self.transpose(output)
output = self.reshape(output)
output = self.resize(output)
output = output / 255.
pos_x = self.pos_rng_x()
pos_y = self.pos_rng_y()
mirror_flag = self.mirror_generator()
mirror_flag = (mirror_flag > 0.5)
mirror_flag = self.cast_mirror(mirror_flag)
output = self.crop_mirror_norm(output,
crop_pos_x=pos_x,
crop_pos_y=pos_y,
mirror=mirror_flag)
output = self.reshape_back(output)
label = self.cast_label(label)
return output, label
def __len__(self):
return self.epoch_size()
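# --- Hedged usage sketch (added; not in the original file). Building the
# reader requires an NVIDIA GPU and the nvidia-dali packages; the config
# values below are hypothetical placeholders.
if __name__ == '__main__':
    from types import SimpleNamespace
    cfg = SimpleNamespace(batch_size=4, file_path='train.list', num_seg=8,
                          seglen=1, short_size=256, target_size=224,
                          mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    loader = TSN_Dali_loader(cfg).build_dali_reader()
    for data in loader:
        imgs, labels = get_input_data(data)
        print(imgs.shape, labels.shape)
        break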

@ -0,0 +1,109 @@
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os.path as osp
import copy
import random
import numpy as np
from ..registry import DATASETS
from .base import BaseDataset
from ...utils import get_logger
logger = get_logger("paddlevideo")
@DATASETS.register()
class MRIDataset(BaseDataset):
"""Rawframe dataset for action recognition.
The dataset loads raw frames from frame files, and apply specified transform operatation them.
The indecx file is a text file with multiple lines, and each line indicates the directory of frames of a video, toatl frames of the video, and its label, which split with a whitespace.
Example of an index file:
.. code-block:: txt
file_path-1 150 1
file_path-2 160 1
file_path-3 170 2
file_path-4 180 2
Args:
file_path (str): Path to the index file.
pipeline(XXX):
data_prefix (str): directory path of the data. Default: None.
test_mode (bool): Whether to bulid the test dataset. Default: False.
suffix (str): suffix of file. Default: 'img_{:05}.jpg'.
"""
def __init__(self,
file_path,
pipeline,
num_retries=5,
data_prefix=None,
test_mode=False,
suffix='img_{:05}.jpg'):
self.num_retries = num_retries
self.suffix = suffix
super().__init__(file_path, pipeline, data_prefix, test_mode)
def load_file(self):
"""Load index file to get video information."""
info = []
with open(self.file_path, 'r') as fin:
for line in fin:
line_split = line.strip().split()
frame_dir, frames_len, labels = line_split
if self.data_prefix is not None:
frame_dir = osp.join(self.data_prefix, frame_dir)
info.append(
dict(
frame_dir=frame_dir,
#suffix=self.suffix,
frames_len=frames_len,
labels=int(labels)))
return info
def prepare_train(self, idx):
"""Prepare the frames for training/valid gisven index. """
#Try to catch Exception caused by reading missing frames files
for ir in range(self.num_retries):
try:
results = copy.deepcopy(self.info[idx])
results = self.pipeline(results)
except Exception as e:
#logger.info(e)
if ir < self.num_retries - 1:
logger.info(
"Error when loading {}, have {} trys, will try again".
format(results['frame_dir'], ir))
idx = random.randint(0, len(self.info) - 1)
continue
return np.array(results['imgs']), np.array([results['labels']])
def prepare_test(self, idx):
"""Prepare the frames for test given index. """
#Try to catch Exception caused by reading missing frames files
for ir in range(self.num_retries):
try:
results = copy.deepcopy(self.info[idx])
results = self.pipeline(results)
except Exception as e:
#logger.info(e)
if ir < self.num_retries - 1:
logger.info(
"Error when loading {}, have {} trys, will try again".
format(results['frame_dir'], ir))
idx = random.randint(0, len(self.info) - 1)
continue
return np.array(results['imgs']), np.array([results['labels']])
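# --- Hedged usage sketch (added; not in the original file). The index file
# has one "<frame_dir> <frames_len> <label>" line per video; the pipeline is
# whatever transform Compose the config builds, shown here as a pass-through
# placeholder, so only load_file/__len__ are exercised. The path is
# hypothetical.
if __name__ == '__main__':
    dataset = MRIDataset(file_path='train.list',
                         pipeline=lambda results: results)
    print(len(dataset), dataset.info[0]['frame_dir'])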

@ -0,0 +1,111 @@
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os.path as osp
import copy
import random
import numpy as np
from ..registry import DATASETS
from .base import BaseDataset
from ...utils import get_logger
logger = get_logger("paddlevideo")
@DATASETS.register()
class SFMRIDataset(BaseDataset):
"""Rawframe dataset for action recognition.
The dataset loads raw frames from frame files, and apply specified transform operatation them.
The indecx file is a text file with multiple lines, and each line indicates the directory of frames of a video, toatl frames of the video, and its label, which split with a whitespace.
Example of an index file:
.. code-block:: txt
file_path-1 150 1
file_path-2 160 1
file_path-3 170 2
file_path-4 180 2
Args:
file_path (str): Path to the index file.
pipeline(XXX):
data_prefix (str): directory path of the data. Default: None.
test_mode (bool): Whether to bulid the test dataset. Default: False.
suffix (str): suffix of file. Default: 'img_{:05}.jpg'.
"""
def __init__(self,
file_path,
pipeline,
num_retries=5,
data_prefix=None,
test_mode=False,
suffix='img_{:05}.jpg'):
self.num_retries = num_retries
self.suffix = suffix
super().__init__(file_path, pipeline, data_prefix, test_mode)
def load_file(self):
"""Load index file to get video information."""
info = []
with open(self.file_path, 'r') as fin:
for line in fin:
line_split = line.strip().split()
frame_dir, frames_len, labels = line_split
if self.data_prefix is not None:
frame_dir = osp.join(self.data_prefix, frame_dir)
info.append(
dict(
frame_dir=frame_dir,
#suffix=self.suffix,
frames_len=frames_len,
labels=int(labels)))
return info
def prepare_train(self, idx):
"""Prepare the frames for training/valid gisven index. """
#Try to catch Exception caused by reading missing frames files
for ir in range(self.num_retries):
try:
results = copy.deepcopy(self.info[idx])
results = self.pipeline(results)
except Exception as e:
#logger.info(e)
if ir < self.num_retries - 1:
logger.info(
"Error when loading {}, have {} trys, will try again".
format(results['frame_dir'], ir))
idx = random.randint(0, len(self.info) - 1)
continue
return np.array(results['imgs'][0]), np.array(
results['imgs'][1]), np.array([results['labels']])
def prepare_test(self, idx):
"""Prepare the frames for test given index. """
#Try to catch Exception caused by reading missing frames files
for ir in range(self.num_retries):
try:
results = copy.deepcopy(self.info[idx])
results = self.pipeline(results)
except Exception as e:
#logger.info(e)
if ir < self.num_retries - 1:
logger.info(
"Error when loading {}, have {} trys, will try again".
format(results['frame_dir'], ir))
idx = random.randint(0, len(self.info) - 1)
continue
return np.array(results['imgs'][0]), np.array(
results['imgs'][1]), np.array([results['labels']])

@ -0,0 +1,41 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .actbert_dataset import ActBertDataset
from .ava_dataset import AVADataset
from .bmn_dataset import BMNDataset
from .davis_dataset import DavisDataset
from .feature import FeatureDataset
from .frame import FrameDataset, FrameDataset_Sport
from .MRI import MRIDataset
from .MRI_SlowFast import SFMRIDataset
from .msrvtt import MSRVTTDataset
from .asrf_dataset import ASRFDataset
from .ms_tcn_dataset import MSTCNDataset
from .oxford import MonoDataset
from .skeleton import SkeletonDataset
from .slowfast_video import SFVideoDataset
from .video import VideoDataset
from .ucf101_skeleton import UCF101SkeletonDataset
from .ucf24_dataset import UCF24Dataset
__all__ = [
'VideoDataset', 'FrameDataset', 'SFVideoDataset', 'BMNDataset',
'FeatureDataset', 'SkeletonDataset', 'AVADataset', 'MonoDataset',
'MSRVTTDataset', 'ActBertDataset', 'DavisDataset', 'MRIDataset',
'SFMRIDataset', 'FrameDataset_Sport', 'MSTCNDataset', 'ASRFDataset',
'UCF101SkeletonDataset', 'UCF24Dataset'
]

@ -0,0 +1,74 @@
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os.path as osp
import copy
import random
import numpy as np
try:
import lmdb
except ImportError as e:
    print(
        f"Warning! {e}, the [lmdb] package and its dependencies are required for ActBERT."
    )
import pickle
import json
try:
from paddlenlp.transformers import BertTokenizer
except ImportError as e:
    print(
        f"Warning! {e}, the [paddlenlp] package and its dependencies are required for ActBERT."
    )
from ..registry import DATASETS
from .base import BaseDataset
from ...utils import get_logger
logger = get_logger("paddlevideo")
@DATASETS.register()
class ActBertDataset(BaseDataset):
"""ActBert dataset.
"""
def __init__(
self,
file_path,
pipeline,
bert_model="bert-base-uncased",
data_prefix=None,
test_mode=False,
):
self.bert_model = bert_model
super().__init__(file_path, pipeline, data_prefix, test_mode)
def load_file(self):
"""Load index file to get video information."""
feature_data = np.load(self.file_path, allow_pickle=True)
self.tokenizer = BertTokenizer.from_pretrained(self.bert_model,
do_lower_case=True)
self.info = []
for item in feature_data:
self.info.append(dict(feature=item, tokenizer=self.tokenizer))
return self.info
def prepare_train(self, idx):
"""Prepare the frames for training/valid given index. """
results = copy.deepcopy(self.info[idx])
#print('==results==', results)
results = self.pipeline(results)
return results['features']
def prepare_test(self, idx):
"""Prepare the frames for test given index. """
pass

@ -0,0 +1,104 @@
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import os
import numpy as np
from ..registry import DATASETS
from .base import BaseDataset
from ...utils import get_logger
logger = get_logger("paddlevideo")
@DATASETS.register()
class ASRFDataset(BaseDataset):
"""Video dataset for action segmentation.
"""
def __init__(
self,
file_path,
pipeline,
feature_path,
label_path,
boundary_path,
**kwargs,
):
super().__init__(file_path, pipeline, **kwargs)
self.label_path = label_path
self.boundary_path = boundary_path
self.feature_path = feature_path
    def load_file(self):
        """Load index file to get video information."""
        with open(self.file_path, 'r') as file_ptr:
            info = file_ptr.read().split('\n')[:-1]
        return info
def prepare_train(self, idx):
"""TRAIN & VALID: Prepare data for training/valid given the index."""
results = {}
video_name = self.info[idx]
# load video feature
file_name = video_name.split('.')[0] + ".npy"
feat_file_path = os.path.join(self.feature_path, file_name)
#TODO: check path
video_feat = np.load(feat_file_path)
# load label
file_name = video_name.split('.')[0] + ".npy"
label_file_path = os.path.join(self.label_path, file_name)
label = np.load(label_file_path).astype(np.int64)
# load boundary
file_name = video_name.split('.')[0] + ".npy"
boundary_file_path = os.path.join(self.boundary_path, file_name)
boundary = np.expand_dims(np.load(boundary_file_path),axis=0).astype(np.float32)
results['video_feat'] = copy.deepcopy(video_feat)
results['video_label'] = copy.deepcopy(label)
results['video_boundary'] = copy.deepcopy(boundary)
results = self.pipeline(results)
return results['video_feat'], results['video_label'], results['video_boundary']
def prepare_test(self, idx):
"""TEST: Prepare the data for test given the index."""
results = {}
video_name = self.info[idx]
# load video feature
file_name = video_name.split('.')[0] + ".npy"
feat_file_path = os.path.join(self.feature_path, file_name)
#TODO: check path
video_feat = np.load(feat_file_path)
# load label
file_name = video_name.split('.')[0] + ".npy"
label_file_path = os.path.join(self.label_path, file_name)
label = np.load(label_file_path).astype(np.int64)
# load boundary
file_name = video_name.split('.')[0] + ".npy"
boundary_file_path = os.path.join(self.boundary_path, file_name)
boundary = np.expand_dims(np.load(boundary_file_path),axis=0).astype(np.float32)
results['video_feat'] = copy.deepcopy(video_feat)
results['video_label'] = copy.deepcopy(label)
results['video_boundary'] = copy.deepcopy(boundary)
results = self.pipeline(results)
return results['video_feat'], results['video_label'], results['video_boundary']

@ -0,0 +1,249 @@
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os.path as osp
import copy
import random
import numpy as np
import sys
import os
import pickle
from datetime import datetime
from ...metrics.ava_utils import ava_evaluate_results
from ..registry import DATASETS
from .base import BaseDataset
from collections import defaultdict
@DATASETS.register()
class AVADataset(BaseDataset):
"""AVA dataset for spatial temporal detection.
the dataset loads raw frames, bounding boxes, proposals and applies
transformations to return the frame tensors and other information.
"""
_FPS = 30
def __init__(self,
pipeline,
file_path=None,
exclude_file=None,
label_file=None,
suffix='{:05}.jpg',
proposal_file=None,
person_det_score_thr=0.9,
num_classes=81,
data_prefix=None,
test_mode=False,
num_max_proposals=1000,
timestamp_start=900,
timestamp_end=1800):
self.custom_classes = None
self.exclude_file = exclude_file
self.label_file = label_file
self.proposal_file = proposal_file
        assert 0 <= person_det_score_thr <= 1, (
            'The value of '
            'person_det_score_thr should be in [0, 1]. ')
self.person_det_score_thr = person_det_score_thr
self.num_classes = num_classes
self.suffix = suffix
self.num_max_proposals = num_max_proposals
self.timestamp_start = timestamp_start
self.timestamp_end = timestamp_end
super().__init__(
file_path,
pipeline,
data_prefix,
test_mode,
)
if self.proposal_file is not None:
self.proposals = self._load(self.proposal_file)
else:
self.proposals = None
if not test_mode:
valid_indexes = self.filter_exclude_file()
            self.info = [self.info[i] for i in valid_indexes]
    def _load(self, path):
        with open(path, 'rb') as f:
            res = pickle.load(f)
        return res
def parse_img_record(self, img_records):
bboxes, labels, entity_ids = [], [], []
while len(img_records) > 0:
img_record = img_records[0]
num_img_records = len(img_records)
selected_records = list(
filter(
lambda x: np.array_equal(x['entity_box'], img_record[
'entity_box']), img_records))
num_selected_records = len(selected_records)
img_records = list(
filter(
lambda x: not np.array_equal(x['entity_box'], img_record[
'entity_box']), img_records))
assert len(img_records) + num_selected_records == num_img_records
bboxes.append(img_record['entity_box'])
valid_labels = np.array([
selected_record['label'] for selected_record in selected_records
])
label = np.zeros(self.num_classes, dtype=np.float32)
label[valid_labels] = 1.
labels.append(label)
entity_ids.append(img_record['entity_id'])
bboxes = np.stack(bboxes)
labels = np.stack(labels)
entity_ids = np.stack(entity_ids)
return bboxes, labels, entity_ids
def filter_exclude_file(self):
valid_indexes = []
if self.exclude_file is None:
valid_indexes = list(range(len(self.info)))
else:
exclude_video_infos = [
x.strip().split(',') for x in open(self.exclude_file)
]
for i, video_info in enumerate(self.info):
valid_indexes.append(i)
for video_id, timestamp in exclude_video_infos:
if (video_info['video_id'] == video_id
and video_info['timestamp'] == int(timestamp)):
valid_indexes.pop()
break
return valid_indexes
def load_file(self):
"""Load index file to get video information."""
info = []
records_dict_by_img = defaultdict(list)
with open(self.file_path, 'r') as fin:
for line in fin:
line_split = line.strip().split(',')
video_id = line_split[0]
timestamp = int(line_split[1])
img_key = f'{video_id},{timestamp:04d}'
entity_box = np.array(list(map(float, line_split[2:6])))
label = int(line_split[6])
entity_id = int(line_split[7])
shot_info = (0, (self.timestamp_end - self.timestamp_start) *
self._FPS)
video_info = dict(video_id=video_id,
timestamp=timestamp,
entity_box=entity_box,
label=label,
entity_id=entity_id,
shot_info=shot_info)
records_dict_by_img[img_key].append(video_info)
for img_key in records_dict_by_img:
video_id, timestamp = img_key.split(',')
bboxes, labels, entity_ids = self.parse_img_record(
records_dict_by_img[img_key])
ann = dict(gt_bboxes=bboxes,
gt_labels=labels,
entity_ids=entity_ids)
frame_dir = video_id
if self.data_prefix is not None:
frame_dir = osp.join(self.data_prefix, frame_dir)
video_info = dict(frame_dir=frame_dir,
video_id=video_id,
timestamp=int(timestamp),
img_key=img_key,
shot_info=shot_info,
fps=self._FPS,
ann=ann)
info.append(video_info)
return info
def prepare_train(self, idx):
results = copy.deepcopy(self.info[idx])
img_key = results['img_key']
results['suffix'] = self.suffix
results['timestamp_start'] = self.timestamp_start
results['timestamp_end'] = self.timestamp_end
if self.proposals is not None:
if img_key not in self.proposals:
results['proposals'] = np.array([[0, 0, 1, 1]])
results['scores'] = np.array([1])
else:
proposals = self.proposals[img_key]
assert proposals.shape[-1] in [4, 5]
if proposals.shape[-1] == 5:
thr = min(self.person_det_score_thr, max(proposals[:, 4]))
positive_inds = (proposals[:, 4] >= thr)
proposals = proposals[positive_inds]
proposals = proposals[:self.num_max_proposals]
results['proposals'] = proposals[:, :4]
results['scores'] = proposals[:, 4]
else:
proposals = proposals[:self.num_max_proposals]
results['proposals'] = proposals
ann = results.pop('ann')
results['gt_bboxes'] = ann['gt_bboxes']
results['gt_labels'] = ann['gt_labels']
results['entity_ids'] = ann['entity_ids']
#ret = self.pipeline(results, "")
ret = self.pipeline(results)
#padding for dataloader
len_proposals = ret['proposals'].shape[0]
len_gt_bboxes = ret['gt_bboxes'].shape[0]
len_gt_labels = ret['gt_labels'].shape[0]
len_scores = ret['scores'].shape[0]
len_entity_ids = ret['entity_ids'].shape[0]
padding_len = 128
ret['proposals'] = self.my_padding_2d(ret['proposals'], padding_len)
ret['gt_bboxes'] = self.my_padding_2d(ret['gt_bboxes'], padding_len)
ret['gt_labels'] = self.my_padding_2d(ret['gt_labels'], padding_len)
ret['scores'] = self.my_padding_1d(ret['scores'], padding_len)
ret['entity_ids'] = self.my_padding_1d(ret['entity_ids'], padding_len)
return ret['imgs'][0], ret['imgs'][1], ret['proposals'], ret[
'gt_bboxes'], ret['gt_labels'], ret['scores'], ret[
'entity_ids'], np.array(
ret['img_shape'], dtype=int
), idx, len_proposals, len_gt_bboxes, len_gt_labels, len_scores, len_entity_ids
def my_padding_2d(self, feat, max_len):
feat_add = np.zeros((max_len - feat.shape[0], feat.shape[1]),
dtype=np.float32)
feat_pad = np.concatenate((feat, feat_add), axis=0)
return feat_pad
def my_padding_1d(self, feat, max_len):
feat_add = np.zeros((max_len - feat.shape[0]), dtype=np.float32)
feat_pad = np.concatenate((feat, feat_add), axis=0)
return feat_pad
def prepare_test(self, idx):
return self.prepare_train(idx)
def evaluate(self, results):
return ava_evaluate_results(self.info, len(self), results,
self.custom_classes, self.label_file,
self.file_path, self.exclude_file)
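# --- Hedged illustration (added; not in the original file). load_file above
# parses AVA-style CSV rows
#   <video_id>,<timestamp>,<x1>,<y1>,<x2>,<y2>,<label>,<entity_id>
# and parse_img_record merges records that share a frame and an entity box
# into stacked bboxes/labels/entity_ids. A tiny standalone check:
if __name__ == '__main__':
    ds = AVADataset.__new__(AVADataset)  # bypass file loading for the demo
    ds.num_classes = 81
    records = [
        dict(entity_box=np.array([0.1, 0.2, 0.3, 0.4]), label=5, entity_id=1),
        dict(entity_box=np.array([0.1, 0.2, 0.3, 0.4]), label=7, entity_id=1),
    ]
    bboxes, labels, entity_ids = ds.parse_img_record(records)
    print(bboxes.shape, labels.shape, entity_ids)  # (1, 4) (1, 81) [1]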

@ -0,0 +1,80 @@
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os.path as osp
import copy
import numpy as np
from abc import ABC, abstractmethod
import paddle
from paddle.io import Dataset
class BaseDataset(Dataset, ABC):
"""Base class for datasets
All datasets should subclass it.
All subclass should overwrite:
- Method: `load_file`, load info from index file.
- Method: `prepare_train`, providing train data.
- Method: `prepare_test`, providing test data.
    Args:
        file_path (str): index file path.
        pipeline (Sequence): a sequence of data transforms.
        data_prefix (str): directory path of the data. Default: None.
        test_mode (bool): whether to build test dataset. Default: False.
    """
def __init__(self, file_path, pipeline, data_prefix=None, test_mode=False):
super().__init__()
self.file_path = file_path
self.data_prefix = osp.realpath(data_prefix) if \
data_prefix is not None and osp.isdir(data_prefix) else data_prefix
self.test_mode = test_mode
self.pipeline = pipeline
self.info = self.load_file()
@abstractmethod
def load_file(self):
"""load the video information from the index file path."""
pass
def prepare_train(self, idx):
"""TRAIN & VALID. Prepare the data for training/valid given the index."""
#Note: For now, paddle.io.DataLoader cannot support dict type retval, so convert to list here
results = copy.deepcopy(self.info[idx])
results = self.pipeline(results)
#unsqueeze label to list
return results['imgs'], np.array([results['labels']])
def prepare_test(self, idx):
"""TEST: Prepare the data for test given the index."""
#Note: For now, paddle.io.DataLoader cannot support dict type retval, so convert to list here
results = copy.deepcopy(self.info[idx])
results = self.pipeline(results)
#unsqueeze label to list
return results['imgs'], np.array([results['labels']])
def __len__(self):
"""get the size of the dataset."""
return len(self.info)
def __getitem__(self, idx):
""" Get the sample for either training or testing given index"""
if self.test_mode:
return self.prepare_test(idx)
else:
return self.prepare_train(idx)
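# --- Hedged sketch (added; not in the original file). A minimal subclass
# only needs load_file; prepare_train/prepare_test are inherited and expect
# the pipeline to fill results['imgs']. The "<filename> <label>" index format
# here is an assumption for illustration.
class ToyDataset(BaseDataset):
    def load_file(self):
        info = []
        with open(self.file_path, 'r') as fin:
            for line in fin:
                filename, label = line.strip().split()
                info.append(dict(filename=filename, labels=int(label)))
        return info
# Real datasets additionally register themselves via @DATASETS.register() so
# that build_dataset can construct them from the config (see builder.py).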

@ -0,0 +1,72 @@
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import json
from ..registry import DATASETS
from .base import BaseDataset
from ...utils import get_logger
logger = get_logger("paddlevideo")
@DATASETS.register()
class BMNDataset(BaseDataset):
"""Video dataset for action localization.
"""
def __init__(
self,
file_path,
pipeline,
subset,
**kwargs,
):
self.subset = subset
super().__init__(file_path, pipeline, **kwargs)
def load_file(self):
"""Load index file to get video information."""
info = []
annos = json.load(open(self.file_path))
for video_name in annos.keys():
video_subset = annos[video_name]["subset"]
if self.subset in video_subset:
info.append(
dict(
video_name=video_name,
video_info=annos[video_name],
))
        # sort by video_name
        info.sort(key=lambda elem: elem['video_name'])
#add video_idx to info
for idx, elem in enumerate(info):
info[idx]['video_idx'] = idx
logger.info("{} subset video numbers: {}".format(
self.subset, len(info)))
return info
def prepare_train(self, idx):
"""TRAIN & VALID: Prepare data for training/valid given the index."""
results = copy.deepcopy(self.info[idx])
results = self.pipeline(results)
return results['video_feat'], results['gt_iou_map'], results['gt_start'],\
results['gt_end']
def prepare_test(self, idx):
"""TEST: Prepare the data for test given the index."""
results = copy.deepcopy(self.info[idx])
results = self.pipeline(results)
return results['video_feat'], results['gt_iou_map'], results['gt_start'], \
results['gt_end'], results['video_idx']
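# --- Hedged usage sketch (added; not in the original file). load_file expects
# an ActivityNet-style annotation JSON keyed by video name, each entry with at
# least a "subset" field; subset='train' matches "training" via the substring
# test `self.subset in video_subset`.
if __name__ == '__main__':
    import tempfile
    annos = {"v_xxx": {"subset": "training"}, "v_yyy": {"subset": "validation"}}
    with tempfile.NamedTemporaryFile('w', suffix='.json', delete=False) as f:
        json.dump(annos, f)
    ds = BMNDataset(file_path=f.name, pipeline=None, subset='train')
    print([elem['video_name'] for elem in ds.info])  # ['v_xxx']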

@ -0,0 +1,189 @@
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import os.path as osp
import copy
import random
import numpy as np
import shutil
from PIL import Image
import cv2
from paddle.io import Dataset
from ..registry import DATASETS
from .base import BaseDataset
from ...utils import get_logger
logger = get_logger("paddlevideo")
class VOS_Test(Dataset):
"""process frames in each video
"""
def __init__(self,
image_root,
label_root,
seq_name,
images,
labels,
pipeline=None,
rgb=False,
resolution=None):
self.image_root = image_root
self.label_root = label_root
self.seq_name = seq_name
self.images = images # image file list
self.labels = labels
self.obj_num = 1
self.num_frame = len(self.images)
self.pipeline = pipeline
self.rgb = rgb
self.resolution = resolution
self.obj_nums = []
temp_obj_num = 0
for img_name in self.images:
self.obj_nums.append(temp_obj_num)
current_label_name = img_name.split('.')[0] + '.png'
if current_label_name in self.labels:
current_label = self.read_label(current_label_name)
if temp_obj_num < np.unique(
current_label)[-1]: #get object number from label_id
temp_obj_num = np.unique(current_label)[-1]
def __len__(self):
return len(self.images)
def read_image(self, idx):
img_name = self.images[idx]
img_path = os.path.join(self.image_root, self.seq_name, img_name)
img = cv2.imread(img_path)
img = np.array(img, dtype=np.float32)
if self.rgb:
img = img[:, :, [2, 1, 0]]
return img
def read_label(self, label_name):
label_path = os.path.join(self.label_root, self.seq_name, label_name)
label = Image.open(label_path)
label = np.array(label, dtype=np.uint8)
return label
def __getitem__(self, idx):
img_name = self.images[idx]
current_img = self.read_image(idx)
current_img = np.array(current_img)
height, width, channels = current_img.shape
if self.resolution is not None:
width = int(np.ceil(float(width) * self.resolution / float(height)))
height = int(self.resolution)
current_label_name = img_name.split('.')[0] + '.png'
obj_num = self.obj_nums[idx]
if current_label_name in self.labels:
current_label = self.read_label(current_label_name)
current_label = np.array(current_label)
sample = {
'current_img': current_img,
'current_label': current_label
}
else:
sample = {
'current_img': current_img
} #only the first frame contains label
sample['meta'] = {
'seq_name': self.seq_name,
'frame_num': self.num_frame,
'obj_num': obj_num,
'current_name': img_name,
'height': height,
'width': width,
'flip': False
}
if self.pipeline is not None:
sample = self.pipeline(sample)
for s in sample:
s['current_img'] = np.array(s['current_img'])
if 'current_label' in s.keys():
s['current_label'] = s['current_label']
return sample
@DATASETS.register()
class DavisDataset(BaseDataset):
"""Davis 2017 dataset.
"""
def __init__(
self,
file_path,
result_root,
pipeline,
data_prefix=None,
test_mode=False,
year=2017,
rgb=False,
resolution='480p',
):
self.rgb = rgb
self.result_root = result_root
self.resolution = resolution
self.year = year
self.spt = 'val' if test_mode else 'train'
super().__init__(file_path, pipeline, data_prefix, test_mode)
def load_file(self):
self.image_root = os.path.join(self.file_path, 'JPEGImages',
self.resolution)
self.label_root = os.path.join(self.file_path, 'Annotations',
self.resolution)
seq_names = []
with open(
os.path.join(self.file_path, 'ImageSets', str(self.year),
self.spt + '.txt')) as f:
seqs_tmp = f.readlines()
seqs_tmp = list(map(lambda elem: elem.strip(), seqs_tmp))
seq_names.extend(seqs_tmp)
self.info = list(np.unique(seq_names))
return self.info
def prepare_test(self, idx):
seq_name = self.info[idx] #video name
images = list(
np.sort(os.listdir(os.path.join(self.image_root, seq_name))))
labels = [images[0].replace('jpg', 'png')] #we have first frame target
# copy first frame target
if not os.path.isfile(
os.path.join(self.result_root, seq_name, labels[0])):
if not os.path.exists(os.path.join(self.result_root, seq_name)):
os.makedirs(os.path.join(self.result_root, seq_name))
source_label_path = os.path.join(self.label_root, seq_name,
labels[0])
result_label_path = os.path.join(self.result_root, seq_name,
labels[0])
shutil.copy(source_label_path, result_label_path)
seq_dataset = VOS_Test(self.image_root,
self.label_root,
seq_name,
images,
labels,
self.pipeline,
rgb=self.rgb,
resolution=480)
return seq_dataset
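# --- Hedged note (added; not in the original file). load_file above assumes
# the standard DAVIS layout under file_path:
#   JPEGImages/480p/<seq>/00000.jpg ...
#   Annotations/480p/<seq>/00000.png ...
#   ImageSets/2017/val.txt  (one sequence name per line)
# prepare_test copies the first-frame annotation into result_root and wraps
# each sequence in a VOS_Test dataset for frame-by-frame inference.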

@ -0,0 +1,80 @@
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import os.path as osp
from ..registry import DATASETS
from .base import BaseDataset
@DATASETS.register()
class FeatureDataset(BaseDataset):
"""Feature dataset for action recognition
Example:(TODO)
Args:(TODO)
"""
def __init__(
self,
file_path,
pipeline,
data_prefix=None,
test_mode=False,
suffix=None,
):
self.suffix = suffix
super().__init__(file_path, pipeline, data_prefix, test_mode)
def load_file(self):
"""Load index file to get video information."""
info = []
with open(self.file_path, 'r') as fin:
for line in fin:
filename = line.strip().split()[0]
if self.data_prefix is not None:
filename = osp.join(self.data_prefix, filename)
if self.suffix is not None:
filename = filename + self.suffix
info.append(dict(filename=filename))
return info
def prepare_train(self, idx):
"""TRAIN & VALID. Prepare the data for training/valid given the index."""
results = copy.deepcopy(self.info[idx])
results = self.pipeline(results)
if 'iou_norm' in results:
return results['rgb_data'], results['rgb_len'], results[
'rgb_mask'], results['audio_data'], results[
'audio_len'], results['audio_mask'], results[
'labels'], results['iou_norm']
else:
return results['rgb_data'], results['rgb_len'], results[
'rgb_mask'], results['audio_data'], results[
'audio_len'], results['audio_mask'], results['labels']
def prepare_test(self, idx):
"""TEST. Prepare the data for testing given the index."""
results = copy.deepcopy(self.info[idx])
results = self.pipeline(results)
if 'iou_norm' in results:
return results['rgb_data'], results['rgb_len'], results[
'rgb_mask'], results['audio_data'], results[
'audio_len'], results['audio_mask'], results[
'labels'], results['iou_norm']
else:
return results['rgb_data'], results['rgb_len'], results[
'rgb_mask'], results['audio_data'], results[
'audio_len'], results['audio_mask'], results['labels']

@ -0,0 +1,177 @@
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os.path as osp
import copy
import random
import numpy as np
from ..registry import DATASETS
from .base import BaseDataset
from ...utils import get_logger
logger = get_logger("paddlevideo")
@DATASETS.register()
class FrameDataset(BaseDataset):
"""Rawframe dataset for action recognition.
The dataset loads raw frames from frame files, and apply specified transform operatation them.
The indecx file is a text file with multiple lines, and each line indicates the directory of frames of a video, toatl frames of the video, and its label, which split with a whitespace.
Example of an index file:
.. code-block:: txt
file_path-1 150 1
file_path-2 160 1
file_path-3 170 2
file_path-4 180 2
Args:
file_path (str): Path to the index file.
pipeline(XXX):
data_prefix (str): directory path of the data. Default: None.
test_mode (bool): Whether to bulid the test dataset. Default: False.
suffix (str): suffix of file. Default: 'img_{:05}.jpg'.
"""
def __init__(self,
file_path,
pipeline,
num_retries=5,
data_prefix=None,
test_mode=False,
suffix='img_{:05}.jpg'):
self.num_retries = num_retries
self.suffix = suffix
super().__init__(file_path, pipeline, data_prefix, test_mode)
def load_file(self):
"""Load index file to get video information."""
info = []
with open(self.file_path, 'r') as fin:
for line in fin:
line_split = line.strip().split()
frame_dir, frames_len, labels = line_split
if self.data_prefix is not None:
frame_dir = osp.join(self.data_prefix, frame_dir)
info.append(
dict(frame_dir=frame_dir,
suffix=self.suffix,
frames_len=frames_len,
labels=int(labels)))
return info
def prepare_train(self, idx):
"""Prepare the frames for training/valid given index. """
        # Try to catch exceptions caused by missing frame files
for ir in range(self.num_retries):
try:
results = copy.deepcopy(self.info[idx])
results = self.pipeline(results)
except Exception as e:
#logger.info(e)
if ir < self.num_retries - 1:
logger.info(
"Error when loading {}, have {} trys, will try again".
format(results['frame_dir'], ir))
idx = random.randint(0, len(self.info) - 1)
continue
return results['imgs'], np.array([results['labels']])
def prepare_test(self, idx):
"""Prepare the frames for test given index. """
        # Try to catch exceptions caused by missing frame files
for ir in range(self.num_retries):
try:
results = copy.deepcopy(self.info[idx])
results = self.pipeline(results)
except Exception as e:
#logger.info(e)
if ir < self.num_retries - 1:
logger.info(
"Error when loading {}, have {} trys, will try again".
format(results['frame_dir'], ir))
idx = random.randint(0, len(self.info) - 1)
continue
return results['imgs'], np.array([results['labels']])
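
# Sketch of how one index line becomes an info dict in FrameDataset.load_file();
# the path and numbers below are made-up examples matching the docstring format.
def _example_parse_frame_index_line(line="frames/video_001 150 1"):
    frame_dir, frames_len, labels = line.strip().split()
    return dict(frame_dir=frame_dir,
                suffix='img_{:05}.jpg',
                frames_len=frames_len,
                labels=int(labels))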
@DATASETS.register()
class FrameDataset_Sport(BaseDataset):
"""Video dataset for action recognition
    The dataset loads raw videos and applies the specified transforms to them.
    The index file is a file with multiple lines, and each line indicates
    a sample video with the filepath and label, which are split by a whitespace.
    Example of an index file:
.. code-block:: txt
path/000.mp4 1
path/001.mp4 1
path/002.mp4 2
path/003.mp4 2
Args:
file_path(str): Path to the index file.
pipeline(XXX): A sequence of data transforms.
**kwargs: Keyword arguments for ```BaseDataset```.
"""
def __init__(self, file_path, pipeline, num_retries=5, suffix='', **kwargs):
self.num_retries = num_retries
self.suffix = suffix
super().__init__(file_path, pipeline, **kwargs)
def load_file(self):
"""Load index file to get video information."""
info = []
with open(self.file_path, 'r') as fin:
for line in fin:
line_split = line.strip().split()
frame_dir = line_split[0]
if self.data_prefix is not None:
frame_dir = osp.join(self.data_prefix, frame_dir)
info.append(dict(frame_dir=frame_dir, suffix=self.suffix))
return info
def prepare_train(self, idx):
"""TRAIN & VALID. Prepare the data for training/valid given the index."""
#Try to catch Exception caused by reading corrupted video file
for ir in range(self.num_retries):
try:
results = copy.deepcopy(self.info[idx])
results = self.pipeline(results)
except Exception as e:
#logger.info(e)
if ir < self.num_retries - 1:
logger.info(
"Error when loading {}, have {} trys, will try again".
format(results['filename'], ir))
idx = random.randint(0, len(self.info) - 1)
continue
return results['imgs'], np.array([results['labels']])
def prepare_test(self, idx):
"""TEST. Prepare the data for test given the index."""
#Try to catch Exception caused by reading corrupted video file
for ir in range(self.num_retries):
try:
results = copy.deepcopy(self.info[idx])
results = self.pipeline(results)
except Exception as e:
#logger.info(e)
if ir < self.num_retries - 1:
logger.info(
"Error when loading {}, have {} trys, will try again".
format(results['filename'], ir))
idx = random.randint(0, len(self.info) - 1)
continue
return results['imgs'], np.array([results['labels']])

@ -0,0 +1,110 @@
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import os
import numpy as np
from ..registry import DATASETS
from .base import BaseDataset
from ...utils import get_logger
logger = get_logger("paddlevideo")
@DATASETS.register()
class MSTCNDataset(BaseDataset):
"""Video dataset for action segmentation.
"""
def __init__(
self,
file_path,
pipeline,
feature_path,
gt_path,
actions_map_file_path,
**kwargs,
):
super().__init__(file_path, pipeline, **kwargs)
self.gt_path = gt_path
self.actions_map_file_path = actions_map_file_path
self.feature_path = feature_path
# actions dict generate
file_ptr = open(self.actions_map_file_path, 'r')
actions = file_ptr.read().split('\n')[:-1]
file_ptr.close()
self.actions_dict = dict()
for a in actions:
self.actions_dict[a.split()[1]] = int(a.split()[0])
self.num_classes = len(self.actions_dict.keys())
def load_file(self):
"""Load index file to get video information."""
file_ptr = open(self.file_path, 'r')
info = file_ptr.read().split('\n')[:-1]
file_ptr.close()
return info
def prepare_train(self, idx):
"""TRAIN & VALID: Prepare data for training/valid given the index."""
results = {}
video_name = self.info[idx]
# load video feature
file_name = video_name.split('.')[0] + ".npy"
feat_file_path = os.path.join(self.feature_path, file_name)
#TODO: check path
video_feat = np.load(feat_file_path)
# load label
target_file_path = os.path.join(self.gt_path, video_name)
file_ptr = open(target_file_path, 'r')
content = file_ptr.read().split('\n')[:-1]
classes = np.zeros(min(np.shape(video_feat)[1], len(content)), dtype='int64')
for i in range(len(classes)):
classes[i] = self.actions_dict[content[i]]
# classes = classes * (-100)
results['video_feat'] = copy.deepcopy(video_feat)
results['video_gt'] = copy.deepcopy(classes)
results = self.pipeline(results)
return results['video_feat'], results['video_gt']
def prepare_test(self, idx):
"""TEST: Prepare the data for test given the index."""
results = {}
video_name = self.info[idx]
# load video feature
file_name = video_name.split('.')[0] + ".npy"
feat_file_path = os.path.join(self.feature_path, file_name)
#TODO: check path
video_feat = np.load(feat_file_path)
# load label
target_file_path = os.path.join(self.gt_path, video_name)
file_ptr = open(target_file_path, 'r')
content = file_ptr.read().split('\n')[:-1]
classes = np.zeros(min(np.shape(video_feat)[1], len(content)))
for i in range(len(classes)):
classes[i] = self.actions_dict[content[i]]
# classes = classes * (-100)
results['video_feat'] = copy.deepcopy(video_feat)
results['video_gt'] = copy.deepcopy(classes)
results = self.pipeline(results)
return results['video_feat'], results['video_gt']
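
# Sketch (assumed file layout): each line of the actions map file is
# "<id> <action_name>", e.g. "0 background" / "1 cut", so MSTCNDataset maps
# action names to integer ids exactly as in __init__ above.
def _example_build_actions_dict(lines=("0 background", "1 cut")):
    actions_dict = {}
    for a in lines:
        actions_dict[a.split()[1]] = int(a.split()[0])
    return actions_dict  # {'background': 0, 'cut': 1}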

@ -0,0 +1,220 @@
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os.path as osp
import copy
import random
import numpy as np
try:
import lmdb
except ImportError as e:
print(
f"Warning! {e}, [lmdb] package and it's dependencies is required for ActBERT."
)
import pickle
try:
from paddlenlp.transformers import BertTokenizer
except ImportError as e:
print(
f"Warning! {e}, [paddlenlp] package and it's dependencies is required for ActBERT."
)
from ..registry import DATASETS
from .base import BaseDataset
from ...utils import get_logger
logger = get_logger("paddlevideo")
@DATASETS.register()
class MSRVTTDataset(BaseDataset):
"""MSR-VTT dataset for text-video clip retrieval.
"""
def __init__(
self,
file_path,
pipeline,
features_path,
bert_model="bert-base-uncased",
padding_index=0,
max_seq_length=36,
max_region_num=36,
max_action_num=5,
vision_feature_dim=2048,
action_feature_dim=2048,
spatials_dim=5,
data_prefix=None,
test_mode=False,
):
self.features_path = features_path
self.bert_model = bert_model
self.padding_index = padding_index
self.max_seq_length = max_seq_length
self.max_region_num = max_region_num
self._max_action_num = max_action_num
self.vision_feature_dim = vision_feature_dim
self.action_feature_dim = action_feature_dim
self.spatials_dim = spatials_dim
self._tokenizer = BertTokenizer.from_pretrained(bert_model,
do_lower_case=True)
super().__init__(file_path, pipeline, data_prefix, test_mode)
self.tokenize()
self.gen_feature()
def load_file(self):
"""Load index file to get video information."""
with open(self.file_path) as fin:
self.image_entries = []
self.caption_entries = []
for line in fin.readlines():
line = line.strip()
vid_id = line.split(',')[0]
self.image_entries.append(vid_id)
self.caption_entries.append({
"caption": line.split(',')[1],
"vid_id": vid_id
})
self.env = lmdb.open(self.features_path)
def tokenize(self):
for entry in self.caption_entries:
tokens = []
tokens.append("[CLS]")
for token in self._tokenizer.tokenize(entry["caption"]):
tokens.append(token)
tokens.append("[SEP]")
tokens = self._tokenizer.convert_tokens_to_ids(tokens)
segment_ids = [0] * len(tokens)
input_mask = [1] * len(tokens)
if len(tokens) < self.max_seq_length:
padding = [self.padding_index
] * (self.max_seq_length - len(tokens))
tokens = tokens + padding
input_mask += padding
segment_ids += padding
entry["token"] = np.array(tokens).astype('int64')
entry["input_mask"] = np.array(input_mask)
entry["segment_ids"] = np.array(segment_ids).astype('int64')
def get_image_feature(self, video_id):
video_id = str(video_id).encode()
with self.env.begin(write=False) as txn:
item = pickle.loads(txn.get(video_id))
video_id = item["video_id"]
image_h = int(item["image_h"])
image_w = int(item["image_w"])
features = item["features"].reshape(-1, self.vision_feature_dim)
boxes = item["boxes"].reshape(-1, 4)
num_boxes = features.shape[0]
g_feat = np.sum(features, axis=0) / num_boxes
num_boxes = num_boxes + 1
features = np.concatenate(
[np.expand_dims(g_feat, axis=0), features], axis=0)
action_features = item["action_features"].reshape(
-1, self.action_feature_dim)
image_location = np.zeros((boxes.shape[0], self.spatials_dim),
dtype=np.float32)
image_location[:, :4] = boxes
image_location[:,
4] = ((image_location[:, 3] - image_location[:, 1]) *
(image_location[:, 2] - image_location[:, 0]) /
(float(image_w) * float(image_h)))
image_location[:, 0] = image_location[:, 0] / float(image_w)
image_location[:, 1] = image_location[:, 1] / float(image_h)
image_location[:, 2] = image_location[:, 2] / float(image_w)
image_location[:, 3] = image_location[:, 3] / float(image_h)
g_location = np.array([0, 0, 1, 1, 1])
image_location = np.concatenate(
[np.expand_dims(g_location, axis=0), image_location], axis=0)
return features, num_boxes, image_location, action_features
def gen_feature(self):
num_inst = len(self.image_entries) #1000
self.features_all = np.zeros(
(num_inst, self.max_region_num, self.vision_feature_dim))
self.action_features_all = np.zeros(
(num_inst, self._max_action_num, self.action_feature_dim))
self.spatials_all = np.zeros(
(num_inst, self.max_region_num, self.spatials_dim))
self.image_mask_all = np.zeros((num_inst, self.max_region_num))
self.action_mask_all = np.zeros((num_inst, self._max_action_num))
for i, image_id in enumerate(self.image_entries):
features, num_boxes, boxes, action_features = self.get_image_feature(
image_id)
mix_num_boxes = min(int(num_boxes), self.max_region_num)
mix_boxes_pad = np.zeros((self.max_region_num, self.spatials_dim))
mix_features_pad = np.zeros(
(self.max_region_num, self.vision_feature_dim))
image_mask = [1] * (int(mix_num_boxes))
while len(image_mask) < self.max_region_num:
image_mask.append(0)
action_mask = [1] * (self._max_action_num)
while len(action_mask) < self._max_action_num:
action_mask.append(0)
mix_boxes_pad[:mix_num_boxes] = boxes[:mix_num_boxes]
mix_features_pad[:mix_num_boxes] = features[:mix_num_boxes]
self.features_all[i] = mix_features_pad
x = action_features.shape[0]
self.action_features_all[i][:x] = action_features[:]
self.image_mask_all[i] = np.array(image_mask)
self.action_mask_all[i] = np.array(action_mask)
self.spatials_all[i] = mix_boxes_pad
self.features_all = self.features_all.astype("float32")
self.action_features_all = self.action_features_all.astype("float32")
self.image_mask_all = self.image_mask_all.astype("int64")
self.action_mask_all = self.action_mask_all.astype("int64")
self.spatials_all = self.spatials_all.astype("float32")
def prepare_train(self, idx):
pass
def prepare_test(self, idx):
entry = self.caption_entries[idx]
caption = entry["token"]
input_mask = entry["input_mask"]
segment_ids = entry["segment_ids"]
target_all = np.zeros(1000)
for i, image_id in enumerate(self.image_entries):
if image_id == entry["vid_id"]:
target_all[i] = 1
return (
caption,
self.action_features_all,
self.features_all,
self.spatials_all,
segment_ids,
input_mask,
self.image_mask_all,
self.action_mask_all,
target_all,
)
def __len__(self):
return len(self.caption_entries)
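
# Minimal sketch of the padding step in tokenize(): token ids shorter than
# max_seq_length are right-padded with padding_index, and the same padding is
# appended to the mask and segment ids (which only works because
# padding_index is 0). The token ids below are made up; real ones come from
# BertTokenizer.
def _example_pad_tokens(tokens=(101, 2054, 102), max_seq_length=6,
                        padding_index=0):
    tokens = list(tokens)
    input_mask = [1] * len(tokens)
    segment_ids = [0] * len(tokens)
    padding = [padding_index] * (max_seq_length - len(tokens))
    return tokens + padding, input_mask + padding, segment_ids + padding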

@ -0,0 +1,62 @@
# Copyright Niantic 2019. Patent Pending. All rights reserved.
#
# This software is licensed under the terms of the Monodepth2 licence
# which allows for non-commercial use only, the full terms of which are made
# available in the LICENSE file.
from __future__ import absolute_import, division, print_function
import copy
from os import path as osp
from PIL import Image
from ..registry import DATASETS
from .base import BaseDataset
def pil_loader(path):
# open path as file to avoid ResourceWarning
# (https://github.com/python-pillow/Pillow/issues/835)
with open(path, 'rb') as f:
with Image.open(f) as img:
return img.convert('RGB')
@DATASETS.register()
class MonoDataset(BaseDataset):
def __init__(self,
file_path,
data_prefix,
pipeline,
num_retries=0,
suffix='.png',
**kwargs):
self.num_retries = num_retries
self.suffix = suffix
super().__init__(file_path, pipeline, data_prefix, **kwargs)
def load_file(self):
info = []
with open(self.file_path, 'r') as f:
for line in f:
filename = line.strip() + self.suffix
folder = osp.dirname(filename)
frame_index = line.strip().split('/')[1]
info.append(
dict(data_path=self.data_prefix,
filename=filename,
folder=folder,
frame_index=int(frame_index)))
return info
def prepare_train(self, idx):
results = copy.deepcopy(self.info[idx])
results = self.pipeline(results)
results['imgs']['idx'] = idx
return results['imgs'], results['day_or_night']
def prepare_test(self, idx):
results = copy.deepcopy(self.info[idx])
results = self.pipeline(results)
return results['imgs'], results['day_or_night']
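
# Sketch of load_file() on one index line; the "folder/frame_id" layout is
# assumed, e.g. "scene_01/0000000042".
def _example_parse_mono_line(line="scene_01/0000000042", suffix='.png'):
    filename = line.strip() + suffix
    folder = osp.dirname(filename)
    frame_index = line.strip().split('/')[1]
    return dict(filename=filename, folder=folder,
                frame_index=int(frame_index))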

@ -0,0 +1,78 @@
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os.path as osp
import copy
import random
import numpy as np
import pickle
from ..registry import DATASETS
from .base import BaseDataset
from ...utils import get_logger
logger = get_logger("paddlevideo")
@DATASETS.register()
class SkeletonDataset(BaseDataset):
"""
Skeleton dataset for action recognition.
    The dataset loads skeleton features and applies normalization operations.
Args:
file_path (str): Path to the index file.
pipeline(obj): Define the pipeline of data preprocessing.
data_prefix (str): directory path of the data. Default: None.
        test_mode (bool): Whether to build the test dataset. Default: False.
"""
def __init__(self, file_path, pipeline, label_path=None, test_mode=False):
self.label_path = label_path
super().__init__(file_path, pipeline, test_mode=test_mode)
def load_file(self):
"""Load feature file to get skeleton information."""
logger.info("Loading data, it will take some moment...")
self.data = np.load(self.file_path)
if self.label_path:
if self.label_path.endswith('npy'):
self.label = np.load(self.label_path)
elif self.label_path.endswith('pkl'):
with open(self.label_path, 'rb') as f:
sample_name, self.label = pickle.load(f)
else:
logger.info(
"Label path not provided when test_mode={}, here just output predictions."
.format(self.test_mode))
logger.info("Data Loaded!")
return self.data # used for __len__
def prepare_train(self, idx):
"""Prepare the feature for training/valid given index. """
results = dict()
results['data'] = copy.deepcopy(self.data[idx])
results['label'] = copy.deepcopy(self.label[idx])
results = self.pipeline(results)
return results['data'], results['label']
def prepare_test(self, idx):
"""Prepare the feature for test given index. """
results = dict()
results['data'] = copy.deepcopy(self.data[idx])
if self.label_path:
results['label'] = copy.deepcopy(self.label[idx])
results = self.pipeline(results)
return results['data'], results['label']
else:
results = self.pipeline(results)
return [results['data']]
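
# Sketch of the two label formats load_file() accepts: a plain .npy array,
# or a .pkl holding a (sample_name, label) pair.
def _example_load_labels(label_path):
    if label_path.endswith('npy'):
        return np.load(label_path)
    with open(label_path, 'rb') as f:
        _sample_name, label = pickle.load(f)
    return label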

@ -0,0 +1,143 @@
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os.path as osp
import copy
import random
import numpy as np
from ..registry import DATASETS
from .base import BaseDataset
from ...utils import get_logger
logger = get_logger("paddlevideo")
@DATASETS.register()
class SFVideoDataset(BaseDataset):
"""Video dataset for action recognition
    The dataset loads raw videos and applies the specified transforms to them.
    The index file is a file with multiple lines, and each line indicates
    a sample video with the filepath and label, which are split by a whitespace.
    Example of an index file:
.. code-block:: txt
path/000.mp4 1
path/001.mp4 1
path/002.mp4 2
path/003.mp4 2
Args:
file_path(str): Path to the index file.
pipeline(XXX): A sequence of data transforms.
        num_ensemble_views(int): Number of temporal views per video for multi-crop testing.
        num_spatial_crops(int): Number of spatial crops per view for multi-crop testing.
**kwargs: Keyword arguments for ```BaseDataset```.
"""
def __init__(
self,
file_path,
pipeline,
num_ensemble_views=1,
num_spatial_crops=1,
num_retries=5,
num_samples_precise_bn=None,
**kwargs,
):
self.num_ensemble_views = num_ensemble_views
self.num_spatial_crops = num_spatial_crops
self.num_retries = num_retries
self.num_samples_precise_bn = num_samples_precise_bn
super().__init__(file_path, pipeline, **kwargs)
#set random seed
random.seed(0)
np.random.seed(0)
def load_file(self):
"""Load index file to get video information."""
info = []
with open(self.file_path, 'r') as fin:
for line in fin:
line_split = line.strip().split()
filename, labels = line_split
if self.data_prefix is not None:
filename = osp.join(self.data_prefix, filename)
for tidx in range(self.num_ensemble_views):
for sidx in range(self.num_spatial_crops):
info.append(
dict(
filename=filename,
labels=int(labels),
temporal_sample_index=tidx,
spatial_sample_index=sidx,
temporal_num_clips=self.num_ensemble_views,
spatial_num_clips=self.num_spatial_crops,
))
return info
def prepare_train(self, idx):
"""TRAIN & VALID. Prepare the data for training given the index."""
#Try to catch Exception caused by reading corrupted video file
short_cycle = False
if isinstance(idx, tuple):
idx, short_cycle_idx = idx
short_cycle = True
for ir in range(self.num_retries):
try:
#Multi-grid short cycle
if short_cycle:
results = copy.deepcopy(self.info[idx])
results['short_cycle_idx'] = short_cycle_idx
else:
results = copy.deepcopy(self.info[idx])
results = self.pipeline(results)
except Exception as e:
#logger.info(e)
if ir < self.num_retries - 1:
logger.info(
"Error when loading {}, have {} trys, will try again".
format(results['filename'], ir))
idx = random.randint(0, len(self.info) - 1)
continue
return results['imgs'][0], results['imgs'][1], np.array(
[results['labels']])
def prepare_test(self, idx):
"""TEST. Prepare the data for test given the index."""
#Try to catch Exception caused by reading corrupted video file
for ir in range(self.num_retries):
try:
results = copy.deepcopy(self.info[idx])
results = self.pipeline(results)
except Exception as e:
logger.info(e)
if ir < self.num_retries - 1:
logger.info(
"Error when loading {}, have {} trys, will try again".
format(results['filename'], ir))
idx = random.randint(0, len(self.info) - 1)
continue
return results['imgs'][0], results['imgs'][1], np.array(
[results['labels']]), np.array([idx])
def __len__(self):
"""get the size of the dataset."""
if self.num_samples_precise_bn is None:
return len(self.info)
else:
random.shuffle(self.info)
return min(self.num_samples_precise_bn, len(self.info))
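
# Sketch: with num_ensemble_views=3 and num_spatial_crops=2, load_file()
# expands each video line into 3 * 2 = 6 entries, one per
# (temporal_sample_index, spatial_sample_index) pair, which is how
# multi-crop testing enumerates clips.
def _example_multi_crop_views(num_ensemble_views=3, num_spatial_crops=2):
    return [(tidx, sidx) for tidx in range(num_ensemble_views)
            for sidx in range(num_spatial_crops)]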

@ -0,0 +1,89 @@
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os.path as osp
import copy
import random
import numpy as np
import pickle
import paddle
from paddle.io import Dataset
from ..registry import DATASETS
from .base import BaseDataset
from ...utils import get_logger
logger = get_logger("paddlevideo")
@DATASETS.register()
class UCF101SkeletonDataset(BaseDataset):
"""
Skeleton dataset for action recognition.
    The dataset loads skeleton features and applies normalization operations.
Args:
file_path (str): Path to the index file.
pipeline(obj): Define the pipeline of data preprocessing.
        test_mode (bool): Whether to build the test dataset. Default: False.
"""
def __init__(self,
file_path,
pipeline,
split,
repeat_times,
test_mode=False):
self.split = split
self.repeat_times = repeat_times
super().__init__(file_path, pipeline, test_mode=test_mode)
self._ori_len = len(self.info)
self.start_index = 0
self.modality = "Pose"
def load_file(self):
"""Load annotation file to get video information."""
assert self.file_path.endswith('.pkl')
return self.load_pkl_annotations()
def load_pkl_annotations(self):
with open(self.file_path, "rb") as f:
data = pickle.load(f)
if self.split:
split, data = data['split'], data['annotations']
identifier = 'filename' if 'filename' in data[0] else 'frame_dir'
data = [x for x in data if x[identifier] in split[self.split]]
return data
def prepare_train(self, idx):
"""Prepare the frames for training given the index."""
results = copy.deepcopy(self.info[idx % self._ori_len])
results['modality'] = self.modality
results['start_index'] = self.start_index
return self.pipeline(results)
def prepare_test(self, idx):
"""Prepare the frames for testing given the index."""
results = copy.deepcopy(self.info[idx % self._ori_len])
results['modality'] = self.modality
results['start_index'] = self.start_index
return self.pipeline(results)
def __len__(self):
"""get the size of the dataset."""
return len(self.info) * self.repeat_times
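
# Sketch (assumed pickle layout): the annotation file holds
# {'split': {'train1': [...], ...}, 'annotations': [...]}, and
# load_pkl_annotations() keeps only the samples whose identifier is listed
# in the requested split, exactly as above.
def _example_filter_split(data, split_name):
    split, annos = data['split'], data['annotations']
    identifier = 'filename' if 'filename' in annos[0] else 'frame_dir'
    return [x for x in annos if x[identifier] in split[split_name]]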

@ -0,0 +1,76 @@
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import numpy as np
from ..registry import DATASETS
from .base import BaseDataset
from ...utils import get_logger
logger = get_logger("paddlevideo")
@DATASETS.register()
class UCF24Dataset(BaseDataset):
"""Dataset for YOWO
    The dataset loads raw videos and applies the specified transforms to them.
    The index file is a file with multiple lines, and each line indicates
    a sample video with the filepath and label, which are split by a whitespace.
    Example of an index file:
.. code-block:: txt
Args:
file_path(str): Path to the index file.
pipeline(XXX): A sequence of data transforms.
**kwargs: Keyword arguments for ```BaseDataset```.
"""
def __init__(self, file_path, pipeline, num_retries=5, **kwargs):
self.num_retries = num_retries
super().__init__(file_path, pipeline, **kwargs)
def load_file(self):
"""Load index file to get video information."""
info = []
with open(self.file_path, 'r') as fin:
lines = fin.readlines()
for line in lines:
line = line.strip() # 'data/ucf24/labels/class_name/video_name/key_frame.txt'
filename = line.replace('txt', 'jpg').replace(
'labels', 'rgb-images') # key frame path
info.append(dict(filename=filename))
return info
def prepare_train(self, idx):
"""TRAIN & VALID. Prepare the data for training/valid given the index."""
results = copy.deepcopy(self.info[idx])
results = self.pipeline(results)
im_path = results['filename']
im_path = im_path.replace('jpg', 'txt')
im_split = im_path.split('/')
frame_index = im_split[3] + '_' + im_split[4] + '_' + im_split[5]
return results['imgs'], np.array([results['labels']]), frame_index
def prepare_test(self, idx):
"""TEST. Prepare the data for test given the index."""
# Try to catch Exception caused by reading corrupted video file
results = copy.deepcopy(self.info[idx])
results = self.pipeline(results)
im_path = results['filename']
im_path = im_path.replace('jpg', 'txt')
im_split = im_path.split('/')
frame_index = im_split[3] + '_' + im_split[4] + '_' + im_split[5]
return results['imgs'], np.array([results['labels']]), frame_index
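
# Sketch of the label-path to image-path mapping used above; the class and
# video names in the path are made-up examples following load_file()'s
# comment.
def _example_ucf24_paths(
        label='data/ucf24/labels/Basketball/v_Basketball_g01_c01/00009.txt'):
    image = label.replace('txt', 'jpg').replace('labels', 'rgb-images')
    parts = label.split('/')
    frame_index = parts[3] + '_' + parts[4] + '_' + parts[5]
    return image, frame_index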

@ -0,0 +1,95 @@
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os.path as osp
import copy
import random
import numpy as np
from ..registry import DATASETS
from .base import BaseDataset
from ...utils import get_logger
logger = get_logger("paddlevideo")
@DATASETS.register()
class VideoDataset(BaseDataset):
"""Video dataset for action recognition
    The dataset loads raw videos and applies the specified transforms to them.
    The index file is a file with multiple lines, and each line indicates
    a sample video with the filepath and label, which are split by a whitespace.
    Example of an index file:
.. code-block:: txt
path/000.mp4 1
path/001.mp4 1
path/002.mp4 2
path/003.mp4 2
Args:
file_path(str): Path to the index file.
pipeline(XXX): A sequence of data transforms.
**kwargs: Keyword arguments for ```BaseDataset```.
"""
def __init__(self, file_path, pipeline, num_retries=5, suffix='', **kwargs):
self.num_retries = num_retries
self.suffix = suffix
super().__init__(file_path, pipeline, **kwargs)
def load_file(self):
"""Load index file to get video information."""
info = []
with open(self.file_path, 'r') as fin:
for line in fin:
line_split = line.strip().split()
filename, labels = line_split
                #TODO(hj): Required suffix format: maybe mp4/avi/wmv
filename = filename + self.suffix
if self.data_prefix is not None:
filename = osp.join(self.data_prefix, filename)
info.append(dict(filename=filename, labels=int(labels)))
return info
def prepare_train(self, idx):
"""TRAIN & VALID. Prepare the data for training/valid given the index."""
#Try to catch Exception caused by reading corrupted video file
for ir in range(self.num_retries):
try:
results = copy.deepcopy(self.info[idx])
results = self.pipeline(results)
except Exception as e:
#logger.info(e)
if ir < self.num_retries - 1:
logger.info(
"Error when loading {}, have {} trys, will try again".
format(results['filename'], ir))
idx = random.randint(0, len(self.info) - 1)
continue
return results['imgs'], np.array([results['labels']])
def prepare_test(self, idx):
"""TEST. Prepare the data for test given the index."""
#Try to catch Exception caused by reading corrupted video file
for ir in range(self.num_retries):
try:
results = copy.deepcopy(self.info[idx])
results = self.pipeline(results)
except Exception as e:
#logger.info(e)
if ir < self.num_retries - 1:
logger.info(
"Error when loading {}, have {} trys, will try again".
format(results['filename'], ir))
idx = random.randint(0, len(self.info) - 1)
continue
return results['imgs'], np.array([results['labels']])

@ -0,0 +1,56 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .anet_pipeline import GetMatchMap, GetVideoLabel, LoadFeat
from .augmentations import (CenterCrop, ColorJitter, GroupRandomFlip,
GroupResize, Image2Array, JitterScale, MultiCrop,
Normalization, PackOutput, RandomCrop, RandomFlip,
RandomResizedCrop, Scale, TenCrop, ToArray,
UniformCrop, RandomGamma, MultiCenterCrop,
RandomBrightness, RandomHue, RandomSaturation, YowoAug)
from .augmentations_ava import *
from .compose import Compose
from .decode import FeatureDecoder, FrameDecoder, VideoDecoder, ActionFeatureDecoder
from .decode_image import ImageDecoder
from .decode_sampler import DecodeSampler
from .mix import Cutmix, Mixup, VideoMix
from .multimodal import FeaturePadding, RandomCap, RandomMask, Tokenize
from .sample import Sampler, SamplerPkl
from .sample_ava import *
from .segmentation import MultiNorm, MultiRestrictSize
from .skeleton_pipeline import AutoPadding, Iden, SkeletonNorm
from .skeleton_pipeline import SketeonCropSample, SketeonModalityTransform, RandomRotation
from .skeleton_pipeline import (UniformSampleFrames, PoseDecode, PoseCompact,
RandomResizedCrop_V2, Flip_V2, CenterCrop_V2,
GeneratePoseTarget, FormatShape, Collect)
from .decode_sampler_MRI import SFMRI_DecodeSampler
from .segmentation_pipline import SegmentationSampler
from .sample_ucf24 import SamplerUCF24
__all__ = [
'ImageDecoder', 'RandomMask', 'UniformCrop', 'SkeletonNorm', 'Tokenize',
'Sampler', 'FeatureDecoder', 'DecodeSampler', 'TenCrop', 'Compose',
'AutoPadding', 'Normalization', 'Mixup', 'Image2Array', 'Scale',
'GroupResize', 'VideoDecoder', 'FrameDecoder', 'PackOutput',
'ActionFeatureDecoder', 'GetVideoLabel', 'Cutmix', 'CenterCrop',
'RandomCrop', 'LoadFeat', 'RandomCap', 'JitterScale', 'Iden', 'VideoMix',
'ColorJitter', 'RandomFlip', 'ToArray', 'FeaturePadding', 'GetMatchMap',
'GroupRandomFlip', 'MultiCrop', 'SFMRI_DecodeSampler', 'MultiRestrictSize',
'MultiNorm', 'RandomResizedCrop', 'SamplerPkl', 'SegmentationSampler',
'SketeonCropSample', 'SketeonModalityTransform', 'RandomRotation',
'RandomGamma', 'MultiCenterCrop', 'RandomBrightness', 'RandomHue',
'RandomSaturation', 'UniformSampleFrames', 'PoseDecode', 'PoseCompact',
'Resize', 'RandomResizedCrop_V2', 'Flip_V2', 'GeneratePoseTarget',
    'FormatShape', 'Collect', 'SamplerUCF24', 'YowoAug'
]

@ -0,0 +1,150 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import numpy as np
from ..registry import PIPELINES
"""pipeline ops for Activity Net.
"""
@PIPELINES.register()
class LoadFeat(object):
def __init__(self, feat_path):
self.feat_path = feat_path
def __call__(self, results):
video_name = results['video_name']
file_name = video_name + ".npy"
file_path = os.path.join(self.feat_path, file_name)
#TODO: check path
video_feat = np.load(file_path)
video_feat = video_feat.T
video_feat = video_feat.astype("float32")
results['video_feat'] = video_feat
return results
@PIPELINES.register()
class GetMatchMap(object):
def __init__(self, tscale):
self.tscale = tscale
self.tgap = 1. / self.tscale
def __call__(self, results):
match_map = []
for idx in range(self.tscale):
tmp_match_window = []
xmin = self.tgap * idx
for jdx in range(1, self.tscale + 1):
xmax = xmin + self.tgap * jdx
tmp_match_window.append([xmin, xmax])
match_map.append(tmp_match_window)
match_map = np.array(match_map)
match_map = np.transpose(match_map, [1, 0, 2])
match_map = np.reshape(match_map, [-1, 2])
anchor_xmin = [self.tgap * i for i in range(self.tscale)]
anchor_xmax = [self.tgap * i for i in range(1, self.tscale + 1)]
results['match_map'] = match_map
results['anchor_xmin'] = anchor_xmin
results['anchor_xmax'] = anchor_xmax
return results
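
# Worked example: for tscale=2 (tgap=0.5), GetMatchMap enumerates every
# (start, end) proposal on the unit timeline, then groups them by duration:
# [[0.0, 0.5], [0.5, 1.0], [0.0, 1.0], [0.5, 1.5]].
def _example_match_map(tscale=2):
    tgap = 1. / tscale
    match_map = [[[tgap * idx, tgap * idx + tgap * jdx]
                  for jdx in range(1, tscale + 1)] for idx in range(tscale)]
    return np.reshape(np.transpose(np.array(match_map), [1, 0, 2]), [-1, 2])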
@PIPELINES.register()
class GetVideoLabel(object):
def __init__(self, tscale, dscale, datatype="float32"):
self.tscale = tscale
self.dscale = dscale
self.tgap = 1. / self.tscale
self.datatype = datatype
def iou_with_anchors(self, anchors_min, anchors_max, box_min, box_max):
"""Compute jaccard score between a box and the anchors.
"""
len_anchors = anchors_max - anchors_min
int_xmin = np.maximum(anchors_min, box_min)
int_xmax = np.minimum(anchors_max, box_max)
inter_len = np.maximum(int_xmax - int_xmin, 0.)
union_len = len_anchors - inter_len + box_max - box_min
jaccard = np.divide(inter_len, union_len)
return jaccard
def ioa_with_anchors(self, anchors_min, anchors_max, box_min, box_max):
"""Compute intersection between score a box and the anchors.
"""
len_anchors = anchors_max - anchors_min
int_xmin = np.maximum(anchors_min, box_min)
int_xmax = np.minimum(anchors_max, box_max)
inter_len = np.maximum(int_xmax - int_xmin, 0.)
scores = np.divide(inter_len, len_anchors)
return scores
def __call__(self, results):
video_info = results['video_info']
match_map = results['match_map']
anchor_xmin = results['anchor_xmin']
anchor_xmax = results['anchor_xmax']
video_second = video_info['duration_second']
video_labels = video_info['annotations']
gt_bbox = []
gt_iou_map = []
for gt in video_labels:
tmp_start = max(min(1, gt["segment"][0] / video_second), 0)
tmp_end = max(min(1, gt["segment"][1] / video_second), 0)
gt_bbox.append([tmp_start, tmp_end])
tmp_gt_iou_map = self.iou_with_anchors(match_map[:, 0],
match_map[:, 1], tmp_start,
tmp_end)
tmp_gt_iou_map = np.reshape(tmp_gt_iou_map,
[self.dscale, self.tscale])
gt_iou_map.append(tmp_gt_iou_map)
gt_iou_map = np.array(gt_iou_map)
gt_iou_map = np.max(gt_iou_map, axis=0)
gt_bbox = np.array(gt_bbox)
gt_xmins = gt_bbox[:, 0]
gt_xmaxs = gt_bbox[:, 1]
gt_len_small = 3 * self.tgap
gt_start_bboxs = np.stack(
(gt_xmins - gt_len_small / 2, gt_xmins + gt_len_small / 2), axis=1)
gt_end_bboxs = np.stack(
(gt_xmaxs - gt_len_small / 2, gt_xmaxs + gt_len_small / 2), axis=1)
match_score_start = []
for jdx in range(len(anchor_xmin)):
match_score_start.append(
np.max(
self.ioa_with_anchors(anchor_xmin[jdx], anchor_xmax[jdx],
gt_start_bboxs[:, 0],
gt_start_bboxs[:, 1])))
match_score_end = []
for jdx in range(len(anchor_xmin)):
match_score_end.append(
np.max(
self.ioa_with_anchors(anchor_xmin[jdx], anchor_xmax[jdx],
gt_end_bboxs[:, 0], gt_end_bboxs[:,
1])))
gt_start = np.array(match_score_start)
gt_end = np.array(match_score_end)
results['gt_iou_map'] = gt_iou_map.astype(self.datatype)
results['gt_start'] = gt_start.astype(self.datatype)
results['gt_end'] = gt_end.astype(self.datatype)
return results
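
# Numeric check of the two overlap measures above: for an anchor [0.2, 0.6]
# and a ground-truth box [0.4, 0.8], the intersection is 0.2, the union is
# 0.6 and the anchor length is 0.4, so IoU = 1/3 and IoA (intersection over
# anchor) = 0.5.
def _example_overlaps(anchor=(0.2, 0.6), box=(0.4, 0.8)):
    inter = max(min(anchor[1], box[1]) - max(anchor[0], box[0]), 0.)
    union = (anchor[1] - anchor[0]) + (box[1] - box[0]) - inter
    return inter / union, inter / (anchor[1] - anchor[0])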

@ -0,0 +1,749 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
import numpy as np
import math
from PIL import Image
from ..registry import PIPELINES
from collections.abc import Sequence
import cv2
pillow_interp_codes = {
'nearest': Image.NEAREST,
'bilinear': Image.BILINEAR,
'bicubic': Image.BICUBIC,
'box': Image.BOX,
'lanczos': Image.LANCZOS,
'hamming': Image.HAMMING
}
cv2_interp_codes = {
'nearest': cv2.INTER_NEAREST,
'bilinear': cv2.INTER_LINEAR,
'bicubic': cv2.INTER_CUBIC,
'area': cv2.INTER_AREA,
'lanczos': cv2.INTER_LANCZOS4
}
def _init_lazy_if_proper(results, lazy):
"""Initialize lazy operation properly.
Make sure that a lazy operation is properly initialized,
and avoid a non-lazy operation accidentally getting mixed in.
    Required keys in results are "imgs" if "img_shape" is not in results;
    otherwise the required key is "img_shape". Added or modified keys are
    "img_shape" and "lazy".
Add or modified keys in "lazy" are "original_shape", "crop_bbox", "flip",
"flip_direction", "interpolation".
Args:
results (dict): A dict stores data pipeline result.
lazy (bool): Determine whether to apply lazy operation. Default: False.
"""
if 'img_shape' not in results:
results['img_shape'] = results['imgs'][0].shape[:2]
if lazy:
if 'lazy' not in results:
img_h, img_w = results['img_shape']
lazyop = dict()
lazyop['original_shape'] = results['img_shape']
lazyop['crop_bbox'] = np.array([0, 0, img_w, img_h],
dtype=np.float32)
lazyop['flip'] = False
lazyop['flip_direction'] = None
lazyop['interpolation'] = None
results['lazy'] = lazyop
else:
assert 'lazy' not in results, 'Use Fuse after lazy operations'
def _scale_size(size, scale):
"""Rescale a size by a ratio.
Args:
size (tuple[int]): (w, h).
scale (float): Scaling factor.
Returns:
tuple[int]: scaled size.
"""
w, h = size
return int(w * float(scale) + 0.5), int(h * float(scale) + 0.5)
def rescale_size(old_size, scale, return_scale=False):
"""Calculate the new size to be rescaled to.
Args:
old_size (tuple[int]): The old size (w, h) of image.
scale (float | tuple[int]): The scaling factor or maximum size.
If it is a float number, then the image will be rescaled by this
factor, else if it is a tuple of 2 integers, then the image will
be rescaled as large as possible within the scale.
return_scale (bool): Whether to return the scaling factor besides the
rescaled image size.
Returns:
tuple[int]: The new rescaled image size.
"""
w, h = old_size
if isinstance(scale, (float, int)):
if scale <= 0:
raise ValueError(f'Invalid scale {scale}, must be positive.')
scale_factor = scale
elif isinstance(scale, tuple):
max_long_edge = max(scale)
max_short_edge = min(scale)
scale_factor = min(max_long_edge / max(h, w),
max_short_edge / min(h, w))
else:
raise TypeError(
f'Scale must be a number or tuple of int, but got {type(scale)}')
new_size = _scale_size((w, h), scale_factor)
if return_scale:
return new_size, scale_factor
else:
return new_size
def imresize(img,
size,
return_scale=False,
interpolation='bilinear',
out=None,
backend=None):
"""Resize image to a given size. """
h, w = img.shape[:2]
if backend is None:
backend = 'cv2'
if backend not in ['cv2', 'pillow']:
raise ValueError(f'backend: {backend} is not supported for resize.'
f"Supported backends are 'cv2', 'pillow'")
if backend == 'pillow':
assert img.dtype == np.uint8, 'Pillow backend only support uint8 type'
pil_image = Image.fromarray(img)
pil_image = pil_image.resize(size, pillow_interp_codes[interpolation])
resized_img = np.array(pil_image)
else:
resized_img = cv2.resize(
img, size, dst=out, interpolation=cv2_interp_codes[interpolation])
if not return_scale:
return resized_img
else:
w_scale = size[0] / w
h_scale = size[1] / h
return resized_img, w_scale, h_scale
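
# Usage sketch for rescale_size(): a float scales both edges, while a tuple
# is treated as (max_long_edge, max_short_edge) bounds. For a 1280x720 image
# and scale (np.inf, 256), the short edge becomes 256 with the ratio kept.
def _example_rescale_size():
    assert rescale_size((1280, 720), 0.5) == (640, 360)
    assert rescale_size((1280, 720), (np.inf, 256)) == (455, 256)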
@PIPELINES.register()
class EntityBoxRescale:
"""Rescale the entity box and proposals according to the image shape.
Required keys are "proposals", "gt_bboxes", added or modified keys are
"gt_bboxes". If original "proposals" is not None, "proposals" and
will be added or modified.
Args:
        scale_factor (np.ndarray): The scale factor used for entity_box rescaling.
"""
def __init__(self, scale_factor):
self.scale_factor = scale_factor
def __call__(self, results):
scale_factor = np.concatenate([self.scale_factor, self.scale_factor])
if 'gt_bboxes' in results:
gt_bboxes = results['gt_bboxes']
results['gt_bboxes'] = gt_bboxes * scale_factor
if 'proposals' in results:
proposals = results['proposals']
if proposals is not None:
assert proposals.shape[1] == 4, (
'proposals shape should be in '
f'(n, 4), but got {proposals.shape}')
results['proposals'] = proposals * scale_factor
return results
def __repr__(self):
return f'{self.__class__.__name__}(scale_factor={self.scale_factor})'
@PIPELINES.register()
class EntityBoxCrop:
"""Crop the entity boxes and proposals according to the cropped images.
Required keys are "proposals", "gt_bboxes", added or modified keys are
"gt_bboxes". If original "proposals" is not None, "proposals" will be
modified.
Args:
crop_bbox(np.ndarray | None): The bbox used to crop the original image.
"""
def __init__(self, crop_bbox):
self.crop_bbox = crop_bbox
def __call__(self, results):
proposals = results['proposals']
gt_bboxes = results['gt_bboxes']
if self.crop_bbox is None:
return results
x1, y1, x2, y2 = self.crop_bbox
img_w, img_h = x2 - x1, y2 - y1
assert gt_bboxes.shape[-1] == 4
gt_bboxes_ = gt_bboxes.copy()
gt_bboxes_[..., 0::2] = np.clip(gt_bboxes[..., 0::2] - x1, 0, img_w - 1)
gt_bboxes_[..., 1::2] = np.clip(gt_bboxes[..., 1::2] - y1, 0, img_h - 1)
results['gt_bboxes'] = gt_bboxes_
if proposals is not None:
assert proposals.shape[-1] == 4
proposals_ = proposals.copy()
proposals_[..., 0::2] = np.clip(proposals[..., 0::2] - x1, 0,
img_w - 1)
proposals_[..., 1::2] = np.clip(proposals[..., 1::2] - y1, 0,
img_h - 1)
results['proposals'] = proposals_
return results
def __repr__(self):
return f'{self.__class__.__name__}(crop_bbox={self.crop_bbox})'
@PIPELINES.register()
class EntityBoxFlip:
"""Flip the entity boxes and proposals with a probability.
Reverse the order of elements in the given bounding boxes and proposals
with a specific direction. The shape of them are preserved, but the
elements are reordered. Only the horizontal flip is supported (seems
vertical flipping makes no sense). Required keys are "proposals",
"gt_bboxes", added or modified keys are "gt_bboxes". If "proposals"
is not None, it will also be modified.
Args:
img_shape (tuple[int]): The img shape.
"""
def __init__(self, img_shape):
self.img_shape = img_shape
def __call__(self, results):
proposals = results['proposals']
gt_bboxes = results['gt_bboxes']
img_h, img_w = self.img_shape
assert gt_bboxes.shape[-1] == 4
gt_bboxes_ = gt_bboxes.copy()
gt_bboxes_[..., 0::4] = img_w - gt_bboxes[..., 2::4] - 1
gt_bboxes_[..., 2::4] = img_w - gt_bboxes[..., 0::4] - 1
if proposals is not None:
assert proposals.shape[-1] == 4
proposals_ = proposals.copy()
proposals_[..., 0::4] = img_w - proposals[..., 2::4] - 1
proposals_[..., 2::4] = img_w - proposals[..., 0::4] - 1
else:
proposals_ = None
results['proposals'] = proposals_
results['gt_bboxes'] = gt_bboxes_
return results
def __repr__(self):
repr_str = f'{self.__class__.__name__}(img_shape={self.img_shape})'
return repr_str
@PIPELINES.register()
class Resize:
"""Resize images to a specific size.
Required keys are "imgs", "img_shape", "modality", added or modified
keys are "imgs", "img_shape", "keep_ratio", "scale_factor", "lazy",
"resize_size". Required keys in "lazy" is None, added or modified key is
"interpolation".
Args:
scale (float | Tuple[int]): If keep_ratio is True, it serves as scaling
factor or maximum size:
If it is a float number, the image will be rescaled by this
factor, else if it is a tuple of 2 integers, the image will
be rescaled as large as possible within the scale.
Otherwise, it serves as (w, h) of output size.
keep_ratio (bool): If set to True, Images will be resized without
changing the aspect ratio. Otherwise, it will resize images to a
given size. Default: True.
interpolation (str): Algorithm used for interpolation:
"nearest" | "bilinear". Default: "bilinear".
lazy (bool): Determine whether to apply lazy operation. Default: False.
"""
def __init__(self,
scale,
keep_ratio=True,
interpolation='bilinear',
lazy=False):
if isinstance(scale, str):
scale = eval(scale)
if isinstance(scale, float):
if scale <= 0:
raise ValueError(f'Invalid scale {scale}, must be positive.')
elif isinstance(scale, tuple):
max_long_edge = max(scale)
max_short_edge = min(scale)
if max_short_edge == -1:
# assign np.inf to long edge for rescaling short edge later.
scale = (np.inf, max_long_edge)
else:
raise TypeError(
f'Scale must be float or tuple of int, but got {type(scale)}')
self.scale = scale
self.keep_ratio = keep_ratio
self.interpolation = interpolation
self.lazy = lazy
def __call__(self, results):
"""Performs the Resize augmentation.
Args:
results (dict): The resulting dict to be modified and passed
to the next transform in pipeline.
"""
_init_lazy_if_proper(results, self.lazy)
if 'scale_factor' not in results:
results['scale_factor'] = np.array([1, 1], dtype=np.float32)
img_h, img_w = results['img_shape']
if self.keep_ratio:
new_w, new_h = rescale_size((img_w, img_h), self.scale)
else:
new_w, new_h = self.scale
self.scale_factor = np.array([new_w / img_w, new_h / img_h],
dtype=np.float32)
results['img_shape'] = (new_h, new_w)
results['keep_ratio'] = self.keep_ratio
results['scale_factor'] = results['scale_factor'] * self.scale_factor
if not self.lazy:
if 'imgs' in results:
results['imgs'] = [
imresize(
img, (new_w, new_h), interpolation=self.interpolation)
for img in results['imgs']
]
if 'keypoint' in results:
results['keypoint'] = results['keypoint'] * self.scale_factor
else:
lazyop = results['lazy']
if lazyop['flip']:
raise NotImplementedError('Put Flip at last for now')
lazyop['interpolation'] = self.interpolation
        if 'gt_bboxes' in results:
            assert not self.lazy
            entity_box_rescale = EntityBoxRescale(self.scale_factor)
            results = entity_box_rescale(results)
return results
def __repr__(self):
repr_str = (f'{self.__class__.__name__}('
f'scale={self.scale}, keep_ratio={self.keep_ratio}, '
f'interpolation={self.interpolation}, '
f'lazy={self.lazy})')
return repr_str
@PIPELINES.register()
class RandomRescale:
"""Randomly resize images so that the short_edge is resized to a specific
size in a given range. The scale ratio is unchanged after resizing.
"""
def __init__(self, scale_range, interpolation='bilinear'):
scale_range = eval(scale_range)
self.scale_range = scale_range
assert len(scale_range) == 2
assert scale_range[0] < scale_range[1]
assert np.all([x > 0 for x in scale_range])
self.keep_ratio = True
self.interpolation = interpolation
def __call__(self, results):
"""Performs the Resize augmentation.
Args:
results (dict): The resulting dict to be modified and passed
to the next transform in pipeline.
"""
short_edge = np.random.randint(self.scale_range[0],
self.scale_range[1] + 1)
resize = Resize((-1, short_edge),
keep_ratio=True,
interpolation=self.interpolation,
lazy=False)
results = resize(results)
results['short_edge'] = short_edge
return results
def __repr__(self):
scale_range = self.scale_range
repr_str = (f'{self.__class__.__name__}('
f'scale_range=({scale_range[0]}, {scale_range[1]}), '
f'interpolation={self.interpolation})')
return repr_str
@PIPELINES.register()
class Rescale:
"""resize images so that the short_edge is resized to a specific
size in a given range. The scale ratio is unchanged after resizing.
Required keys are "imgs", "img_shape", "modality", added or modified
keys are "imgs", "img_shape", "keep_ratio", "scale_factor", "resize_size",
"short_edge".
Args:
scale_range (tuple[int]): The range of short edge length. A closed
interval.
interpolation (str): Algorithm used for interpolation:
"nearest" | "bilinear". Default: "bilinear".
"""
def __init__(self, scale_range, interpolation='bilinear'):
scale_range = eval(scale_range)
self.scale_range = scale_range
self.keep_ratio = True
self.interpolation = interpolation
def __call__(self, results):
"""Performs the Resize augmentation.
Args:
results (dict): The resulting dict to be modified and passed
to the next transform in pipeline.
"""
resize = Resize(
self.scale_range,
keep_ratio=True,
interpolation=self.interpolation,
lazy=False)
results = resize(results)
return results
def __repr__(self):
scale_range = self.scale_range
repr_str = (f'{self.__class__.__name__}('
f'scale_range=({scale_range[0]}, {scale_range[1]}), '
f'interpolation={self.interpolation})')
return repr_str
@PIPELINES.register()
class RandomCrop_v2:
"""Vanilla square random crop that specifics the output size.
Required keys in results are "imgs" and "img_shape", added or
modified keys are "imgs", "lazy"; Required keys in "lazy" are "flip",
"crop_bbox", added or modified key is "crop_bbox".
Args:
size (int): The output size of the images.
lazy (bool): Determine whether to apply lazy operation. Default: False.
"""
def __init__(self, size, lazy=False):
if not isinstance(size, int):
raise TypeError(f'Size must be an int, but got {type(size)}')
self.size = size
self.lazy = lazy
def __call__(self, results):
"""Performs the RandomCrop augmentation.
Args:
results (dict): The resulting dict to be modified and passed
to the next transform in pipeline.
"""
_init_lazy_if_proper(results, self.lazy)
img_h, img_w = results['img_shape']
assert self.size <= img_h and self.size <= img_w
y_offset = 0
x_offset = 0
if img_h > self.size:
y_offset = int(np.random.randint(0, img_h - self.size))
if img_w > self.size:
x_offset = int(np.random.randint(0, img_w - self.size))
if 'crop_quadruple' not in results:
results['crop_quadruple'] = np.array(
[0, 0, 1, 1], # x, y, w, h
dtype=np.float32)
x_ratio, y_ratio = x_offset / img_w, y_offset / img_h
w_ratio, h_ratio = self.size / img_w, self.size / img_h
old_crop_quadruple = results['crop_quadruple']
old_x_ratio, old_y_ratio = old_crop_quadruple[0], old_crop_quadruple[1]
old_w_ratio, old_h_ratio = old_crop_quadruple[2], old_crop_quadruple[3]
        new_crop_quadruple = [
            old_x_ratio + x_ratio * old_w_ratio,
            old_y_ratio + y_ratio * old_h_ratio, w_ratio * old_w_ratio,
            h_ratio * old_h_ratio
        ]
results['crop_quadruple'] = np.array(
new_crop_quadruple, dtype=np.float32)
new_h, new_w = self.size, self.size
results['crop_bbox'] = np.array(
[x_offset, y_offset, x_offset + new_w, y_offset + new_h])
results['img_shape'] = (new_h, new_w)
if not self.lazy:
results['imgs'] = [
img[y_offset:y_offset + new_h, x_offset:x_offset + new_w]
for img in results['imgs']
]
else:
lazyop = results['lazy']
if lazyop['flip']:
raise NotImplementedError('Put Flip at last for now')
# record crop_bbox in lazyop dict to ensure only crop once in Fuse
lazy_left, lazy_top, lazy_right, lazy_bottom = lazyop['crop_bbox']
left = x_offset * (lazy_right - lazy_left) / img_w
right = (x_offset + new_w) * (lazy_right - lazy_left) / img_w
top = y_offset * (lazy_bottom - lazy_top) / img_h
bottom = (y_offset + new_h) * (lazy_bottom - lazy_top) / img_h
lazyop['crop_bbox'] = np.array(
[(lazy_left + left), (lazy_top + top), (lazy_left + right),
(lazy_top + bottom)],
dtype=np.float32)
# Process entity boxes
if 'gt_bboxes' in results:
assert not self.lazy
entity_box_crop = EntityBoxCrop(results['crop_bbox'])
results = entity_box_crop(results)
return results
def __repr__(self):
repr_str = (f'{self.__class__.__name__}(size={self.size}, '
f'lazy={self.lazy})')
return repr_str
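
# Worked example of the crop_quadruple update above, with all ratios taken
# relative to the original image: cropping a 224x224 patch at offset
# (32, 16) from a 256x256 image that was not cropped before gives
# [0.125, 0.0625, 0.875, 0.875].
def _example_crop_quadruple(img_w=256, img_h=256, size=224,
                            x_offset=32, y_offset=16):
    x_ratio, y_ratio = x_offset / img_w, y_offset / img_h
    w_ratio, h_ratio = size / img_w, size / img_h
    return [x_ratio, y_ratio, w_ratio, h_ratio]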
def imflip_(img, direction='horizontal'):
"""Inplace flip an image horizontally or vertically.
Args:
img (ndarray): Image to be flipped.
direction (str): The flip direction, either "horizontal" or
"vertical" or "diagonal".
Returns:
ndarray: The flipped image (inplace).
"""
assert direction in ['horizontal', 'vertical', 'diagonal']
if direction == 'horizontal':
return cv2.flip(img, 1, img)
elif direction == 'vertical':
return cv2.flip(img, 0, img)
else:
return cv2.flip(img, -1, img)
def iminvert(img):
"""Invert (negate) an image.
Args:
img (ndarray): Image to be inverted.
Returns:
ndarray: The inverted image.
"""
return np.full_like(img, 255) - img
@PIPELINES.register()
class Flip:
"""Flip the input images with a probability.
Reverse the order of elements in the given imgs with a specific direction.
The shape of the imgs is preserved, but the elements are reordered.
Required keys are "imgs", "img_shape", "modality", added or modified
keys are "imgs", "lazy" and "flip_direction". Required keys in "lazy" is
None, added or modified key are "flip" and "flip_direction". The Flip
augmentation should be placed after any cropping / reshaping augmentations,
to make sure crop_quadruple is calculated properly.
Args:
flip_ratio (float): Probability of implementing flip. Default: 0.5.
direction (str): Flip imgs horizontally or vertically. Options are
"horizontal" | "vertical". Default: "horizontal".
lazy (bool): Determine whether to apply lazy operation. Default: False.
"""
_directions = ['horizontal', 'vertical']
def __init__(self, flip_ratio=0.5, direction='horizontal', lazy=False):
if direction not in self._directions:
raise ValueError(f'Direction {direction} is not supported. '
f'Currently support ones are {self._directions}')
self.flip_ratio = flip_ratio
self.direction = direction
self.lazy = lazy
def __call__(self, results):
"""Performs the Flip augmentation.
Args:
results (dict): The resulting dict to be modified and passed
to the next transform in pipeline.
"""
_init_lazy_if_proper(results, self.lazy)
flip = np.random.rand() < self.flip_ratio
results['flip'] = flip
results['flip_direction'] = self.direction
if not self.lazy:
if flip:
                for img in results['imgs']:
                    imflip_(img, self.direction)
else:
results['imgs'] = list(results['imgs'])
else:
lazyop = results['lazy']
if lazyop['flip']:
raise NotImplementedError('Use one Flip please')
lazyop['flip'] = flip
lazyop['flip_direction'] = self.direction
if 'gt_bboxes' in results and flip:
assert not self.lazy and self.direction == 'horizontal'
entity_box_flip = EntityBoxFlip(results['img_shape'])
results = entity_box_flip(results)
return results
def __repr__(self):
repr_str = (
f'{self.__class__.__name__}('
f'flip_ratio={self.flip_ratio}, direction={self.direction}, '
f'lazy={self.lazy})')
return repr_str
def imnormalize_(img, mean, std, to_rgb=True):
"""Inplace normalize an image with mean and std.
Args:
img (ndarray): Image to be normalized.
mean (ndarray): The mean to be used for normalize.
std (ndarray): The std to be used for normalize.
to_rgb (bool): Whether to convert to rgb.
Returns:
ndarray: The normalized image.
"""
# cv2 inplace normalization does not accept uint8
assert img.dtype != np.uint8
mean = np.float64(mean.reshape(1, -1))
stdinv = 1 / np.float64(std.reshape(1, -1))
if to_rgb:
cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img) # inplace
cv2.subtract(img, mean, img) # inplace
cv2.multiply(img, stdinv, img) # inplace
return img
@PIPELINES.register()
class Normalize:
"""Normalize images with the given mean and std value.
Required keys are "imgs", "img_shape", "modality", added or modified
keys are "imgs" and "img_norm_cfg". If modality is 'Flow', additional
keys "scale_factor" is required
Args:
mean (Sequence[float]): Mean values of different channels.
std (Sequence[float]): Std values of different channels.
to_bgr (bool): Whether to convert channels from RGB to BGR.
Default: False.
adjust_magnitude (bool): Indicate whether to adjust the flow magnitude
on 'scale_factor' when modality is 'Flow'. Default: False.
"""
def __init__(self, mean, std, to_bgr=False, adjust_magnitude=False):
        if not isinstance(mean, Sequence):
            raise TypeError(
                f'Mean must be a list or tuple of floats, but got {type(mean)}')
        if not isinstance(std, Sequence):
            raise TypeError(
                f'Std must be a list or tuple of floats, but got {type(std)}')
self.mean = np.array(mean, dtype=np.float32)
self.std = np.array(std, dtype=np.float32)
self.to_bgr = to_bgr
self.adjust_magnitude = adjust_magnitude
def __call__(self, results):
n = len(results['imgs'])
h, w, c = results['imgs'][0].shape
imgs = np.empty((n, h, w, c), dtype=np.float32)
for i, img in enumerate(results['imgs']):
imgs[i] = img
for img in imgs:
imnormalize_(img, self.mean, self.std, self.to_bgr)
results['imgs'] = imgs
results['img_norm_cfg'] = dict(
mean=self.mean, std=self.std, to_bgr=self.to_bgr)
return results
def __repr__(self):
repr_str = (f'{self.__class__.__name__}('
f'mean={self.mean}, '
f'std={self.std}, '
f'to_bgr={self.to_bgr}, '
f'adjust_magnitude={self.adjust_magnitude})')
return repr_str
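# Worked example (illustrative): with the common ImageNet statistics scaled to
# the [0, 255] range, mean = [123.675, 116.28, 103.53] and std = [58.395,
# 57.12, 57.375], a pure-white float32 pixel maps channel 0 to
# (255 - 123.675) / 58.395 ~= 2.249 after imnormalize_'s in-place
# subtract / multiply above.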

@ -0,0 +1,76 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections.abc import Sequence
from ..registry import PIPELINES
import traceback
from ...utils import build
from ...utils import get_logger
@PIPELINES.register()
class Compose(object):
"""
    Composes several pipelines (including decode, sample and transform
    functions) together.
    Note: To deal with ``list``-type cfg temporarily, like:
transform:
- Crop: # A list
attribute: 10
- Resize: # A list
attribute: 20
    every key of the list is passed as the module name when building.
XXX: will be improved in the future.
Args:
pipelines (list): List of transforms to compose.
Returns:
        A compose object which is callable; __call__ for this Compose
        object will call each given :attr:`transforms` sequentially.
"""
def __init__(self, pipelines):
#assert isinstance(pipelines, Sequence)
self.pipelines = []
for p in pipelines.values():
if isinstance(p, dict):
p = build(p, PIPELINES)
self.pipelines.append(p)
elif isinstance(p, list):
for t in p:
#XXX: to deal with old format cfg, ugly code here!
temp_dict = dict(name=list(t.keys())[0])
for all_sub_t in t.values():
if all_sub_t is not None:
temp_dict.update(all_sub_t)
t = build(temp_dict, PIPELINES)
self.pipelines.append(t)
elif callable(p):
self.pipelines.append(p)
else:
                raise TypeError(f'pipelines must be callable or a dict, '
                                f'but got {type(p)}')
def __call__(self, data):
for p in self.pipelines:
try:
data = p(data)
except Exception as e:
stack_info = traceback.format_exc()
logger = get_logger("paddlevideo")
                logger.info("Failed to perform transform [{}] with error: "
                            "{} and stack:\n{}".format(p, e, str(stack_info)))
raise e
return data
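# Minimal sketch (illustrative, the helper name is made up): plain callables
# exercise the `callable(p)` branch of Compose.__init__ above, without needing
# any registered PIPELINES modules.
def _demo_compose():
    add_flag = lambda d: {**d, 'decoded': True}
    count_keys = lambda d: {**d, 'n_keys': len(d)}
    pipe = Compose({'step1': add_flag, 'step2': count_keys})
    return pipe({'filename': 'a.mp4'})  # -> adds 'decoded' and 'n_keys'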

@ -0,0 +1,348 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
try:
import av
except ImportError as e:
print(
f"Warning! {e}, [av] package and it's dependencies is required for TimeSformer and other models."
)
import cv2
import pickle
import decord as de
import math
import random
from ..registry import PIPELINES
def get_start_end_idx(video_size, clip_size, clip_idx, num_clips):
delta = max(video_size - clip_size, 0)
if clip_idx == -1: # here
# Random temporal sampling.
start_idx = random.uniform(0, delta)
else: # ignore
# Uniformly sample the clip with the given index.
start_idx = delta * clip_idx / num_clips
end_idx = start_idx + clip_size - 1
return start_idx, end_idx
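# Worked example: for a 300-frame video and a 64-frame clip, delta = 236;
# with clip_idx == -1 the start is drawn uniformly from [0, 236], while
# clip_idx == 2 of num_clips == 10 gives start_idx = 236 * 2 / 10 = 47.2 and
# end_idx = 47.2 + 64 - 1 = 110.2.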
@PIPELINES.register()
class VideoDecoder(object):
"""
Decode mp4 file to frames.
Args:
filepath: the file path of mp4 file
"""
def __init__(self,
backend='cv2',
mode='train',
sampling_rate=32,
num_seg=8,
num_clips=1,
target_fps=30):
self.backend = backend
# params below only for TimeSformer
self.mode = mode
self.sampling_rate = sampling_rate
self.num_seg = num_seg
self.num_clips = num_clips
self.target_fps = target_fps
def __call__(self, results):
"""
Perform mp4 decode operations.
return:
            List where each item is a numpy array after decoding.
"""
file_path = results['filename']
results['format'] = 'video'
results['backend'] = self.backend
if self.backend == 'cv2':
cap = cv2.VideoCapture(file_path)
videolen = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
sampledFrames = []
for i in range(videolen):
ret, frame = cap.read()
# maybe first frame is empty
                if not ret:
continue
img = frame[:, :, ::-1]
sampledFrames.append(img)
results['frames'] = sampledFrames
results['frames_len'] = len(sampledFrames)
elif self.backend == 'decord':
container = de.VideoReader(file_path)
frames_len = len(container)
results['frames'] = container
results['frames_len'] = frames_len
elif self.backend == 'pyav': # for TimeSformer
if self.mode in ["train", "valid"]:
clip_idx = -1
elif self.mode in ["test"]:
clip_idx = 0
else:
raise NotImplementedError
container = av.open(file_path)
num_clips = 1 # always be 1
# decode process
fps = float(container.streams.video[0].average_rate)
frames_length = container.streams.video[0].frames
duration = container.streams.video[0].duration
if duration is None:
# If failed to fetch the decoding information, decode the entire video.
decode_all_video = True
video_start_pts, video_end_pts = 0, math.inf
else:
decode_all_video = False
start_idx, end_idx = get_start_end_idx(
frames_length,
self.sampling_rate * self.num_seg / self.target_fps * fps,
clip_idx, num_clips)
timebase = duration / frames_length
video_start_pts = int(start_idx * timebase)
video_end_pts = int(end_idx * timebase)
frames = None
# If video stream was found, fetch video frames from the video.
if container.streams.video:
margin = 1024
seek_offset = max(video_start_pts - margin, 0)
container.seek(seek_offset,
any_frame=False,
backward=True,
stream=container.streams.video[0])
tmp_frames = {}
buffer_count = 0
max_pts = 0
for frame in container.decode(**{"video": 0}):
max_pts = max(max_pts, frame.pts)
if frame.pts < video_start_pts:
continue
if frame.pts <= video_end_pts:
tmp_frames[frame.pts] = frame
else:
buffer_count += 1
tmp_frames[frame.pts] = frame
if buffer_count >= 0:
break
video_frames = [tmp_frames[pts] for pts in sorted(tmp_frames)]
container.close()
frames = [frame.to_rgb().to_ndarray() for frame in video_frames]
clip_sz = self.sampling_rate * self.num_seg / self.target_fps * fps
start_idx, end_idx = get_start_end_idx(
len(frames), # frame_len
clip_sz,
clip_idx if decode_all_video else
                0,  # if decoding the whole video: -1 in train/valid, 0 in test;
                # otherwise always 0, since clip_size frames were already selected during decoding.
1)
results['frames'] = frames
results['frames_len'] = len(frames)
results['start_idx'] = start_idx
results['end_idx'] = end_idx
else:
raise NotImplementedError
return results
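# Usage sketch (illustrative, the path is made up): with the default cv2
# backend only 'filename' is needed; 'frames' and 'frames_len' are filled in
# for the sampler that follows in the pipeline.
#
#   results = VideoDecoder(backend='cv2')({'filename': 'data/example.mp4'})
#   assert results['frames_len'] == len(results['frames'])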
@PIPELINES.register()
class FrameDecoder(object):
"""just parse results
"""
def __init__(self):
pass
def __call__(self, results):
results['format'] = 'frame'
return results
@PIPELINES.register()
class MRIDecoder(object):
"""just parse results
"""
def __init__(self):
pass
def __call__(self, results):
results['format'] = 'MRI'
return results
@PIPELINES.register()
class FeatureDecoder(object):
"""
    Perform feature decode operations, e.g. for YouTube-8M features.
"""
def __init__(self, num_classes, max_len=512, has_label=True):
self.max_len = max_len
self.num_classes = num_classes
self.has_label = has_label
def __call__(self, results):
"""
Perform feature decode operations.
return:
            List where each item is a numpy array after decoding.
"""
#1. load pkl
#2. parse to rgb/audio/
#3. padding
filepath = results['filename']
        with open(filepath, 'rb') as f:
            data = pickle.load(f, encoding='bytes')
record = data
nframes = record['nframes'] if 'nframes' in record else record[
b'nframes']
rgb = record['feature'].astype(
float) if 'feature' in record else record[b'feature'].astype(float)
audio = record['audio'].astype(
float) if 'audio' in record else record[b'audio'].astype(float)
if self.has_label:
label = record['label'] if 'label' in record else record[b'label']
one_hot_label = self.make_one_hot(label, self.num_classes)
rgb = rgb[0:nframes, :]
audio = audio[0:nframes, :]
rgb = self.dequantize(rgb,
max_quantized_value=2.,
min_quantized_value=-2.)
audio = self.dequantize(audio,
max_quantized_value=2,
min_quantized_value=-2)
if self.has_label:
results['labels'] = one_hot_label.astype("float32")
feat_pad_list = []
feat_len_list = []
mask_list = []
vitem = [rgb, audio]
for vi in range(2): #rgb and audio
if vi == 0:
prefix = "rgb_"
else:
prefix = "audio_"
feat = vitem[vi]
results[prefix + 'len'] = feat.shape[0]
#feat pad step 1. padding
feat_add = np.zeros((self.max_len - feat.shape[0], feat.shape[1]),
dtype=np.float32)
feat_pad = np.concatenate((feat, feat_add), axis=0)
results[prefix + 'data'] = feat_pad.astype("float32")
#feat pad step 2. mask
feat_mask_origin = np.ones(feat.shape, dtype=np.float32)
feat_mask_add = feat_add
feat_mask = np.concatenate((feat_mask_origin, feat_mask_add),
axis=0)
results[prefix + 'mask'] = feat_mask.astype("float32")
return results
def dequantize(self,
feat_vector,
max_quantized_value=2.,
min_quantized_value=-2.):
"""
Dequantize the feature from the byte format to the float format
"""
assert max_quantized_value > min_quantized_value
quantized_range = max_quantized_value - min_quantized_value
scalar = quantized_range / 255.0
bias = (quantized_range / 512.0) + min_quantized_value
return feat_vector * scalar + bias
def make_one_hot(self, label, dim=3862):
one_hot_label = np.zeros(dim)
one_hot_label = one_hot_label.astype(float)
for ind in label:
one_hot_label[int(ind)] = 1
return one_hot_label
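# Worked example for dequantize above: with the default range [-2, 2] the
# scalar is 4 / 255 and the bias is 4 / 512 - 2, so a stored byte of 255 maps
# to 255 * (4 / 255) + (4 / 512 - 2) = 2.0078125, i.e. just above
# max_quantized_value, recovering the center of the top quantization bin.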
@PIPELINES.register()
class ActionFeatureDecoder(object):
"""
Perform feature decode operations on footballaction
"""
def __init__(self, num_classes, max_len=512, has_label=True):
self.max_len = max_len
self.num_classes = num_classes
self.has_label = has_label
def __call__(self, results):
"""
Perform feature decode operations.
return:
            List where each item is a numpy array after decoding.
"""
#1. load pkl
#2. parse to rgb/audio/
#3. padding
filepath = results['filename']
        with open(filepath, 'rb') as f:
            data = pickle.load(f, encoding='bytes')
pkl_data = data
rgb = pkl_data['image_feature'].astype(float)
audio = pkl_data['audio_feature'].astype(float)
label_id_info = pkl_data['label_info']
label_cls = [label_id_info['label']]
label_one = int(label_cls[0])
if len(label_cls) > 1:
label_index = random.randint(0, 1)
label_one = int(label_cls[label_index])
iou_norm = float(label_id_info['norm_iou'])
results['labels'] = np.array([label_one])
results['iou_norm'] = float(iou_norm)
vitem = [rgb, audio]
for vi in range(2): #rgb and audio
if vi == 0:
prefix = "rgb_"
else:
prefix = "audio_"
feat = vitem[vi]
results[prefix + 'len'] = feat.shape[0]
#feat pad step 1. padding
feat_add = np.zeros((self.max_len - feat.shape[0], feat.shape[1]),
dtype=np.float32)
feat_pad = np.concatenate((feat, feat_add), axis=0)
results[prefix + 'data'] = feat_pad.astype("float32")
#feat pad step 2. mask
feat_mask_origin = np.ones(feat.shape, dtype=np.float32)
feat_mask = np.concatenate((feat_mask_origin, feat_add), axis=0)
results[prefix + 'mask'] = feat_mask.astype("float32")
return results

@ -0,0 +1,206 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import numpy as np
import PIL.Image as pil
try:
import skimage.transform
except ImportError as e:
print(
f"Warning! {e}, [scikit-image] package and it's dependencies is required for ADDS."
)
from PIL import Image
from ..registry import PIPELINES
@PIPELINES.register()
class ImageDecoder(object):
"""Decode Image
"""
def __init__(self,
dataset,
frame_idxs,
num_scales,
side_map,
full_res_shape,
img_ext,
backend='cv2'):
self.backend = backend
self.dataset = dataset
self.frame_idxs = frame_idxs
self.num_scales = num_scales
self.side_map = side_map
self.full_res_shape = full_res_shape
self.img_ext = img_ext
def _pil_loader(self, path):
with open(path, 'rb') as f:
with Image.open(f) as img:
return img.convert('RGB')
def get_color(self, folder, frame_index, side):
color = self._pil_loader(
self.get_image_path(self.dataset, folder, frame_index, side))
return color
def get_image_path(self, dataset, folder, frame_index, side):
if dataset == "kitti":
f_str = "{:010d}{}".format(frame_index, self.img_ext)
image_path = os.path.join(self.data_path, folder, f_str)
elif dataset == "kitti_odom":
f_str = "{:06d}{}".format(frame_index, self.img_ext)
image_path = os.path.join(self.data_path,
"sequences/{:02d}".format(int(folder)),
"image_{}".format(self.side_map[side]),
f_str)
elif dataset == "kitti_depth":
f_str = "{:010d}{}".format(frame_index, self.img_ext)
image_path = os.path.join(
self.data_path, folder,
"image_0{}/data".format(self.side_map[side]), f_str)
return image_path
def get_depth(self, dataset, folder, frame_index, side):
if dataset == "kitii_depth":
f_str = "{:010d}.png".format(frame_index)
depth_path = os.path.join(
self.data_path, folder,
"proj_depth/groundtruth/image_0{}".format(self.side_map[side]),
f_str)
depth_gt = pil.open(depth_path)
depth_gt = depth_gt.resize(self.full_res_shape, pil.NEAREST)
depth_gt = np.array(depth_gt).astype(np.float32) / 256
else:
f_str = "{:010d}{}".format(frame_index, self.img_ext)
depth_path = os.path.join(self.data_path, folder + '_gt', f_str)
img_file = Image.open(depth_path)
depth_png = np.array(img_file, dtype=int)
img_file.close()
# make sure we have a proper 16bit depth map here.. not 8bit!
assert np.max(depth_png) > 255, \
"np.max(depth_png)={}, path={}".format(np.max(depth_png), depth_path)
            depth_gt = depth_png.astype(np.float64) / 256.
depth_gt = depth_gt[160:960 - 160, :]
depth_gt = skimage.transform.resize(depth_gt,
self.full_res_shape[::-1],
order=0,
preserve_range=True,
mode='constant')
return depth_gt
def __call__(self, results):
"""
        Perform image decode operations.
        return:
            List where each item is a numpy array after decoding.
"""
if results.get('mode', None) == 'infer':
imgs = {}
imgs[("color", 0,
-1)] = Image.open(results["filename"]).convert("RGB")
results['imgs'] = imgs
return results
self.data_path = results['data_path']
results['backend'] = self.backend
imgs = {}
results['frame_idxs'] = self.frame_idxs
results['num_scales'] = self.num_scales
file_name = results['filename']
folder = results['folder']
frame_index = results['frame_index']
line = file_name.split('/')
istrain = folder.split('_')[1]
if 'mode' not in results:
results['mode'] = istrain
results['day_or_night'] = folder.split('_')[0]
if istrain == "train":
if folder[0] == 'd':
folder2 = folder + '_fake_night'
flag = 0
else:
folder2 = folder + '_fake_day'
tmp = folder
folder = folder2
folder2 = tmp
flag = 1
if len(line) == 3:
side = line[2]
else:
side = None
results['side'] = side
for i in self.frame_idxs:
if i == "s":
other_side = {"r": "l", "l": "r"}[side]
imgs[("color", i,
-1)] = self.get_color(folder, frame_index, other_side)
imgs[("color_n", i,
-1)] = self.get_color(folder2, frame_index,
other_side)
else:
imgs[("color", i,
-1)] = self.get_color(folder, frame_index + i, side)
imgs[("color_n", i,
-1)] = self.get_color(folder2, frame_index + i, side)
istrain = folder.split('_')[1]
if istrain != 'train':
                if flag:
                    depth_gt = self.get_depth(self.dataset, folder2,
                                              frame_index, side)
                else:
                    depth_gt = self.get_depth(self.dataset, folder,
                                              frame_index, side)
imgs["depth_gt"] = np.expand_dims(depth_gt, 0)
elif istrain == 'val':
if len(line) == 3:
side = line[2]
else:
side = None
for i in self.frame_idxs:
if i == "s":
other_side = {"r": "l", "l": "r"}[side]
imgs[("color", i,
-1)] = self.get_color(folder, frame_index, other_side)
else:
imgs[("color", i,
-1)] = self.get_color(folder, frame_index + i, side)
# adjusting intrinsics to match each scale in the pyramid
depth_gt = self.get_depth(self.dataset, folder, frame_index, side)
imgs["depth_gt"] = np.expand_dims(depth_gt, 0)
results['imgs'] = imgs
return results

@ -0,0 +1,93 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
import numpy as np
from PIL import Image
import decord as de
from ..registry import PIPELINES
@PIPELINES.register()
class DecodeSampler(object):
"""
    We use 'decord' for decoding and sampling, which is faster than OpenCV.
    This is used in the SlowFast model.
Args:
num_frames(int): the number of frames we want to sample.
sampling_rate(int): sampling rate for video data.
target_fps(int): desired fps, default 30
        test_mode(bool): whether in test mode, as opposed to train/valid. In SlowFast, multi-crop is used at test time.
"""
def __init__(self,
num_frames,
sampling_rate,
default_sampling_rate=2,
target_fps=30,
test_mode=False):
self.num_frames = num_frames
self.orig_sampling_rate = self.sampling_rate = sampling_rate
self.default_sampling_rate = default_sampling_rate
self.target_fps = target_fps
self.test_mode = test_mode
def get_start_end_idx(self, video_size, clip_size, clip_idx,
temporal_num_clips):
delta = max(video_size - clip_size, 0)
if not self.test_mode:
# Random temporal sampling.
start_idx = random.uniform(0, delta)
else:
# Uniformly sample the clip with the given index.
start_idx = delta * clip_idx / temporal_num_clips
end_idx = start_idx + clip_size - 1
return start_idx, end_idx
def __call__(self, results):
"""
Perform mp4 decode operations.
return:
            List where each item is a numpy array after decoding.
"""
short_cycle_idx = results.get('short_cycle_idx')
if short_cycle_idx:
self.sampling_rate = random.randint(self.default_sampling_rate,
self.orig_sampling_rate)
filepath = results['filename']
temporal_sample_index = results['temporal_sample_index']
temporal_num_clips = results['temporal_num_clips']
vr = de.VideoReader(filepath)
videolen = len(vr)
        fps = vr.get_avg_fps()
        clip_size = self.num_frames * self.sampling_rate * fps / self.target_fps
start_idx, end_idx = self.get_start_end_idx(videolen, clip_size,
temporal_sample_index,
temporal_num_clips)
index = np.linspace(start_idx, end_idx, self.num_frames).astype("int64")
        index = np.clip(index, 0, videolen - 1)
        frames_select = vr.get_batch(index)
# dearray_to_img
np_frames = frames_select.asnumpy()
frames_select_list = []
for i in range(np_frames.shape[0]):
imgbuf = np_frames[i]
frames_select_list.append(Image.fromarray(imgbuf, mode='RGB'))
results['imgs'] = frames_select_list
return results
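# Worked example: num_frames = 32, sampling_rate = 2, target_fps = 30 and a
# 24-fps video give clip_size = 32 * 2 * 24 / 30 = 51.2 source frames, which
# get_start_end_idx places inside the video before the linspace sampling above.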

@ -0,0 +1,224 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import random
import numpy as np
from PIL import Image
try:
import SimpleITK as sitk
except ImportError as e:
print(
f"Warning! {e}, [SimpleITK] package and it's dependencies is required for PP-Care."
)
import cv2
from ..registry import PIPELINES
@PIPELINES.register()
class SFMRI_DecodeSampler(object):
"""
    Sample frame ids.
    NOTE: Uses PIL to read images here, which differs slightly from CV2.
    Args:
        num_seg(int): number of segments.
        seg_len(int): number of sampled frames in each segment.
        valid_mode(bool): True or False.
        select_left: Whether to select the left-of-middle frame when the sampling interval is even in test mode.
    Returns:
        frames_idx: the indices of the sampled frames.
"""
def __init__(self,
num_seg,
seg_len,
valid_mode=False,
select_left=False,
dense_sample=False,
linspace_sample=False):
self.num_seg = num_seg
self.seg_len = seg_len
self.valid_mode = valid_mode
self.select_left = select_left
self.dense_sample = dense_sample
self.linspace_sample = linspace_sample
def _get(self, frames_idx_s, frames_idx_f, results):
frame_dir = results['frame_dir']
imgs_s = []
imgs_f = []
MRI = sitk.GetArrayFromImage(sitk.ReadImage(frame_dir))
for idx in frames_idx_s:
item = MRI[idx]
item = cv2.resize(item, (224, 224))
imgs_s.append(item)
for idx in frames_idx_f:
item = MRI[idx]
item = cv2.resize(item, (224, 224))
imgs_f.append(item)
results['imgs'] = [imgs_s, imgs_f]
return results
def __call__(self, results):
"""
Args:
frames_len: length of frames.
return:
sampling id.
"""
frames_len = int(results['frames_len'])
average_dur1 = int(frames_len / self.num_seg[0])
average_dur2 = int(frames_len / self.num_seg[1])
frames_idx_s = []
frames_idx_f = []
if self.linspace_sample:
if 'start_idx' in results and 'end_idx' in results:
offsets_s = np.linspace(results['start_idx'],
results['end_idx'], self.num_seg[0])
offsets_f = np.linspace(results['start_idx'],
results['end_idx'], self.num_seg[1])
else:
offsets_s = np.linspace(0, frames_len - 1, self.num_seg[0])
offsets_f = np.linspace(0, frames_len - 1, self.num_seg[1])
offsets_s = np.clip(offsets_s, 0, frames_len - 1).astype(np.int64)
offsets_f = np.clip(offsets_f, 0, frames_len - 1).astype(np.int64)
frames_idx_s = list(offsets_s)
frames_idx_f = list(offsets_f)
return self._get(frames_idx_s, frames_idx_f, results)
if not self.select_left:
if self.dense_sample: # For ppTSM
if not self.valid_mode: # train
sample_pos = max(1, 1 + frames_len - 64)
t_stride1 = 64 // self.num_seg[0]
t_stride2 = 64 // self.num_seg[1]
start_idx = 0 if sample_pos == 1 else np.random.randint(
0, sample_pos - 1)
offsets_s = [(idx * t_stride1 + start_idx) % frames_len + 1
for idx in range(self.num_seg[0])]
offsets_f = [(idx * t_stride2 + start_idx) % frames_len + 1
for idx in range(self.num_seg[1])]
frames_idx_s = offsets_s
frames_idx_f = offsets_f
else:
sample_pos = max(1, 1 + frames_len - 64)
t_stride1 = 64 // self.num_seg[0]
t_stride2 = 64 // self.num_seg[1]
start_list = np.linspace(0,
sample_pos - 1,
num=10,
dtype=int)
offsets_s = []
offsets_f = []
for start_idx in start_list.tolist():
offsets_s += [
(idx * t_stride1 + start_idx) % frames_len + 1
for idx in range(self.num_seg[0])
]
for start_idx in start_list.tolist():
offsets_f += [
(idx * t_stride2 + start_idx) % frames_len + 1
for idx in range(self.num_seg[1])
]
frames_idx_s = offsets_s
frames_idx_f = offsets_f
else:
for i in range(self.num_seg[0]):
idx = 0
if not self.valid_mode:
if average_dur1 >= self.seg_len:
idx = random.randint(0, average_dur1 - self.seg_len)
idx += i * average_dur1
elif average_dur1 >= 1:
idx += i * average_dur1
else:
idx = i
else:
if average_dur1 >= self.seg_len:
idx = (average_dur1 - 1) // 2
idx += i * average_dur1
elif average_dur1 >= 1:
idx += i * average_dur1
else:
idx = i
for jj in range(idx, idx + self.seg_len):
frames_idx_s.append(jj)
for i in range(self.num_seg[1]):
idx = 0
if not self.valid_mode:
if average_dur2 >= self.seg_len:
idx = random.randint(0, average_dur2 - self.seg_len)
idx += i * average_dur2
elif average_dur2 >= 1:
idx += i * average_dur2
else:
idx = i
else:
if average_dur2 >= self.seg_len:
idx = (average_dur2 - 1) // 2
idx += i * average_dur2
elif average_dur2 >= 1:
idx += i * average_dur2
else:
idx = i
for jj in range(idx, idx + self.seg_len):
frames_idx_f.append(jj)
return self._get(frames_idx_s, frames_idx_f, results)
else: # for TSM
if not self.valid_mode:
if average_dur2 > 0:
offsets_s = np.multiply(list(range(
self.num_seg[0])), average_dur1) + np.random.randint(
average_dur1, size=self.num_seg[0])
offsets_f = np.multiply(list(range(
self.num_seg[1])), average_dur2) + np.random.randint(
average_dur2, size=self.num_seg[1])
elif frames_len > self.num_seg[1]:
offsets_s = np.sort(
np.random.randint(frames_len, size=self.num_seg[0]))
offsets_f = np.sort(
np.random.randint(frames_len, size=self.num_seg[1]))
else:
offsets_s = np.zeros(shape=(self.num_seg[0], ))
offsets_f = np.zeros(shape=(self.num_seg[1], ))
else:
if frames_len > self.num_seg[1]:
average_dur_float_s = frames_len / self.num_seg[0]
offsets_s = np.array([
int(average_dur_float_s / 2.0 + average_dur_float_s * x)
for x in range(self.num_seg[0])
])
average_dur_float_f = frames_len / self.num_seg[1]
offsets_f = np.array([
int(average_dur_float_f / 2.0 + average_dur_float_f * x)
for x in range(self.num_seg[1])
])
else:
offsets_s = np.zeros(shape=(self.num_seg[0], ))
offsets_f = np.zeros(shape=(self.num_seg[1], ))
frames_idx_s = list(offsets_s)
frames_idx_f = list(offsets_f)
return self._get(frames_idx_s, frames_idx_f, results)

@ -0,0 +1,116 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from ..registry import PIPELINES
@PIPELINES.register()
class Mixup(object):
"""
Mixup operator.
Args:
alpha(float): alpha value.
"""
def __init__(self, alpha=0.2):
        assert alpha > 0., \
            'parameter alpha[%f] should be > 0.0' % (alpha)
self.alpha = alpha
def __call__(self, batch):
imgs, labels = list(zip(*batch))
imgs = np.array(imgs)
labels = np.array(labels)
bs = len(batch)
idx = np.random.permutation(bs)
lam = np.random.beta(self.alpha, self.alpha)
lams = np.array([lam] * bs, dtype=np.float32)
imgs = lam * imgs + (1 - lam) * imgs[idx]
return list(zip(imgs, labels, labels[idx], lams))
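# Worked example: with lam = 0.7 each mixed image is 0.7 * img_i +
# 0.3 * img_perm(i); the returned (label, label[idx], lam) triplet lets the
# loss be blended the same way, e.g.
# 0.7 * CE(pred, y_i) + 0.3 * CE(pred, y_perm(i)).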
@PIPELINES.register()
class Cutmix(object):
""" Cutmix operator
Args:
alpha(float): alpha value.
"""
def __init__(self, alpha=0.2):
        assert alpha > 0., \
            'parameter alpha[%f] should be > 0.0' % (alpha)
self.alpha = alpha
def rand_bbox(self, size, lam):
""" rand_bbox """
w = size[2]
h = size[3]
cut_rat = np.sqrt(1. - lam)
        cut_w = int(w * cut_rat)
        cut_h = int(h * cut_rat)
# uniform
cx = np.random.randint(w)
cy = np.random.randint(h)
bbx1 = np.clip(cx - cut_w // 2, 0, w)
bby1 = np.clip(cy - cut_h // 2, 0, h)
bbx2 = np.clip(cx + cut_w // 2, 0, w)
bby2 = np.clip(cy + cut_h // 2, 0, h)
return bbx1, bby1, bbx2, bby2
def __call__(self, batch):
imgs, labels = list(zip(*batch))
imgs = np.array(imgs)
labels = np.array(labels)
bs = len(batch)
idx = np.random.permutation(bs)
lam = np.random.beta(self.alpha, self.alpha)
bbx1, bby1, bbx2, bby2 = self.rand_bbox(imgs.shape, lam)
imgs[:, :, bbx1:bbx2, bby1:bby2] = imgs[idx, :, bbx1:bbx2, bby1:bby2]
lam = 1 - (float(bbx2 - bbx1) * (bby2 - bby1) /
(imgs.shape[-2] * imgs.shape[-1]))
lams = np.array([lam] * bs, dtype=np.float32)
return list(zip(imgs, labels, labels[idx], lams))
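# Worked example: for 224x224 inputs and lam = 0.75, cut_rat = sqrt(0.25) = 0.5,
# so the pasted box is 112x112 before clipping; when nothing clips, lam is
# re-derived from the true box area as 1 - (112 * 112) / (224 * 224) = 0.75.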
@PIPELINES.register()
class VideoMix(object):
"""
VideoMix operator.
Args:
cutmix_prob(float): prob choose cutmix
mixup_alpha(float): alpha for mixup aug
cutmix_alpha(float): alpha for cutmix aug
"""
def __init__(self, cutmix_prob=0.5, mixup_alpha=0.2, cutmix_alpha=1.0):
        assert cutmix_prob > 0., \
            'parameter cutmix_prob[%f] should be > 0.0' % (cutmix_prob)
        assert mixup_alpha > 0., \
            'parameter mixup_alpha[%f] should be > 0.0' % (mixup_alpha)
        assert cutmix_alpha > 0., \
            'parameter cutmix_alpha[%f] should be > 0.0' % (cutmix_alpha)
self.cutmix_prob = cutmix_prob
self.mixup = Mixup(mixup_alpha)
self.cutmix = Cutmix(cutmix_alpha)
def __call__(self, batch):
if np.random.random() < self.cutmix_prob:
return self.cutmix(batch)
else:
return self.mixup(batch)

@ -0,0 +1,380 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
import numpy as np
from PIL import Image
import copy
import json
from ..registry import PIPELINES
try:
from paddlenlp.transformers import BertTokenizer
except ImportError as e:
print(
f"Warning! {e}, [paddlenlp] package and it's dependencies is required for ActBERT."
)
@PIPELINES.register()
class FeaturePadding(object):
"""
Padding feature to target shape.
"""
def __init__(self, max_region_num=36, max_action_num=5):
self.max_region_num = max_region_num
self.max_action_num = max_action_num
def __call__(self, results):
"""
Padding feature.
"""
pack_feature = results['feature']
tokenizer = results['tokenizer']
image_feature_wp, image_target_wp, image_location_wp, \
num_boxes, image_h, image_w, image_id, caption, \
action_feature_wp, action_target_wp, num_actions = pack_feature
image_feature = np.zeros((self.max_region_num, 2048), dtype=np.float32)
image_target = np.zeros((self.max_region_num, 1601), dtype=np.float32)
image_location = np.zeros((self.max_region_num, 5), dtype=np.float32)
action_feature = np.zeros((self.max_action_num, 2048), dtype=np.float32)
action_target = np.zeros((self.max_action_num, ), dtype=np.int64)
num_boxes = int(num_boxes)
image_feature[:num_boxes] = image_feature_wp
image_target[:num_boxes] = image_target_wp
image_location[:num_boxes, :4] = image_location_wp
image_location[:, 4] = (image_location[:, 3] - image_location[:, 1]) * (
image_location[:, 2] - image_location[:, 0]) / (float(image_w) *
float(image_h))
image_location[:, 0] = image_location[:, 0] / float(image_w)
image_location[:, 1] = image_location[:, 1] / float(image_h)
image_location[:, 2] = image_location[:, 2] / float(image_w)
image_location[:, 3] = image_location[:, 3] / float(image_h)
image_feature = copy.deepcopy(image_feature)
image_target = copy.deepcopy(image_target)
num_actions = int(num_actions)
action_feature[:num_actions] = action_feature_wp
action_target[:num_actions] = action_target_wp
action_feature = copy.deepcopy(action_feature)
action_target = copy.deepcopy(action_target)
results = dict(image_feat=image_feature,
image_target=image_target,
caption=caption,
image_loc=image_location,
num_boxes=int(num_boxes),
action_feat=action_feature,
action_target=action_target,
num_actions=int(num_actions),
tokenizer=tokenizer)
return results
@PIPELINES.register()
class RandomCap(object):
def __init__(self, caption_path):
"""
Random Caption for NSP task
"""
self.caption_path = caption_path
def select_caption(self, caption):
captions = caption.split('!')
rind = random.randint(0, len(captions) - 1)
caption = captions[rind]
return caption
def get_random_caption(self, all_captions):
num_caps = len(all_captions)
rand_doc_idx = random.randint(0, num_caps - 1)
caption = all_captions[rand_doc_idx]
caption = self.select_caption(caption)
return caption
def random_cap(self, caption, all_captions):
if random.random() > 0.5:
label = 0
else:
caption = self.get_random_caption(all_captions)
label = 1
return caption, label
def __call__(self, results):
caption = results['caption']
        with open(self.caption_path, 'r') as f:
            all_captions = list(json.load(f))
caption = self.select_caption(caption)
caption, label = self.random_cap(caption, all_captions)
results['caption'] = caption
results['is_next'] = label
return results
@PIPELINES.register()
class Tokenize(object):
def __init__(self, ):
"""
Tokenize caption
"""
pass
def __call__(self, results):
caption = results['caption']
tokenizer = results['tokenizer']
tokens_caption = tokenizer.tokenize(caption)
results['caption'] = tokens_caption
return results
@PIPELINES.register()
class RandomMask(object):
def __init__(self,
max_seq_length=36,
max_action_length=5,
max_region_length=36):
self.max_seq_length = max_seq_length
self.max_action_length = max_action_length
self.max_region_length = max_region_length
def get_image_global_feature(self, image_feat, image_loc, image_mask):
g_image_feat = np.sum(image_feat, axis=0) / np.sum(
image_mask, axis=0, keepdims=True)
image_feat = np.concatenate(
[np.expand_dims(g_image_feat, axis=0), image_feat],
axis=0).astype("float32")
g_image_loc = np.array([0, 0, 1, 1, 1]).astype("float32")
image_loc = np.concatenate(
[np.expand_dims(g_image_loc, axis=0), image_loc], axis=0)
g_image_mask = np.array([1])
image_mask = np.concatenate([g_image_mask, image_mask], axis=0)
return image_feat, image_loc, image_mask
def _truncate_seq_pair(self, tokens_b, max_length):
"""Truncates a sequence pair in place to the maximum length.
This is a simple heuristic which will always truncate the longer sequence
one token at a time. This makes more sense than truncating an equal percent
of tokens from each, since if one sequence is very short then each token
that's truncated likely contains more information than a longer sequence.
"""
while True:
total_length = len(tokens_b)
if total_length <= max_length:
break
tokens_b.pop()
def random_word(self, tokens, tokenizer):
"""
Masking some random tokens for Language Model task with probabilities as in the original BERT paper.
Args:
tokens: list of str, tokenized sentence.
            tokenizer: Tokenizer, object used for tokenization (we need its vocab here)
Return:
(list of str, list of int), masked tokens and related labels for LM prediction
"""
output_label = []
for i, token in enumerate(tokens):
prob = random.random()
# mask token with 15% probability
if prob < 0.15:
prob /= 0.15
# 80% randomly change token to mask token
if prob < 0.8:
tokens[i] = "[MASK]"
# 10% randomly change token to random token
elif prob < 0.9:
#tok = random.choice(list(tokenizer.vocab.items()))[0]
                    tok = tokenizer.vocab.idx_to_token[random.randint(
                        0, tokenizer.vocab_size - 1)]
tokens[i] = tok
# rest 10% randomly keep current token
# append current token to output (we will predict these later)
try:
output_label.append(tokenizer.vocab[token])
except KeyError:
# For unknown words (should not occur with BPE vocab)
output_label.append(tokenizer.vocab["[UNK]"])
                    print(
                        "Cannot find token '{}' in vocab. Using [UNK] instead."
                        .format(token))
else:
# no masking token (will be ignored by loss function later)
output_label.append(-1)
return tokens, output_label
def random_region(self, image_feat, image_loc, num_boxes):
output_label = []
for i in range(num_boxes):
prob = random.random()
# mask token with 15% probability
if prob < 0.15:
prob /= 0.15
                # 90% of selected regions have their feature zeroed out
                if prob < 0.9:
                    image_feat[i] = 0
                # the remaining 10% keep the current feature
# append current token to output (we will predict these later)
output_label.append(1)
else:
# no masking token (will be ignored by loss function later)
output_label.append(-1)
return image_feat, image_loc, output_label
def random_action(self, action_feat, action_target, num_actions):
output_label = []
for i in range(num_actions):
prob = random.random()
# mask token with 15% probability
if prob < 0.15:
prob /= 0.15
# 90% randomly change token to mask token
if prob < 0.9:
action_feat[i] = 0
# rest 10% randomly keep current token
# append current token to output (we will predict these later)
output_label.append(action_target[i])
else:
# no masking token (will be ignored by loss function later)
output_label.append(-1)
return action_feat, output_label
def __call__(self, results):
caption = results['caption']
tokenizer = results['tokenizer']
image_feat = results['image_feat']
image_loc = results['image_loc']
num_boxes = results['num_boxes']
action_feat = results['action_feat']
action_target = results['action_target']
num_actions = results['num_actions']
is_next = results['is_next']
image_target = results['image_target']
self._truncate_seq_pair(caption, self.max_seq_length - 2)
caption, caption_label = self.random_word(caption, tokenizer)
image_feat, image_loc, image_label = self.random_region(
image_feat, image_loc, num_boxes)
action_feat, action_label = self.random_action(action_feat,
action_target,
num_actions)
# concatenate lm labels and account for CLS, SEP, SEP
lm_label_ids = [-1] + caption_label + [-1]
# The convention in BERT is:
# (a) For sequence pairs:
# tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
# type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
# (b) For single sequences:
# tokens: [CLS] the dog is hairy . [SEP]
# type_ids: 0 0 0 0 0 0 0
#
# Where "type_ids" are used to indicate whether this is the first
# sequence or the second sequence. The embedding vectors for `type=0` and
# `type=1` were learned during pre-training and are added to the wordpiece
# embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it makes
# it easier for the model to learn the concept of sequences.
#
# For classification tasks, the first vector (corresponding to [CLS]) is
        # used as the "sentence vector". Note that this only makes sense because
# the entire model is fine-tuned.
tokens = []
segment_ids = []
tokens.append("[CLS]")
segment_ids.append(0)
for token in caption:
tokens.append(token)
segment_ids.append(0)
tokens.append("[SEP]")
segment_ids.append(0)
input_ids = tokenizer.convert_tokens_to_ids(tokens)
# The mask has 1 for real tokens and 0 for padding tokens. Only real tokens are attended to.
input_mask = [1] * (len(input_ids))
image_mask = [1] * (num_boxes)
action_mask = [1] * (num_actions)
# Zero-pad up to the visual sequence length.
while len(image_mask) < self.max_region_length:
image_mask.append(0)
image_label.append(-1)
while len(action_mask) < self.max_action_length:
action_mask.append(0)
action_label.append(-1)
# Zero-pad up to the sequence length.
while len(input_ids) < self.max_seq_length:
input_ids.append(0)
input_mask.append(0)
segment_ids.append(0)
lm_label_ids.append(-1)
assert len(input_ids) == self.max_seq_length
assert len(input_mask) == self.max_seq_length
assert len(segment_ids) == self.max_seq_length
assert len(lm_label_ids) == self.max_seq_length
assert len(image_mask) == self.max_region_length
assert len(image_label) == self.max_region_length
assert len(action_mask) == self.max_action_length
assert len(action_label) == self.max_action_length
image_feat, image_loc, image_mask = self.get_image_global_feature(
image_feat, image_loc, np.array(image_mask))
features = [
np.array(input_ids),
action_feat,
image_feat,
image_loc,
np.array(segment_ids),
np.array(input_mask),
image_mask,
np.array(action_mask),
np.array(lm_label_ids),
np.array(action_label),
np.array(is_next),
np.array(image_label),
image_target,
]
results['features'] = features
return results
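# Net masking distribution implemented by random_word above: each token is
# selected with p = 0.15; a selected token becomes [MASK] with p = 0.8, a
# random vocab token with p = 0.1 and is kept unchanged with p = 0.1, i.e.
# 12% / 1.5% / 1.5% of all tokens, matching the original BERT recipe.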

@ -0,0 +1,382 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import random
import numpy as np
from PIL import Image
try:
import SimpleITK as sitk
except ImportError as e:
print(
f"Warning! {e}, [SimpleITK] package and it's dependencies is required for PP-Care."
)
import cv2
from ..registry import PIPELINES
try:
import cPickle as pickle
from cStringIO import StringIO
except ImportError:
import pickle
from io import BytesIO
@PIPELINES.register()
class Sampler(object):
"""
    Sample frame ids.
    NOTE: Uses PIL to read images here, which differs slightly from CV2.
    Args:
        num_seg(int): number of segments.
        seg_len(int): number of sampled frames in each segment.
        valid_mode(bool): True or False.
        select_left: Whether to select the left-of-middle frame when the sampling interval is even in test mode.
    Returns:
        frames_idx: the indices of the sampled frames.
"""
def __init__(self,
num_seg,
seg_len,
frame_interval=None,
valid_mode=False,
select_left=False,
dense_sample=False,
linspace_sample=False,
use_pil=True):
self.num_seg = num_seg
self.seg_len = seg_len
self.frame_interval = frame_interval
self.valid_mode = valid_mode
self.select_left = select_left
self.dense_sample = dense_sample
self.linspace_sample = linspace_sample
self.use_pil = use_pil
def _get(self, frames_idx, results):
data_format = results['format']
if data_format == "frame":
frame_dir = results['frame_dir']
imgs = []
for idx in frames_idx:
img = Image.open(
os.path.join(frame_dir,
results['suffix'].format(idx))).convert('RGB')
imgs.append(img)
elif data_format == "MRI":
frame_dir = results['frame_dir']
imgs = []
MRI = sitk.GetArrayFromImage(sitk.ReadImage(frame_dir))
for idx in frames_idx:
item = MRI[idx]
item = cv2.resize(item, (224, 224))
imgs.append(item)
elif data_format == "video":
if results['backend'] == 'cv2':
frames = np.array(results['frames'])
imgs = []
for idx in frames_idx:
imgbuf = frames[idx]
img = Image.fromarray(imgbuf, mode='RGB')
imgs.append(img)
elif results['backend'] == 'decord':
container = results['frames']
if self.use_pil:
frames_select = container.get_batch(frames_idx)
# dearray_to_img
np_frames = frames_select.asnumpy()
imgs = []
for i in range(np_frames.shape[0]):
imgbuf = np_frames[i]
imgs.append(Image.fromarray(imgbuf, mode='RGB'))
else:
if frames_idx.ndim != 1:
frames_idx = np.squeeze(frames_idx)
frame_dict = {
idx: container[idx].asnumpy()
for idx in np.unique(frames_idx)
}
imgs = [frame_dict[idx] for idx in frames_idx]
elif results['backend'] == 'pyav':
imgs = []
frames = np.array(results['frames'])
for idx in frames_idx:
if self.dense_sample:
idx = idx - 1
imgbuf = frames[idx]
imgs.append(imgbuf)
imgs = np.stack(imgs) # thwc
else:
raise NotImplementedError
else:
raise NotImplementedError
results['imgs'] = imgs
return results
def _get_train_clips(self, num_frames):
ori_seg_len = self.seg_len * self.frame_interval
avg_interval = (num_frames - ori_seg_len + 1) // self.num_seg
if avg_interval > 0:
base_offsets = np.arange(self.num_seg) * avg_interval
clip_offsets = base_offsets + np.random.randint(avg_interval,
size=self.num_seg)
elif num_frames > max(self.num_seg, ori_seg_len):
clip_offsets = np.sort(
np.random.randint(num_frames - ori_seg_len + 1,
size=self.num_seg))
elif avg_interval == 0:
ratio = (num_frames - ori_seg_len + 1.0) / self.num_seg
clip_offsets = np.around(np.arange(self.num_seg) * ratio)
else:
            clip_offsets = np.zeros((self.num_seg, ), dtype=np.int64)
return clip_offsets
def _get_test_clips(self, num_frames):
ori_seg_len = self.seg_len * self.frame_interval
avg_interval = (num_frames - ori_seg_len + 1) / float(self.num_seg)
if num_frames > ori_seg_len - 1:
base_offsets = np.arange(self.num_seg) * avg_interval
            clip_offsets = (base_offsets + avg_interval / 2.0).astype(np.int64)
else:
            clip_offsets = np.zeros((self.num_seg, ), dtype=np.int64)
return clip_offsets
def __call__(self, results):
"""
Args:
frames_len: length of frames.
return:
sampling id.
"""
frames_len = int(results['frames_len'])
frames_idx = []
if self.frame_interval is not None:
assert isinstance(self.frame_interval, int)
if not self.valid_mode:
offsets = self._get_train_clips(frames_len)
else:
offsets = self._get_test_clips(frames_len)
offsets = offsets[:, None] + np.arange(
self.seg_len)[None, :] * self.frame_interval
offsets = np.concatenate(offsets)
offsets = offsets.reshape((-1, self.seg_len))
offsets = np.mod(offsets, frames_len)
offsets = np.concatenate(offsets)
if results['format'] == 'video':
frames_idx = offsets
elif results['format'] == 'frame':
frames_idx = list(offsets + 1)
else:
raise NotImplementedError
return self._get(frames_idx, results)
if self.linspace_sample:
if 'start_idx' in results and 'end_idx' in results:
offsets = np.linspace(results['start_idx'], results['end_idx'],
self.num_seg)
else:
offsets = np.linspace(0, frames_len - 1, self.num_seg)
offsets = np.clip(offsets, 0, frames_len - 1).astype(np.int64)
if results['format'] == 'video':
frames_idx = list(offsets)
frames_idx = [x % frames_len for x in frames_idx]
elif results['format'] == 'frame':
frames_idx = list(offsets + 1)
elif results['format'] == 'MRI':
frames_idx = list(offsets)
else:
raise NotImplementedError
return self._get(frames_idx, results)
average_dur = int(frames_len / self.num_seg)
if not self.select_left:
if self.dense_sample: # For ppTSM
if not self.valid_mode: # train
sample_pos = max(1, 1 + frames_len - 64)
t_stride = 64 // self.num_seg
start_idx = 0 if sample_pos == 1 else np.random.randint(
0, sample_pos - 1)
offsets = [(idx * t_stride + start_idx) % frames_len + 1
for idx in range(self.num_seg)]
frames_idx = offsets
else:
sample_pos = max(1, 1 + frames_len - 64)
t_stride = 64 // self.num_seg
start_list = np.linspace(0,
sample_pos - 1,
num=10,
dtype=int)
offsets = []
for start_idx in start_list.tolist():
offsets += [
(idx * t_stride + start_idx) % frames_len + 1
for idx in range(self.num_seg)
]
frames_idx = offsets
else:
for i in range(self.num_seg):
idx = 0
if not self.valid_mode:
if average_dur >= self.seg_len:
idx = random.randint(0, average_dur - self.seg_len)
idx += i * average_dur
elif average_dur >= 1:
idx += i * average_dur
else:
idx = i
else:
if average_dur >= self.seg_len:
idx = (average_dur - 1) // 2
idx += i * average_dur
elif average_dur >= 1:
idx += i * average_dur
else:
idx = i
for jj in range(idx, idx + self.seg_len):
if results['format'] == 'video':
frames_idx.append(int(jj % frames_len))
elif results['format'] == 'frame':
frames_idx.append(jj + 1)
elif results['format'] == 'MRI':
frames_idx.append(jj)
else:
raise NotImplementedError
return self._get(frames_idx, results)
else: # for TSM
if not self.valid_mode:
if average_dur > 0:
offsets = np.multiply(list(range(self.num_seg)),
average_dur) + np.random.randint(
average_dur, size=self.num_seg)
elif frames_len > self.num_seg:
offsets = np.sort(
np.random.randint(frames_len, size=self.num_seg))
else:
offsets = np.zeros(shape=(self.num_seg, ))
else:
if frames_len > self.num_seg:
average_dur_float = frames_len / self.num_seg
offsets = np.array([
int(average_dur_float / 2.0 + average_dur_float * x)
for x in range(self.num_seg)
])
else:
offsets = np.zeros(shape=(self.num_seg, ))
if results['format'] == 'video':
frames_idx = list(offsets)
frames_idx = [x % frames_len for x in frames_idx]
elif results['format'] == 'frame':
frames_idx = list(offsets + 1)
elif results['format'] == 'MRI':
frames_idx = list(offsets)
else:
raise NotImplementedError
return self._get(frames_idx, results)
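# Worked example for the select_left (TSM) branch above: frames_len = 80,
# num_seg = 8, seg_len = 1 in valid mode gives average_dur_float = 10 and
# offsets [5, 15, ..., 75]; in train mode each segment instead draws a random
# start inside its own span of 10 frames.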
@PIPELINES.register()
class SamplerPkl(object):
"""
    Sample frame ids.
    NOTE: Uses PIL to read images here, which differs slightly from CV2.
    Args:
        num_seg(int): number of segments.
        seg_len(int): number of sampled frames in each segment.
        backend(str): image decode backend, 'pillow' by default.
        valid_mode(bool): True or False.
    Returns:
        frames_idx: the indices of the sampled frames.
"""
def __init__(self, num_seg, seg_len, backend='pillow', valid_mode=False):
self.num_seg = num_seg
self.seg_len = seg_len
self.valid_mode = valid_mode
self.backend = backend
def _get(self, buf):
if isinstance(buf, str):
img = Image.open(StringIO(buf))
else:
img = Image.open(BytesIO(buf))
img = img.convert('RGB')
if self.backend != 'pillow':
img = np.array(img)
return img
def __call__(self, results):
"""
Args:
frames_len: length of frames.
return:
sampling id.
"""
filename = results['frame_dir']
        with open(filename, 'rb') as f:
            data_loaded = pickle.load(f, encoding='bytes')
video_name, label, frames = data_loaded
if isinstance(label, dict):
            label = label['动作类型']  # dict key meaning "action type"
results['labels'] = label
elif len(label) == 1:
results['labels'] = int(label[0])
else:
results['labels'] = int(label[0]) if random.random() < 0.5 else int(
label[1])
results['frames_len'] = len(frames)
frames_len = results['frames_len']
        average_dur = frames_len // self.num_seg
imgs = []
for i in range(self.num_seg):
idx = 0
if not self.valid_mode:
if average_dur >= self.seg_len:
idx = random.randint(0, average_dur - self.seg_len)
idx += i * average_dur
elif average_dur >= 1:
idx += i * average_dur
else:
idx = i
else:
if average_dur >= self.seg_len:
idx = (average_dur - 1) // 2
idx += i * average_dur
elif average_dur >= 1:
idx += i * average_dur
else:
idx = i
for jj in range(idx, idx + self.seg_len):
imgbuf = frames[int(jj % results['frames_len'])]
img = self._get(imgbuf)
imgs.append(img)
results['backend'] = self.backend
results['imgs'] = imgs
return results

@ -0,0 +1,375 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
from PIL import Image
from ..registry import PIPELINES
import os
import numpy as np
import io
import os.path as osp
from abc import ABCMeta, abstractmethod
import cv2
from cv2 import IMREAD_COLOR, IMREAD_GRAYSCALE, IMREAD_UNCHANGED
import inspect
imread_backend = 'cv2'
imread_flags = {
'color': IMREAD_COLOR,
'grayscale': IMREAD_GRAYSCALE,
'unchanged': IMREAD_UNCHANGED
}
@PIPELINES.register()
class SampleFrames:
"""Sample frames from the video. """
def __init__(self,
clip_len,
frame_interval=1,
num_clips=1,
temporal_jitter=False,
twice_sample=False,
out_of_bound_opt='loop',
test_mode=False):
self.clip_len = clip_len
self.frame_interval = frame_interval
self.num_clips = num_clips
self.temporal_jitter = temporal_jitter
self.twice_sample = twice_sample
self.out_of_bound_opt = out_of_bound_opt
self.test_mode = test_mode
assert self.out_of_bound_opt in ['loop', 'repeat_last']
def _get_train_clips(self, num_frames):
"""Get clip offsets in train mode. """
ori_clip_len = self.clip_len * self.frame_interval
avg_interval = (num_frames - ori_clip_len + 1) // self.num_clips
if avg_interval > 0:
base_offsets = np.arange(self.num_clips) * avg_interval
clip_offsets = base_offsets + np.random.randint(
avg_interval, size=self.num_clips)
elif num_frames > max(self.num_clips, ori_clip_len):
clip_offsets = np.sort(
np.random.randint(
num_frames - ori_clip_len + 1, size=self.num_clips))
elif avg_interval == 0:
ratio = (num_frames - ori_clip_len + 1.0) / self.num_clips
clip_offsets = np.around(np.arange(self.num_clips) * ratio)
else:
            clip_offsets = np.zeros((self.num_clips, ), dtype=np.int64)
return clip_offsets
def _get_test_clips(self, num_frames):
"""Get clip offsets in test mode. """
ori_clip_len = self.clip_len * self.frame_interval
avg_interval = (num_frames - ori_clip_len + 1) / float(self.num_clips)
if num_frames > ori_clip_len - 1:
base_offsets = np.arange(self.num_clips) * avg_interval
            clip_offsets = (base_offsets + avg_interval / 2.0).astype(np.int64)
if self.twice_sample:
clip_offsets = np.concatenate([clip_offsets, base_offsets])
else:
            clip_offsets = np.zeros((self.num_clips, ), dtype=np.int64)
return clip_offsets
def _sample_clips(self, num_frames):
"""Choose clip offsets for the video in a given mode. """
if self.test_mode:
clip_offsets = self._get_test_clips(num_frames)
else:
clip_offsets = self._get_train_clips(num_frames)
return clip_offsets
def __call__(self, results):
"""Perform the SampleFrames loading. """
total_frames = results['total_frames']
clip_offsets = self._sample_clips(total_frames)
frame_inds = clip_offsets[:, None] + np.arange(
self.clip_len)[None, :] * self.frame_interval
frame_inds = np.concatenate(frame_inds)
if self.temporal_jitter:
perframe_offsets = np.random.randint(
self.frame_interval, size=len(frame_inds))
frame_inds += perframe_offsets
frame_inds = frame_inds.reshape((-1, self.clip_len))
if self.out_of_bound_opt == 'loop':
frame_inds = np.mod(frame_inds, total_frames)
elif self.out_of_bound_opt == 'repeat_last':
safe_inds = frame_inds < total_frames
unsafe_inds = 1 - safe_inds
last_ind = np.max(safe_inds * frame_inds, axis=1)
new_inds = (safe_inds * frame_inds + (unsafe_inds.T * last_ind).T)
frame_inds = new_inds
else:
raise ValueError('Illegal out_of_bound option.')
start_index = results['start_index']
frame_inds = np.concatenate(frame_inds) + start_index
        results['frame_inds'] = frame_inds.astype(np.int64)
results['clip_len'] = self.clip_len
results['frame_interval'] = self.frame_interval
results['num_clips'] = self.num_clips
return results
def __repr__(self):
repr_str = (f'{self.__class__.__name__}('
f'clip_len={self.clip_len}, '
f'frame_interval={self.frame_interval}, '
f'num_clips={self.num_clips}, '
f'temporal_jitter={self.temporal_jitter}, '
f'twice_sample={self.twice_sample}, '
f'out_of_bound_opt={self.out_of_bound_opt}, '
f'test_mode={self.test_mode})')
return repr_str
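# Worked example: total_frames = 100, clip_len = 8, frame_interval = 2 and
# num_clips = 1 in test mode give ori_clip_len = 16, avg_interval = 85.0 and a
# single clip offset int(85 / 2) = 42, so frame_inds = 42, 44, ..., 56 (plus
# start_index).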
class BaseStorageBackend(metaclass=ABCMeta):
"""Abstract class of storage backends. """
@abstractmethod
def get(self, filepath):
pass
@abstractmethod
def get_text(self, filepath):
pass
class HardDiskBackend(BaseStorageBackend):
"""Raw hard disks storage backend."""
def get(self, filepath):
filepath = str(filepath)
with open(filepath, 'rb') as f:
value_buf = f.read()
return value_buf
def get_text(self, filepath):
filepath = str(filepath)
with open(filepath, 'r') as f:
value_buf = f.read()
return value_buf
class FileClient:
"""A general file client to access files in different backend. """
_backends = {
'disk': HardDiskBackend,
}
def __init__(self, backend='disk', **kwargs):
if backend not in self._backends:
raise ValueError(
f'Backend {backend} is not supported. Currently supported ones'
f' are {list(self._backends.keys())}')
self.backend = backend
self.client = self._backends[backend](**kwargs)
@classmethod
def _register_backend(cls, name, backend, force=False):
if not isinstance(name, str):
raise TypeError('the backend name should be a string, '
f'but got {type(name)}')
if not inspect.isclass(backend):
raise TypeError(
f'backend should be a class but got {type(backend)}')
if not issubclass(backend, BaseStorageBackend):
raise TypeError(
f'backend {backend} is not a subclass of BaseStorageBackend')
if not force and name in cls._backends:
raise KeyError(
f'{name} is already registered as a storage backend, '
'add "force=True" if you want to override it')
cls._backends[name] = backend
@classmethod
def register_backend(cls, name, backend=None, force=False):
"""Register a backend to FileClient. """
if backend is not None:
cls._register_backend(name, backend, force=force)
return
def _register(backend_cls):
cls._register_backend(name, backend_cls, force=force)
return backend_cls
return _register
def get(self, filepath):
return self.client.get(filepath)
def get_text(self, filepath):
return self.client.get_text(filepath)
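# Minimal sketch (illustrative, names made up): the decorator form of
# register_backend above adds a hypothetical in-memory backend, e.g. for
# unit tests.
@FileClient.register_backend('memory')
class _MemoryBackend(BaseStorageBackend):
    store = {}  # filepath (str) -> bytes

    def get(self, filepath):
        return self.store[str(filepath)]

    def get_text(self, filepath):
        return self.store[str(filepath)].decode('utf-8')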
@PIPELINES.register()
class RawFrameDecode:
"""Load and decode frames with given indices. """
def __init__(self, io_backend='disk', decoding_backend='cv2', **kwargs):
self.io_backend = io_backend
self.decoding_backend = decoding_backend
self.kwargs = kwargs
self.file_client = None
def _pillow2array(self,img, flag='color', channel_order='bgr'):
"""Convert a pillow image to numpy array. """
channel_order = channel_order.lower()
if channel_order not in ['rgb', 'bgr']:
raise ValueError('channel order must be either "rgb" or "bgr"')
if flag == 'unchanged':
array = np.array(img)
if array.ndim >= 3 and array.shape[2] >= 3: # color image
array[:, :, :3] = array[:, :, (2, 1, 0)] # RGB to BGR
else:
# If the image mode is not 'RGB', convert it to 'RGB' first.
if img.mode != 'RGB':
if img.mode != 'LA':
# Most formats except 'LA' can be directly converted to RGB
img = img.convert('RGB')
else:
# When the mode is 'LA', the default conversion will fill in
# the canvas with black, which sometimes shadows black objects
# in the foreground.
#
# Therefore, a random color (124, 117, 104) is used for canvas
img_rgba = img.convert('RGBA')
img = Image.new('RGB', img_rgba.size, (124, 117, 104))
img.paste(img_rgba, mask=img_rgba.split()[3]) # 3 is alpha
if flag == 'color':
array = np.array(img)
if channel_order != 'rgb':
array = array[:, :, ::-1] # RGB to BGR
elif flag == 'grayscale':
img = img.convert('L')
array = np.array(img)
else:
raise ValueError(
'flag must be "color", "grayscale" or "unchanged", '
f'but got {flag}')
return array
    def _imfrombytes(self,content, flag='color', channel_order='bgr'):
"""Read an image from bytes. """
img_np = np.frombuffer(content, np.uint8)
flag = imread_flags[flag] if isinstance(flag, str) else flag
img = cv2.imdecode(img_np, flag)
if flag == IMREAD_COLOR and channel_order == 'rgb':
cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img)
return img
def __call__(self, results):
"""Perform the ``RawFrameDecode`` to pick frames given indices.
Args:
results (dict): The resulting dict to be modified and passed
to the next transform in pipeline.
"""
# mmcv.use_backend(self.decoding_backend)
directory = results['frame_dir']
suffix = results['suffix']
#modality = results['modality']
if self.file_client is None:
self.file_client = FileClient(self.io_backend, **self.kwargs)
imgs = list()
if results['frame_inds'].ndim != 1:
results['frame_inds'] = np.squeeze(results['frame_inds'])
offset = results.get('offset', 0)
for frame_idx in results['frame_inds']:
frame_idx += offset
filepath = osp.join(directory, suffix.format(frame_idx))
            img_bytes = self.file_client.get(filepath)  # read the image file as raw bytes
# Get frame with channel order RGB directly.
cur_frame = self._imfrombytes(img_bytes, channel_order='rgb')
imgs.append(cur_frame)
results['imgs'] = imgs
results['original_shape'] = imgs[0].shape[:2]
results['img_shape'] = imgs[0].shape[:2]
# we resize the gt_bboxes and proposals to their real scale
h, w = results['img_shape']
scale_factor = np.array([w, h, w, h])
if 'gt_bboxes' in results:
gt_bboxes = results['gt_bboxes']
gt_bboxes_new = (gt_bboxes * scale_factor).astype(np.float32)
results['gt_bboxes'] = gt_bboxes_new
if 'proposals' in results and results['proposals'] is not None:
proposals = results['proposals']
proposals = (proposals * scale_factor).astype(np.float32)
results['proposals'] = proposals
return results
def __repr__(self):
repr_str = (f'{self.__class__.__name__}('
f'io_backend={self.io_backend}, '
f'decoding_backend={self.decoding_backend})')
return repr_str
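
A minimal usage sketch for RawFrameDecode, assuming rawframes named img_00001.jpg and so on under a hypothetical directory:

import numpy as np

decode = RawFrameDecode(io_backend='disk')
results = {
    'frame_dir': '/data/rawframes/video_0001',  # hypothetical path
    'suffix': 'img_{:05}.jpg',
    'frame_inds': np.array([1, 3, 5]),
}
results = decode(results)
# results['imgs'] is a list of H x W x 3 RGB arrays;
# results['img_shape'] is (H, W) of the first decoded frame.
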
@PIPELINES.register()
class SampleAVAFrames(SampleFrames):
def __init__(self, clip_len, frame_interval=2, test_mode=False):
super().__init__(clip_len, frame_interval, test_mode=test_mode)
def _get_clips(self, center_index, skip_offsets, shot_info):
start = center_index - (self.clip_len // 2) * self.frame_interval
end = center_index + ((self.clip_len + 1) // 2) * self.frame_interval
frame_inds = list(range(start, end, self.frame_interval))
frame_inds = frame_inds + skip_offsets
frame_inds = np.clip(frame_inds, shot_info[0], shot_info[1] - 1)
return frame_inds
def __call__(self, results):
fps = results['fps']
timestamp = results['timestamp']
timestamp_start = results['timestamp_start']
shot_info = results['shot_info']
        # delta = (timestamp - timestamp_start) is this frame's offset, in
        # seconds, from the start of the 15-min video, so center_index =
        # fps * delta is the same offset in frames. The +1 presumably guards
        # against negative indices in the sampling below, which takes a clip
        # of frames centred on center_index.
center_index = fps * (timestamp - timestamp_start) + 1
skip_offsets = np.random.randint(
-self.frame_interval // 2, (self.frame_interval + 1) // 2,
size=self.clip_len)
frame_inds = self._get_clips(center_index, skip_offsets, shot_info)
        results['frame_inds'] = np.array(frame_inds, dtype=np.int64)
results['clip_len'] = self.clip_len
results['frame_interval'] = self.frame_interval
results['num_clips'] = 1
results['crop_quadruple'] = np.array([0, 0, 1, 1], dtype=np.float32)
return results
def __repr__(self):
repr_str = (f'{self.__class__.__name__}('
f'clip_len={self.clip_len}, '
f'frame_interval={self.frame_interval}, '
f'test_mode={self.test_mode})')
return repr_str
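
Worked through with hypothetical AVA-style numbers, the centre-index arithmetic above behaves as follows:

fps, timestamp, timestamp_start = 30, 902, 900
center_index = fps * (timestamp - timestamp_start) + 1       # 61
clip_len, frame_interval = 32, 2
start = center_index - (clip_len // 2) * frame_interval      # 29
end = center_index + ((clip_len + 1) // 2) * frame_interval  # 93
inds = list(range(start, end, frame_interval))               # 29, 31, ..., 91 (32 indices)
# _get_clips then adds the random skip_offsets and clips into shot_info bounds.
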

@ -0,0 +1,69 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import random
from PIL import Image
from ..registry import PIPELINES
@PIPELINES.register()
class SamplerUCF24(object):
"""
    Sample frame ids.
    NOTE: PIL is used to read images here, which differs slightly from CV2 decoding.
Args:
num_frames(int): The amount of frames used in a video
frame_interval(int): Sampling rate
        valid_mode(bool): whether the sampler runs in validation mode.
Returns:
        frames_idx: the indices of the sampled frames.
"""
def __init__(self,
num_frames=16,
frame_interval=1,
valid_mode=False):
self.num_frames = num_frames
        # in training mode the interval is drawn randomly (once, at construction)
        self.frame_interval = frame_interval if valid_mode else random.randint(1, 2)
self.valid_mode = valid_mode
def _get(self, frames_idxs, img_folder, results):
imgs = []
for idx in frames_idxs:
img = Image.open(
os.path.join(img_folder, '{:05d}.jpg'.format(idx))).convert('RGB')
imgs.append(img)
results['imgs'] = imgs
return results
def _make_clip(self, im_ind, max_num):
frame_idxs = []
for i in reversed(range(self.num_frames)):
# make it as a loop
i_temp = im_ind - i * self.frame_interval
if i_temp < 1:
i_temp = 1
elif i_temp > max_num:
i_temp = max_num
frame_idxs.append(i_temp)
return frame_idxs
def __call__(self, results):
img_folder, key_frame = os.path.split(results['filename'])
frame_len = len(os.listdir(img_folder))
key_idx = int(key_frame[0:5])
frame_idxs = self._make_clip(key_idx, frame_len)
return self._get(frame_idxs, img_folder, results)
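
A hedged usage sketch for SamplerUCF24; the folder layout is hypothetical but follows the 00042.jpg naming the class expects:

sampler = SamplerUCF24(num_frames=16, frame_interval=1, valid_mode=True)
results = {'filename': '/data/ucf24/rgb-images/Basketball/v_0001/00042.jpg'}
results = sampler(results)
# results['imgs'] holds 16 PIL RGB images, frames 00027.jpg ... 00042.jpg;
# _make_clip clamps out-of-range indices into [1, frame_len].
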

@ -0,0 +1,130 @@
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from PIL import Image
import copy
import cv2
from ..registry import PIPELINES
@PIPELINES.register()
class MultiRestrictSize(object):
def __init__(self,
min_size=None,
max_size=800,
flip=False,
multi_scale=[1.3]):
self.min_size = min_size
self.max_size = max_size
self.multi_scale = multi_scale
self.flip = flip
        assert (min_size is None) or (max_size is None)  # min_size and max_size must not both be set
def __call__(self, sample):
samples = []
image = sample['current_img']
h, w = image.shape[:2]
for scale in self.multi_scale:
# Fixed range of scales
sc = None
            # Align the short edge when min_size is given, else the long edge
            if self.min_size is not None:
                short_edge = min(h, w)
                if short_edge > self.min_size:
                    sc = float(self.min_size) / short_edge
            else:
                long_edge = max(h, w)
                if long_edge > self.max_size:
                    sc = float(self.max_size) / long_edge
if sc is None:
new_h = h
new_w = w
else:
new_h = sc * h
new_w = sc * w
new_h = int(new_h * scale)
new_w = int(new_w * scale)
if (new_h - 1) % 16 != 0:
new_h = int(np.around((new_h - 1) / 16.) * 16 + 1)
if (new_w - 1) % 16 != 0:
new_w = int(np.around((new_w - 1) / 16.) * 16 + 1)
if new_h == h and new_w == w:
samples.append(sample)
else:
new_sample = {}
for elem in sample.keys():
if 'meta' in elem:
new_sample[elem] = sample[elem]
continue
tmp = sample[elem]
if 'label' in elem:
new_sample[elem] = sample[elem]
continue
else:
flagval = cv2.INTER_CUBIC
tmp = cv2.resize(tmp,
dsize=(new_w, new_h),
interpolation=flagval)
new_sample[elem] = tmp
samples.append(new_sample)
if self.flip:
now_sample = samples[-1]
new_sample = {}
for elem in now_sample.keys():
if 'meta' in elem:
new_sample[elem] = now_sample[elem].copy()
new_sample[elem]['flip'] = True
continue
tmp = now_sample[elem]
tmp = tmp[:, ::-1].copy()
new_sample[elem] = tmp
samples.append(new_sample)
return samples
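
The (16k + 1) size alignment above, worked through with hypothetical numbers (min_size=None, max_size=800, multi_scale=[1.0], a 720x1280 frame):

# sc = 800 / 1280 = 0.625      -> new_h, new_w = 450, 800
# (450 - 1) % 16 == 1, != 0    -> new_h = round(449 / 16) * 16 + 1 = 449
# (800 - 1) % 16 == 15, != 0   -> new_w = round(799 / 16) * 16 + 1 = 801
# Both sides end up snapped to the nearest size of the form 16k + 1.
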
@PIPELINES.register()
class MultiNorm(object):
def __call__(self, samples):
for idx in range(len(samples)):
sample = samples[idx]
for elem in sample.keys():
if 'meta' in elem:
continue
tmp = sample[elem]
if tmp is None:
continue
if tmp.ndim == 2:
tmp = tmp[:, :, np.newaxis]
else:
tmp = tmp / 255.
tmp -= (0.485, 0.456, 0.406)
tmp /= (0.229, 0.224, 0.225)
tmp = tmp.transpose((2, 0, 1))
samples[idx][elem] = tmp
return samples
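
A minimal sketch chaining the two ops, assuming a hypothetical sample dict holding one image plus its meta entry:

import numpy as np

resize = MultiRestrictSize(max_size=800)  # default multi_scale=[1.3]
norm = MultiNorm()
sample = {'current_img': np.zeros((720, 1280, 3), dtype=np.uint8),
          'meta': {'flip': False}}
samples = norm(resize(sample))
# each samples[i]['current_img'] is now a float CHW array, scaled to [0, 1]
# and normalised with the ImageNet mean/std used above.
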

@ -0,0 +1,40 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import os
import numpy as np
import random
import paddle
from ..registry import PIPELINES
"""
pipeline ops for Action Segmentation Dataset.
"""
@PIPELINES.register()
class SegmentationSampler(object):
def __init__(self, sample_rate):
self.sample_rate = sample_rate
def __call__(self, results):
for key, data in results.items():
if len(data.shape) == 1:
data = data[::self.sample_rate]
results[key] = copy.deepcopy(data)
else:
data = data[:, ::self.sample_rate]
results[key] = copy.deepcopy(data)
return results
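
A hedged usage sketch: downsampling hypothetical per-frame features and labels by a factor of 4.

import numpy as np

sampler = SegmentationSampler(sample_rate=4)
results = {
    'video_feat': np.random.rand(2048, 1000).astype(np.float32),  # [C, T]
    'video_gt': np.zeros(1000, dtype=np.int64),                   # [T]
}
results = sampler(results)
# results['video_feat'].shape -> (2048, 250); results['video_gt'].shape -> (250,)
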

@ -0,0 +1,18 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ..utils import Registry
PIPELINES = Registry("pipeline")
DATASETS = Registry("datasets")

@ -0,0 +1,3 @@
from .anet_prop import ANETproposal
__all__ = ['ANETproposal']

@ -0,0 +1,359 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
import json
import numpy as np
import pandas as pd
import urllib.request as urllib2
from paddlevideo.utils import get_logger
logger = get_logger("paddlevideo")
class ANETproposal(object):
"""
This class is used for calculating AR@N and AUC;
    Code adapted from the ActivityNet GitHub repository (https://github.com/activitynet/ActivityNet.git).
"""
GROUND_TRUTH_FIELDS = ['database', 'taxonomy', 'version']
PROPOSAL_FIELDS = ['results', 'version', 'external_data']
API = 'http://ec2-52-25-205-214.us-west-2.compute.amazonaws.com/challenge19/api.py'
def __init__(self,
ground_truth_filename=None,
proposal_filename=None,
ground_truth_fields=GROUND_TRUTH_FIELDS,
proposal_fields=PROPOSAL_FIELDS,
tiou_thresholds=np.linspace(0.5, 0.95, 10),
max_avg_nr_proposals=None,
subset='validation',
verbose=False,
check_status=True):
if not ground_truth_filename:
raise IOError('Please input a valid ground truth file.')
if not proposal_filename:
raise IOError('Please input a valid proposal file.')
self.subset = subset
self.tiou_thresholds = tiou_thresholds
self.max_avg_nr_proposals = max_avg_nr_proposals
self.verbose = verbose
self.gt_fields = ground_truth_fields
self.pred_fields = proposal_fields
self.recall = None
self.avg_recall = None
self.proposals_per_video = None
self.check_status = check_status
# Retrieve blocked videos from server.
if self.check_status:
self.blocked_videos = self.get_blocked_videos()
else:
self.blocked_videos = list()
# Import ground truth and proposals.
self.ground_truth, self.activity_index = self._import_ground_truth(
ground_truth_filename)
self.proposal = self._import_proposal(proposal_filename)
if self.verbose:
print('[INIT] Loaded annotations from {} subset.'.format(subset))
nr_gt = len(self.ground_truth)
print('\tNumber of ground truth instances: {}'.format(nr_gt))
nr_pred = len(self.proposal)
print('\tNumber of proposals: {}'.format(nr_pred))
print('\tFixed threshold for tiou score: {}'.format(
self.tiou_thresholds))
def _import_ground_truth(self, ground_truth_filename):
"""
Reads ground truth file, checks if it is well formatted, and returns
the ground truth instances and the activity classes.
Parameters:
ground_truth_filename (str): full path to the ground truth json file.
Returns:
ground_truth (df): Data frame containing the ground truth instances.
activity_index (dict): Dictionary containing class index.
"""
with open(ground_truth_filename, 'r') as fobj:
data = json.load(fobj)
# Checking format
if not all([field in data.keys() for field in self.gt_fields]):
raise IOError('Please input a valid ground truth file.')
# Read ground truth data.
activity_index, cidx = {}, 0
video_lst, t_start_lst, t_end_lst, label_lst = [], [], [], []
for videoid, v in data['database'].items():
if self.subset != v['subset']:
continue
if videoid in self.blocked_videos:
continue
for ann in v['annotations']:
if ann['label'] not in activity_index:
activity_index[ann['label']] = cidx
cidx += 1
video_lst.append(videoid)
t_start_lst.append(float(ann['segment'][0]))
t_end_lst.append(float(ann['segment'][1]))
label_lst.append(activity_index[ann['label']])
ground_truth = pd.DataFrame({
'video-id': video_lst,
't-start': t_start_lst,
't-end': t_end_lst,
'label': label_lst
})
return ground_truth, activity_index
def _import_proposal(self, proposal_filename):
"""
Reads proposal file, checks if it is well formatted, and returns
the proposal instances.
Parameters:
proposal_filename (str): Full path to the proposal json file.
Returns:
proposal (df): Data frame containing the proposal instances.
"""
with open(proposal_filename, 'r') as fobj:
data = json.load(fobj)
# Checking format...
if not all([field in data.keys() for field in self.pred_fields]):
raise IOError('Please input a valid proposal file.')
# Read predictions.
video_lst, t_start_lst, t_end_lst = [], [], []
score_lst = []
for videoid, v in data['results'].items():
if videoid in self.blocked_videos:
continue
for result in v:
video_lst.append(videoid)
t_start_lst.append(float(result['segment'][0]))
t_end_lst.append(float(result['segment'][1]))
score_lst.append(result['score'])
proposal = pd.DataFrame({
'video-id': video_lst,
't-start': t_start_lst,
't-end': t_end_lst,
'score': score_lst
})
return proposal
def evaluate(self):
"""
        Evaluates a proposal file. To measure the performance of a method on
        the proposal task, we compute the area under the average-recall vs.
        average-number-of-proposals-per-video curve.
"""
recall, avg_recall, proposals_per_video = self.average_recall_vs_avg_nr_proposals(
self.ground_truth,
self.proposal,
max_avg_nr_proposals=self.max_avg_nr_proposals,
tiou_thresholds=self.tiou_thresholds)
area_under_curve = np.trapz(avg_recall, proposals_per_video)
if self.verbose:
print('[RESULTS] Performance on ActivityNet proposal task.')
with open("data/bmn/BMN_Test_results/auc_result.txt",
"a") as text_file:
text_file.write(
'\tArea Under the AR vs AN curve: {}% \n'.format(
100. * float(area_under_curve) /
proposals_per_video[-1]))
print('\tArea Under the AR vs AN curve: {}%'.format(
100. * float(area_under_curve) / proposals_per_video[-1]))
self.recall = recall
self.avg_recall = avg_recall
self.proposals_per_video = proposals_per_video
def average_recall_vs_avg_nr_proposals(self,
ground_truth,
proposals,
max_avg_nr_proposals=None,
tiou_thresholds=np.linspace(
0.5, 0.95, 10)):
"""
Computes the average recall given an average number of
proposals per video.
Parameters:
ground_truth(df): Data frame containing the ground truth instances.
Required fields: ['video-id', 't-start', 't-end']
proposal(df): Data frame containing the proposal instances.
                Required fields: ['video-id', 't-start', 't-end', 'score']
tiou_thresholds(1d-array | optional): array with tiou thresholds.
Returns:
            recall(2d-array): recall[i, j] is the recall at the i-th tiou
                threshold for the j-th average number of proposals per video.
average_recall(1d-array): recall averaged over a list of tiou threshold.
This is equivalent to recall.mean(axis=0).
proposals_per_video(1d-array): average number of proposals per video.
"""
# Get list of videos.
video_lst = ground_truth['video-id'].unique()
if not max_avg_nr_proposals:
max_avg_nr_proposals = float(
proposals.shape[0]) / video_lst.shape[0]
ratio = max_avg_nr_proposals * float(
video_lst.shape[0]) / proposals.shape[0]
# Adaptation to query faster
ground_truth_gbvn = ground_truth.groupby('video-id')
proposals_gbvn = proposals.groupby('video-id')
# For each video, computes tiou scores among the retrieved proposals.
score_lst = []
total_nr_proposals = 0
for videoid in video_lst:
# Get ground-truth instances associated to this video.
ground_truth_videoid = ground_truth_gbvn.get_group(videoid)
this_video_ground_truth = ground_truth_videoid.loc[:, [
't-start', 't-end'
]].values
# Get proposals for this video.
try:
proposals_videoid = proposals_gbvn.get_group(videoid)
            except KeyError:  # no proposals were retrieved for this video
n = this_video_ground_truth.shape[0]
score_lst.append(np.zeros((n, 1)))
continue
this_video_proposals = proposals_videoid.loc[:,
['t-start', 't-end'
]].values
if this_video_proposals.shape[0] == 0:
n = this_video_ground_truth.shape[0]
score_lst.append(np.zeros((n, 1)))
continue
# Sort proposals by score.
sort_idx = proposals_videoid['score'].argsort()[::-1]
this_video_proposals = this_video_proposals[sort_idx, :]
if this_video_proposals.ndim != 2:
this_video_proposals = np.expand_dims(this_video_proposals,
axis=0)
if this_video_ground_truth.ndim != 2:
this_video_ground_truth = np.expand_dims(
this_video_ground_truth, axis=0)
nr_proposals = np.minimum(
int(this_video_proposals.shape[0] * ratio),
this_video_proposals.shape[0])
total_nr_proposals += nr_proposals
this_video_proposals = this_video_proposals[:nr_proposals, :]
# Compute tiou scores.
tiou = self.wrapper_segment_iou(this_video_proposals,
this_video_ground_truth)
score_lst.append(tiou)
        # Since video lengths vary widely, we count proposals as a ratio of
        # the total proposals retrieved, i.e. average recall at a percentage
        # of the proposals retrieved per video.
# Computes average recall.
pcn_lst = np.arange(1, 101) / 100.0 * (max_avg_nr_proposals * float(
video_lst.shape[0]) / total_nr_proposals)
matches = np.empty((video_lst.shape[0], pcn_lst.shape[0]))
positives = np.empty(video_lst.shape[0])
recall = np.empty((tiou_thresholds.shape[0], pcn_lst.shape[0]))
# Iterates over each tiou threshold.
for ridx, tiou in enumerate(tiou_thresholds):
# Inspect positives retrieved per video at different
# number of proposals (percentage of the total retrieved).
for i, score in enumerate(score_lst):
# Total positives per video.
positives[i] = score.shape[0]
                # Find proposals that satisfy the minimum tiou threshold.
true_positives_tiou = score >= tiou
# Get number of proposals as a percentage of total retrieved.
pcn_proposals = np.minimum(
(score.shape[1] * pcn_lst).astype(int), score.shape[1])
for j, nr_proposals in enumerate(pcn_proposals):
# Compute the number of matches for each percentage of the proposals
matches[i, j] = np.count_nonzero(
(true_positives_tiou[:, :nr_proposals]).sum(axis=1))
# Computes recall given the set of matches per video.
recall[ridx, :] = matches.sum(axis=0) / positives.sum()
# Recall is averaged.
avg_recall = recall.mean(axis=0)
# Get the average number of proposals per video.
proposals_per_video = pcn_lst * (float(total_nr_proposals) /
video_lst.shape[0])
return recall, avg_recall, proposals_per_video
def get_blocked_videos(self, api=API):
api_url = '{}?action=get_blocked'.format(api)
req = urllib2.Request(api_url)
response = urllib2.urlopen(req)
return json.loads(response.read())
def wrapper_segment_iou(self, target_segments, candidate_segments):
"""
Compute intersection over union btw segments
Parameters:
target_segments(nd-array): 2-dim array in format [m x 2:=[init, end]]
candidate_segments(nd-array): 2-dim array in format [n x 2:=[init, end]]
Returns:
tiou(nd-array): 2-dim array [n x m] with IOU ratio.
        Note: it assumes that candidate segments are sparser than target segments.
"""
if candidate_segments.ndim != 2 or target_segments.ndim != 2:
raise ValueError('Dimension of arguments is incorrect')
n, m = candidate_segments.shape[0], target_segments.shape[0]
tiou = np.empty((n, m))
for i in range(m):
tiou[:, i] = self.segment_iou(target_segments[i, :],
candidate_segments)
return tiou
def segment_iou(self, target_segment, candidate_segments):
"""
Compute the temporal intersection over union between a
target segment and all the test segments.
Parameters:
target_segment(1d-array): Temporal target segment containing [starting, ending] times.
candidate_segments(2d-array): Temporal candidate segments containing N x [starting, ending] times.
Returns:
tiou(1d-array): Temporal intersection over union score of the N's candidate segments.
"""
tt1 = np.maximum(target_segment[0], candidate_segments[:, 0])
tt2 = np.minimum(target_segment[1], candidate_segments[:, 1])
# Intersection including Non-negative overlap score.
segments_intersection = (tt2 - tt1).clip(0)
# Segment union.
segments_union = (candidate_segments[:, 1] - candidate_segments[:, 0]) \
+ (target_segment[1] - target_segment[0]) - segments_intersection
# Compute overlap as the ratio of the intersection
# over union of two segments.
tIoU = segments_intersection.astype(float) / segments_union
return tIoU
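
A hedged usage sketch of ANETproposal; both file paths are hypothetical, verbose=False avoids the hard-coded results-file write in evaluate(), and check_status=False skips the remote blocked-videos lookup:

import numpy as np

anet_prop = ANETproposal(
    ground_truth_filename='data/activitynet_gt.json',
    proposal_filename='data/bmn_results_validation.json',
    max_avg_nr_proposals=100,
    subset='validation',
    verbose=False,
    check_status=False)
anet_prop.evaluate()
ar_at_100 = anet_prop.avg_recall[-1]  # AR@100, given max_avg_nr_proposals=100
area = np.trapz(anet_prop.avg_recall, anet_prop.proposals_per_video)
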

@ -0,0 +1,36 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .bmn_metric import BMNMetric
from .build import build_metric
from .center_crop_metric import CenterCropMetric
from .depth_metric import DepthMetric
from .msrvtt_metric import MSRVTTMetric
from .multi_crop_metric import MultiCropMetric
from .registry import METRIC
from .skeleton_metric import SkeletonMetric
from .transnetv2_metric import TransNetV2Metric
from .youtube8m.eval_util import HitOneMetric
from .segmentation_metric import SegmentationMetric
from .ava_metric import AVAMetric
from .vos_metric import VOSMetric
from .center_crop_metric_MRI import CenterCropMetric_MRI
from .yowo_metric import YOWOMetric
__all__ = [
'METRIC', 'build_metric', 'MultiCropMetric', 'BMNMetric',
'CenterCropMetric', 'SkeletonMetric', 'HitOneMetric', 'TransNetV2Metric',
'DepthMetric', 'MSRVTTMetric', 'VOSMetric', 'CenterCropMetric_MRI','AVAMetric',
'SegmentationMetric', 'YOWOMetric'
]
