0808更新项目代码
parent
65cca19d91
commit
530524ff53
@ -0,0 +1,152 @@
|
||||
# Copyright 2020 The MediaPipe Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""MediaPipe solution drawing utils."""
|
||||
|
||||
import math
|
||||
from typing import List, Mapping, Optional, Tuple, Union
|
||||
|
||||
import cv2
|
||||
import dataclasses
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
|
||||
from mediapipe.framework.formats import detection_pb2
|
||||
from mediapipe.framework.formats import location_data_pb2
|
||||
from mediapipe.framework.formats import landmark_pb2
|
||||
|
||||
# Landmarks whose optional presence / visibility score is below these values
# are skipped by draw_landmarks.
_PRESENCE_THRESHOLD = 0.5
_VISIBILITY_THRESHOLD = 0.5
# Size expected for the third (channel) axis of an input image.
_BGR_CHANNELS = 3

# Predefined colour triples.
WHITE_COLOR = (224, 224, 224)
BLACK_COLOR = (0, 0, 0)
RED_COLOR = (0, 0, 255)
GREEN_COLOR = (0, 128, 0)
BLUE_COLOR = (255, 0, 0)
|
||||
|
||||
|
||||
@dataclasses.dataclass
class DrawingSpec:
    """Appearance settings (color, line thickness, circle radius) for an annotation."""

    # Annotation color; white unless overridden.
    color: Tuple[int, int, int] = WHITE_COLOR
    # Line thickness in pixels; 2 unless overridden.
    thickness: int = 2
    # Radius of drawn circles in pixels; 2 unless overridden.
    circle_radius: int = 2
|
||||
|
||||
|
||||
def _normalized_to_pixel_coordinates(
        normalized_x: float, normalized_y: float, image_width: int,
        image_height: int) -> Union[None, Tuple[int, int]]:
    """Maps a normalized (x, y) pair onto integer pixel coordinates.

    Args:
      normalized_x: Horizontal position, expected within [0, 1].
      normalized_y: Vertical position, expected within [0, 1].
      image_width: Image width in pixels.
      image_height: Image height in pixels.

    Returns:
      ``(x_px, y_px)`` with each component clamped to the last valid pixel
      index, or None when either input lies outside [0, 1] (both ends are
      compared with ``math.isclose``).
    """

    def _within_unit_interval(coordinate: float) -> bool:
        # Accept the open interval plus values that isclose() treats as 0 or 1.
        not_below_zero = coordinate > 0 or math.isclose(0, coordinate)
        not_above_one = coordinate < 1 or math.isclose(1, coordinate)
        return not_below_zero and not_above_one

    for coordinate in (normalized_x, normalized_y):
        if not _within_unit_interval(coordinate):
            # TODO: Draw coordinates even if it's outside of the image bounds.
            return None

    # A normalized value of exactly 1.0 would land one past the last pixel,
    # hence the clamp to size - 1.
    column = min(math.floor(normalized_x * image_width), image_width - 1)
    row = min(math.floor(normalized_y * image_height), image_height - 1)
    return column, row
|
||||
|
||||
|
||||
|
||||
def draw_landmarks(
        image: np.ndarray,
        landmark_list: landmark_pb2.NormalizedLandmarkList,
        connections: Optional[List[Tuple[int, int]]] = None):
    """Returns the pixel coordinates of the end landmark of each connection.

    Despite its name, this function does not draw on ``image``; the image is
    only used for its height and width when converting normalized landmark
    coordinates to pixels.

    Args:
      image: A three channel BGR image represented as numpy ndarray.
      landmark_list: A normalized landmark list proto message.
      connections: A list of ``(start_idx, end_idx)`` landmark index tuples.
        Only the end index of each tuple is used.

    Returns:
      None when ``landmark_list`` or ``connections`` is empty/None. Otherwise
      a list of ``(x_px, y_px)`` tuples, one per connection (in the order of
      ``connections``) whose end landmark passed the visibility/presence
      thresholds and lies inside the image. Connections whose end landmark
      was filtered out, or whose end index matches no landmark, are skipped
      silently.

    Raises:
      ValueError: If the input image is not three channel BGR.
    """
    if not landmark_list:
        return None
    # Checking the rank first makes a 2-D (grayscale) image raise the
    # documented ValueError instead of an IndexError on shape[2].
    if len(image.shape) != 3 or image.shape[2] != _BGR_CHANNELS:
        raise ValueError('Input image must contain three channel bgr data.')
    image_rows, image_cols, _ = image.shape

    # Dictionary mapping each usable landmark index to its pixel coordinates.
    idx_to_coordinates = {}
    for idx, landmark in enumerate(landmark_list.landmark):
        if ((landmark.HasField('visibility') and
             landmark.visibility < _VISIBILITY_THRESHOLD) or
                (landmark.HasField('presence') and
                 landmark.presence < _PRESENCE_THRESHOLD)):
            continue
        landmark_px = _normalized_to_pixel_coordinates(landmark.x, landmark.y,
                                                       image_cols, image_rows)
        if landmark_px:
            idx_to_coordinates[idx] = landmark_px

    if not connections:
        return None

    # connection[1] is the end landmark index; a dict membership test replaces
    # the per-point list(keys()) scan.
    return [
        idx_to_coordinates[connection[1]] for connection in connections
        if connection[1] in idx_to_coordinates
    ]
|
||||
|
@ -0,0 +1,15 @@
|
||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from .version import paddlevideo_version
|
Binary file not shown.
Binary file not shown.
@ -0,0 +1,22 @@
|
||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from .builder import build_dataset, build_dataloader, build_batch_pipeline
|
||||
from .dataset import VideoDataset
|
||||
from .dali_loader import TSN_Dali_loader, get_input_data
|
||||
|
||||
# Names re-exported by this package.
__all__ = [
    'build_dataset',
    'build_dataloader',
    'build_batch_pipeline',
    'VideoDataset',
    'TSN_Dali_loader',
    'get_input_data',
]
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,132 @@
|
||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import signal
|
||||
import os
|
||||
import paddle
|
||||
from paddle.io import DataLoader, DistributedBatchSampler
|
||||
from .registry import DATASETS, PIPELINES
|
||||
from ..utils.build_utils import build
|
||||
from .pipelines.compose import Compose
|
||||
from paddlevideo.utils import get_logger
|
||||
from paddlevideo.utils.multigrid import DistributedShortSampler
|
||||
import numpy as np
|
||||
|
||||
logger = get_logger("paddlevideo")
|
||||
|
||||
|
||||
def build_pipeline(cfg):
    """Build pipeline.

    Args:
        cfg: pipeline config handed to ``Compose``; may be None.

    Returns:
        ``Compose(cfg)``, or None when ``cfg`` is None.
    """
    # Identity comparison: `== None` would dispatch to a custom __eq__ on
    # config objects.
    if cfg is None:
        return None
    return Compose(cfg)
|
||||
|
||||
|
||||
def build_dataset(cfg):
    """Build dataset.

    Args:
        cfg: a pair ``(dataset_cfg, pipeline_cfg)``.

    Returns:
        dataset: the object built from ``dataset_cfg`` through the DATASETS
        registry, keyed by its ``format`` entry.
    """
    #XXX: ugly code here!
    dataset_cfg, pipeline_cfg = cfg
    # The built pipeline is attached to the dataset config in place, so the
    # caller's config object is modified.
    dataset_cfg.pipeline = build_pipeline(pipeline_cfg)
    return build(dataset_cfg, DATASETS, key="format")
|
||||
|
||||
|
||||
def build_batch_pipeline(cfg):
    """Build a batch-level pipeline from ``cfg`` through the PIPELINES registry."""
    return build(cfg, PIPELINES)
|
||||
|
||||
|
||||
def build_dataloader(dataset,
                     batch_size,
                     num_workers,
                     places,
                     shuffle=True,
                     drop_last=True,
                     multigrid=False,
                     collate_fn_cfg=None,
                     **kwargs):
    """Build Paddle Dataloader.

    XXX explain how the dataloader work!

    Args:
        dataset (paddle.dataset): A PaddlePaddle dataset object.
        batch_size (int): batch size on single card.
        num_workers (int): forwarded to ``DataLoader`` as ``num_workers``.
        places: forwarded to ``DataLoader`` as ``places``.
        shuffle (bool): whether to shuffle the data at every epoch. Ignored
            when ``multigrid`` is set (that sampler always shuffles).
        drop_last (bool): forwarded to the batch sampler. Ignored when
            ``multigrid`` is set (that sampler always drops the last batch).
        multigrid (bool): use ``DistributedShortSampler`` instead of
            ``DistributedBatchSampler``.
        collate_fn_cfg: config of a batch pipeline (mix operator). When not
            None, batches are collated by ``mix_collate_fn``.
        **kwargs: extra keyword arguments forwarded to ``DataLoader``.

    Returns:
        The constructed ``DataLoader``.
    """
    if not multigrid:
        sampler = DistributedBatchSampler(dataset,
                                          batch_size=batch_size,
                                          shuffle=shuffle,
                                          drop_last=drop_last)
    else:
        sampler = DistributedShortSampler(dataset,
                                          batch_sizes=batch_size,
                                          shuffle=True,
                                          drop_last=True)

    #NOTE(shipping): when switch the mix operator on, such as: mixup, cutmix.
    # batch like: [[img, label, attribute, ...], [img, label, attribute, ...], ...] will recollate to:
    # [[img, img, ...], [label, label, ...], [attribute, attribute, ...], ...] as using numpy.transpose.

    def mix_collate_fn(batch):
        # The batch pipeline is rebuilt from the config on every call.
        mixed = build_batch_pipeline(collate_fn_cfg)(batch)
        columns = []
        for sample in mixed:
            for position, field in enumerate(sample):
                if len(columns) < len(sample):
                    # Still opening the per-field columns.
                    columns.append([field])
                else:
                    columns[position].append(field)
        return [np.stack(column, axis=0) for column in columns]

    collate_fn = None
    if collate_fn_cfg is not None:
        collate_fn = mix_collate_fn

    return DataLoader(dataset,
                      batch_sampler=sampler,
                      places=places,
                      num_workers=num_workers,
                      collate_fn=collate_fn,
                      return_list=True,
                      **kwargs)
|
||||
|
||||
|
||||
def term_mp(sig_num, frame):
    """Signal handler: kill every process in the current process group.

    Sends SIGKILL to the process group of the calling process, which
    includes the calling process itself. ``sig_num`` and ``frame`` are the
    standard signal-handler arguments and are not used.
    """
    pid = os.getpid()
    pgid = os.getpgid(pid)
    logger.info("main proc {} exit, kill process group {}".format(pid, pgid))
    os.killpg(pgid, signal.SIGKILL)
|
||||
|
||||
|
||||
# Import-time side effect: interrupt and termination requests are both routed
# through term_mp.
for _signum in (signal.SIGINT, signal.SIGTERM):
    signal.signal(_signum, term_mp)
|
@ -0,0 +1,206 @@
|
||||
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import random
|
||||
import math
|
||||
|
||||
import paddle
|
||||
from paddle.distributed import ParallelEnv
|
||||
import paddle.distributed as dist
|
||||
from paddlevideo.utils import get_logger
|
||||
|
||||
logger = get_logger("paddlevideo")
|
||||
|
||||
# NVIDIA DALI is optional. When it cannot be imported, Pipeline falls back to
# `object` so that classes deriving from Pipeline can still be defined.
try:
    from nvidia.dali.pipeline import Pipeline
    import nvidia.dali.ops as ops
    import nvidia.dali.types as types
    import tempfile
    from nvidia.dali.plugin.paddle import DALIGenericIterator
except Exception:
    # Broad on purpose (a broken DALI install may raise more than
    # ImportError), but no longer swallows KeyboardInterrupt / SystemExit
    # the way a bare `except:` did.
    Pipeline = object
|
||||
|
||||
|
||||
def get_input_data(data):
    """Return ``(image, label)`` paddle tensors built from ``data[0]``.

    ``data[0]`` must be a mapping with ``'image'`` and ``'label'`` entries.
    """
    first = data[0]
    image = paddle.to_tensor(first['image'])
    label = paddle.to_tensor(first['label'])
    return image, label
|
||||
|
||||
|
||||
class TSN_Dali_loader(object):
    """Builds a sharded DALI video iterator from a config object.

    Args:
        cfg: object exposing ``batch_size``, ``file_path``, ``num_seg``,
            ``seglen``, ``short_size``, ``target_size``, ``mean`` and ``std``.
    """

    def __init__(self, cfg):
        self.batch_size = cfg.batch_size
        self.file_path = cfg.file_path

        self.num_seg = cfg.num_seg
        self.seglen = cfg.seglen
        self.short_size = cfg.short_size
        self.target_size = cfg.target_size

        # set num_shards and shard_id when distributed training is implemented
        self.num_shards = dist.get_world_size()
        self.shard_id = ParallelEnv().local_rank
        # presumably cfg.mean / cfg.std are per-channel lists repeated once
        # per frame -- TODO confirm against the config.
        self.dali_mean = cfg.mean * (self.num_seg * self.seglen)
        self.dali_std = cfg.std * (self.num_seg * self.seglen)
        # Seed used by the next build_dali_reader() call.
        self._shuffle_seed = 0

    def build_dali_reader(self):
        """
        build dali training reader

        Reads the file list at ``self.file_path``, shuffles it, pads it so it
        divides evenly among ``self.num_shards`` shards, keeps this shard's
        lines and wraps them in a ``DALIGenericIterator`` yielding ``image``
        and ``label``.

        The shuffle seed starts at 0 and increases by one on every call on
        the same instance. It used to be stored on a nested function that was
        re-created per call, so it was always 0.

        Returns:
            The ``DALIGenericIterator`` for this shard.
        """
        with open(self.file_path) as flist:
            full_lines = list(flist)

        # getattr keeps instances created without __init__ working.
        seed = getattr(self, '_shuffle_seed', 0)
        self._shuffle_seed = seed + 1
        random.Random(seed).shuffle(full_lines)
        logger.info(f"reader shuffle seed: {seed}.")

        per_node_lines = int(
            math.ceil(len(full_lines) * 1.0 / self.num_shards))
        total_lines = per_node_lines * self.num_shards

        # aligned full_lines so that it can evenly divisible.
        # Loop because one slice cannot supply enough padding when the list
        # is shorter than the number of lines still missing.
        while full_lines and len(full_lines) < total_lines:
            full_lines += full_lines[:total_lines - len(full_lines)]
        assert len(full_lines) == total_lines

        # trainer get own sample
        lines = full_lines[self.shard_id:total_lines:self.num_shards]
        assert len(lines) == per_node_lines

        logger.info(
            f"shard_id: {self.shard_id}, trainer_count: {self.num_shards}"
        )
        logger.info(
            f"read videos from {self.shard_id * per_node_lines}, "
            f"length: {per_node_lines}, "
            f"lines length: {len(lines)}, "
            f"total: {len(full_lines)}")

        device_id = ParallelEnv().local_rank
        logger.info(f'---------- device_id: {device_id} -----------')

        # The shard's file list is handed to DALI through a temporary file.
        # The context manager removes it once the iterator is constructed,
        # which is when garbage collection removed it before.
        with tempfile.NamedTemporaryFile() as tf:
            tf.write(str.encode(''.join(lines)))
            tf.flush()

            pipe = VideoPipe(batch_size=self.batch_size,
                             num_threads=1,
                             device_id=device_id,
                             file_list=tf.name,
                             sequence_length=self.num_seg * self.seglen,
                             num_seg=self.num_seg,
                             seg_length=self.seglen,
                             resize_shorter_scale=self.short_size,
                             crop_target_size=self.target_size,
                             is_training=True,
                             num_shards=self.num_shards,
                             shard_id=self.shard_id,
                             dali_mean=self.dali_mean,
                             dali_std=self.dali_std)

            logger.info(
                'initializing dataset, it will take several minutes if it is too large .... '
            )
            video_loader = DALIGenericIterator([pipe], ['image', 'label'],
                                               len(lines),
                                               dynamic_shape=True,
                                               auto_reset=True)

        return video_loader
|
||||
|
||||
|
||||
class VideoPipe(Pipeline):
    """DALI pipeline: read video sequences, resize, random crop/mirror, normalize.

    Args:
        batch_size, num_threads, device_id: forwarded to ``Pipeline.__init__``.
        file_list: path of the file list given to ``ops.VideoReader``.
        sequence_length, num_seg, seg_length: forwarded to ``ops.VideoReader``;
            ``num_seg`` and ``seg_length`` also define the output shape.
        resize_shorter_scale: ``resize_shorter`` argument of ``ops.Resize``.
        crop_target_size: side length of the square crop.
        is_training: forwarded to the reader; also enables its random shuffle.
        initial_prefetch_size: ``initial_fill`` of the reader.
        num_shards, shard_id: sharding arguments of the reader.
        dali_mean, dali_std: ``mean`` / ``std`` of ``ops.CropMirrorNormalize``.
    """

    def __init__(self,
                 batch_size,
                 num_threads,
                 device_id,
                 file_list,
                 sequence_length,
                 num_seg,
                 seg_length,
                 resize_shorter_scale,
                 crop_target_size,
                 is_training=False,
                 initial_prefetch_size=20,
                 num_shards=1,
                 shard_id=0,
                 dali_mean=0.,
                 dali_std=1.0):
        super(VideoPipe, self).__init__(batch_size, num_threads, device_id)
        self.input = ops.VideoReader(device="gpu",
                                     file_list=file_list,
                                     sequence_length=sequence_length,
                                     num_seg=num_seg,
                                     seg_length=seg_length,
                                     is_training=is_training,
                                     num_shards=num_shards,
                                     shard_id=shard_id,
                                     random_shuffle=is_training,
                                     initial_fill=initial_prefetch_size)
        # the sequence data read by ops.VideoReader is of shape [F, H, W, C]
        # Because the ops.Resize does not support sequence data,
        # it will be transposed into [H, W, F, C],
        # then reshaped to [H, W, FC], and then resized like a 2-D image.
        self.transpose = ops.Transpose(device="gpu", perm=[1, 2, 0, 3])
        self.reshape = ops.Reshape(device="gpu",
                                   rel_shape=[1.0, 1.0, -1],
                                   layout='HWC')
        self.resize = ops.Resize(device="gpu",
                                 resize_shorter=resize_shorter_scale)
        # crops and mirror are applied by ops.CropMirrorNormalize.
        # NOTE(review): the original comment said normalization would be done
        # in Paddle because of dimension-broadcast concerns, yet mean/std are
        # passed to CropMirrorNormalize below -- confirm which is intended.
        self.pos_rng_x = ops.Uniform(range=(0.0, 1.0))
        self.pos_rng_y = ops.Uniform(range=(0.0, 1.0))
        self.mirror_generator = ops.Uniform(range=(0.0, 1.0))
        self.cast_mirror = ops.Cast(dtype=types.DALIDataType.INT32)
        self.crop_mirror_norm = ops.CropMirrorNormalize(
            device="gpu",
            crop=[crop_target_size, crop_target_size],
            mean=dali_mean,
            std=dali_std)
        self.reshape_back = ops.Reshape(
            device="gpu",
            shape=[num_seg, seg_length * 3, crop_target_size, crop_target_size],
            layout='FCHW')
        self.cast_label = ops.Cast(device="gpu", dtype=types.DALIDataType.INT64)

    def define_graph(self):
        """Wire the operators together and return ``(output, label)``."""
        output, label = self.input(name="Reader")
        output = self.transpose(output)
        output = self.reshape(output)

        output = self.resize(output)
        output = output / 255.
        pos_x = self.pos_rng_x()
        pos_y = self.pos_rng_y()
        # Uniform sample in (0, 1) thresholded at 0.5, then cast to INT32 to
        # serve as the mirror flag.
        mirror_flag = self.mirror_generator()
        mirror_flag = (mirror_flag > 0.5)
        mirror_flag = self.cast_mirror(mirror_flag)
        output = self.crop_mirror_norm(output,
                                       crop_pos_x=pos_x,
                                       crop_pos_y=pos_y,
                                       mirror=mirror_flag)
        output = self.reshape_back(output)
        label = self.cast_label(label)
        return output, label

    def __len__(self):
        """Return the epoch size of the reader named "Reader"."""
        # epoch_size() without a name returns a {reader name: size} dict,
        # which makes len() raise TypeError; query the reader declared in
        # define_graph instead.
        return self.epoch_size("Reader")
|
@ -0,0 +1,109 @@
|
||||
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os.path as osp
|
||||
import copy
|
||||
import random
|
||||
import numpy as np
|
||||
|
||||
from ..registry import DATASETS
|
||||
from .base import BaseDataset
|
||||
from ...utils import get_logger
|
||||
|
||||
logger = get_logger("paddlevideo")
|
||||
|
||||
|
||||
@DATASETS.register()
class MRIDataset(BaseDataset):
    """Rawframe dataset for action recognition.

    The dataset loads raw frames from frame files and applies the specified
    transform operations to them.
    The index file is a text file with multiple lines; each line gives the
    directory of the frames of a video, the total number of frames of the
    video, and its label, separated by whitespace.
    Example of an index file:

    .. code-block:: txt

        file_path-1 150 1
        file_path-2 160 1
        file_path-3 170 2
        file_path-4 180 2

    Args:
        file_path (str): Path to the index file.
        pipeline: callable applied to a copy of each sample dict.
        num_retries (int): number of attempts made when preparing a sample.
            Default: 5.
        data_prefix (str): directory path of the data. Default: None.
        test_mode (bool): Whether to build the test dataset. Default: False.
        suffix (str): suffix of file. Default: 'img_{:05}.jpg'. Stored on the
            instance but not put into the sample dicts.
    """

    def __init__(self,
                 file_path,
                 pipeline,
                 num_retries=5,
                 data_prefix=None,
                 test_mode=False,
                 suffix='img_{:05}.jpg'):
        self.num_retries = num_retries
        self.suffix = suffix
        super().__init__(file_path, pipeline, data_prefix, test_mode)

    def load_file(self):
        """Load index file to get video information."""
        info = []
        with open(self.file_path, 'r') as fin:
            for line in fin:
                line_split = line.strip().split()
                if not line_split:
                    # A blank (e.g. trailing) line would otherwise break the
                    # three-way unpacking below.
                    continue
                frame_dir, frames_len, labels = line_split
                if self.data_prefix is not None:
                    frame_dir = osp.join(self.data_prefix, frame_dir)
                info.append(
                    dict(
                        frame_dir=frame_dir,
                        # frames_len is kept as the string read from the file.
                        frames_len=frames_len,
                        labels=int(labels)))
        return info

    def _load_with_retries(self, idx):
        """Run the pipeline on sample ``idx``, retrying on random samples.

        Returns the pipeline output, or None when all ``num_retries``
        attempts raised.
        """
        last_error = None
        for ir in range(self.num_retries):
            # Looked up outside the try block so the handler can always name
            # the failing sample.
            sample = self.info[idx]
            try:
                return self.pipeline(copy.deepcopy(sample))
            except Exception as e:
                # Try to catch Exception caused by reading missing frames files
                last_error = e
                if ir < self.num_retries - 1:
                    logger.info(
                        "Error when loading {}, have {} trys, will try again".
                        format(sample['frame_dir'], ir))
                    idx = random.randint(0, len(self.info) - 1)
        logger.info("Failed to load a sample after {} trys, last error: {!r}".
                    format(self.num_retries, last_error))
        return None

    def prepare_train(self, idx):
        """Prepare the frames for training/valid given index.

        Returns ``(imgs, labels)`` as numpy arrays, or None when every retry
        failed.
        """
        results = self._load_with_retries(idx)
        if results is None:
            return None
        return np.array(results['imgs']), np.array([results['labels']])

    def prepare_test(self, idx):
        """Prepare the frames for test given index.

        Returns ``(imgs, labels)`` as numpy arrays, or None when every retry
        failed.
        """
        results = self._load_with_retries(idx)
        if results is None:
            return None
        return np.array(results['imgs']), np.array([results['labels']])
|
@ -0,0 +1,111 @@
|
||||
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os.path as osp
|
||||
import copy
|
||||
import random
|
||||
import numpy as np
|
||||
|
||||
from ..registry import DATASETS
|
||||
from .base import BaseDataset
|
||||
from ...utils import get_logger
|
||||
|
||||
logger = get_logger("paddlevideo")
|
||||
|
||||
|
||||
@DATASETS.register()
class SFMRIDataset(BaseDataset):
    """Rawframe dataset for action recognition.

    The dataset loads raw frames from frame files and applies the specified
    transform operations to them.
    The index file is a text file with multiple lines; each line gives the
    directory of the frames of a video, the total number of frames of the
    video, and its label, separated by whitespace.
    Example of an index file:

    .. code-block:: txt

        file_path-1 150 1
        file_path-2 160 1
        file_path-3 170 2
        file_path-4 180 2

    Args:
        file_path (str): Path to the index file.
        pipeline: callable applied to a copy of each sample dict; its output
            must provide two entries under ``'imgs'``.
        num_retries (int): number of attempts made when preparing a sample.
            Default: 5.
        data_prefix (str): directory path of the data. Default: None.
        test_mode (bool): Whether to build the test dataset. Default: False.
        suffix (str): suffix of file. Default: 'img_{:05}.jpg'. Stored on the
            instance but not put into the sample dicts.
    """

    def __init__(self,
                 file_path,
                 pipeline,
                 num_retries=5,
                 data_prefix=None,
                 test_mode=False,
                 suffix='img_{:05}.jpg'):
        self.num_retries = num_retries
        self.suffix = suffix
        super().__init__(file_path, pipeline, data_prefix, test_mode)

    def load_file(self):
        """Load index file to get video information."""
        info = []
        with open(self.file_path, 'r') as fin:
            for line in fin:
                line_split = line.strip().split()
                if not line_split:
                    # A blank (e.g. trailing) line would otherwise break the
                    # three-way unpacking below.
                    continue
                frame_dir, frames_len, labels = line_split
                if self.data_prefix is not None:
                    frame_dir = osp.join(self.data_prefix, frame_dir)
                info.append(
                    dict(
                        frame_dir=frame_dir,
                        # frames_len is kept as the string read from the file.
                        frames_len=frames_len,
                        labels=int(labels)))
        return info

    def _load_with_retries(self, idx):
        """Run the pipeline on sample ``idx``, retrying on random samples.

        Returns the pipeline output, or None when all ``num_retries``
        attempts raised.
        """
        last_error = None
        for ir in range(self.num_retries):
            # Looked up outside the try block so the handler can always name
            # the failing sample.
            sample = self.info[idx]
            try:
                return self.pipeline(copy.deepcopy(sample))
            except Exception as e:
                # Try to catch Exception caused by reading missing frames files
                last_error = e
                if ir < self.num_retries - 1:
                    logger.info(
                        "Error when loading {}, have {} trys, will try again".
                        format(sample['frame_dir'], ir))
                    idx = random.randint(0, len(self.info) - 1)
        logger.info("Failed to load a sample after {} trys, last error: {!r}".
                    format(self.num_retries, last_error))
        return None

    def prepare_train(self, idx):
        """Prepare the frames for training/valid given index.

        Returns ``(imgs[0], imgs[1], labels)`` as numpy arrays, or None when
        every retry failed.
        """
        results = self._load_with_retries(idx)
        if results is None:
            return None
        return np.array(results['imgs'][0]), np.array(
            results['imgs'][1]), np.array([results['labels']])

    def prepare_test(self, idx):
        """Prepare the frames for test given index.

        Returns ``(imgs[0], imgs[1], labels)`` as numpy arrays, or None when
        every retry failed.
        """
        results = self._load_with_retries(idx)
        if results is None:
            return None
        return np.array(results['imgs'][0]), np.array(
            results['imgs'][1]), np.array([results['labels']])
|
@ -0,0 +1,41 @@
|
||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from .actbert_dataset import ActBertDataset
|
||||
from .ava_dataset import AVADataset
|
||||
from .bmn_dataset import BMNDataset
|
||||
from .davis_dataset import DavisDataset
|
||||
from .feature import FeatureDataset
|
||||
from .frame import FrameDataset, FrameDataset_Sport
|
||||
from .MRI import MRIDataset
|
||||
from .MRI_SlowFast import SFMRIDataset
|
||||
from .msrvtt import MSRVTTDataset
|
||||
from .actbert_dataset import ActBertDataset
|
||||
from .asrf_dataset import ASRFDataset
|
||||
from .ms_tcn_dataset import MSTCNDataset
|
||||
from .oxford import MonoDataset
|
||||
from .skeleton import SkeletonDataset
|
||||
from .slowfast_video import SFVideoDataset
|
||||
from .video import VideoDataset
|
||||
from .ucf101_skeleton import UCF101SkeletonDataset
|
||||
from .ucf24_dataset import UCF24Dataset
|
||||
|
||||
|
||||
# Dataset classes re-exported by this package.
__all__ = [
    'VideoDataset',
    'FrameDataset',
    'SFVideoDataset',
    'BMNDataset',
    'FeatureDataset',
    'SkeletonDataset',
    'AVADataset',
    'MonoDataset',
    'MSRVTTDataset',
    'ActBertDataset',
    'DavisDataset',
    'MRIDataset',
    'SFMRIDataset',
    'FrameDataset_Sport',
    'MSTCNDataset',
    'ASRFDataset',
    'UCF101SkeletonDataset',
    'UCF24Dataset',
]
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,74 @@
|
||||
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os.path as osp
|
||||
import copy
|
||||
import random
|
||||
import numpy as np
|
||||
try:
|
||||
import lmdb
|
||||
except ImportError as e:
|
||||
print(
|
||||
f"Warning! {e}, [lmdb] package and it's dependencies is required for ActBERT."
|
||||
)
|
||||
import pickle
|
||||
import json
|
||||
try:
|
||||
from paddlenlp.transformers import BertTokenizer
|
||||
except ImportError as e:
|
||||
print(
|
||||
f"Warning! {e}, [paddlenlp] package and it's dependencies is required for ActBERT."
|
||||
)
|
||||
from ..registry import DATASETS
|
||||
from .base import BaseDataset
|
||||
from ...utils import get_logger
|
||||
|
||||
logger = get_logger("paddlevideo")
|
||||
|
||||
|
||||
@DATASETS.register()
class ActBertDataset(BaseDataset):
    """ActBert dataset.

    Args:
        file_path: path of the feature file read with ``np.load``.
        pipeline: callable applied to a copy of each sample dict.
        bert_model: name passed to ``BertTokenizer.from_pretrained``.
        data_prefix: forwarded to ``BaseDataset``.
        test_mode: forwarded to ``BaseDataset``.
    """

    def __init__(
        self,
        file_path,
        pipeline,
        bert_model="bert-base-uncased",
        data_prefix=None,
        test_mode=False,
    ):
        self.bert_model = bert_model
        super().__init__(file_path, pipeline, data_prefix, test_mode)

    def load_file(self):
        """Load index file to get video information."""
        # NOTE(review): allow_pickle=True unpickles objects stored in the
        # file; only load feature files from trusted sources.
        features = np.load(self.file_path, allow_pickle=True)
        self.tokenizer = BertTokenizer.from_pretrained(self.bert_model,
                                                       do_lower_case=True)
        # Every sample carries its feature entry plus the shared tokenizer.
        self.info = [
            dict(feature=entry, tokenizer=self.tokenizer)
            for entry in features
        ]
        return self.info

    def prepare_train(self, idx):
        """Prepare the frames for training/valid given index. """
        sample = copy.deepcopy(self.info[idx])
        return self.pipeline(sample)['features']

    def prepare_test(self, idx):
        """Prepare the frames for test given index. """
        # Test-time preparation is not implemented.
        return None
|
@ -0,0 +1,104 @@
|
||||
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import copy
|
||||
|
||||
import os
|
||||
import numpy as np
|
||||
|
||||
from ..registry import DATASETS
|
||||
from .base import BaseDataset
|
||||
from ...utils import get_logger
|
||||
|
||||
logger = get_logger("paddlevideo")
|
||||
|
||||
|
||||
@DATASETS.register()
class ASRFDataset(BaseDataset):
    """Video dataset for action segmentation.

    The index file lists one video name per line. For each name, the part
    before the first ``'.'`` plus ``".npy"`` is looked up in
    ``feature_path``, ``label_path`` and ``boundary_path``.

    Args:
        file_path (str): Path to the index file.
        pipeline: Callable applied to each sample dict.
        feature_path (str): Directory holding the feature ``.npy`` files.
        label_path (str): Directory holding the label ``.npy`` files.
        boundary_path (str): Directory holding the boundary ``.npy`` files.
        **kwargs: Keyword arguments for ``BaseDataset``.
    """

    def __init__(
        self,
        file_path,
        pipeline,
        feature_path,
        label_path,
        boundary_path,
        **kwargs,
    ):
        super().__init__(file_path, pipeline, **kwargs)
        self.label_path = label_path
        self.boundary_path = boundary_path
        self.feature_path = feature_path

    def load_file(self):
        """Load index file to get video information.

        Returns:
            list[str]: The non-empty lines of the index file.
        """
        # The context manager closes the file even if reading fails.
        # Filtering empty lines (instead of blindly dropping the last
        # element) keeps the final entry when the file has no trailing
        # newline.
        with open(self.file_path, 'r') as file_ptr:
            return [line for line in file_ptr.read().split('\n') if line]

    def _load_sample(self, idx):
        """Load feature, label and boundary arrays for ``idx`` and run the pipeline."""
        video_name = self.info[idx]
        # All three arrays share the same file name, derived from the text
        # before the first '.' of the index entry.
        file_name = video_name.split('.')[0] + ".npy"

        video_feat = np.load(os.path.join(self.feature_path, file_name))
        label = np.load(os.path.join(self.label_path,
                                     file_name)).astype(np.int64)
        boundary = np.expand_dims(
            np.load(os.path.join(self.boundary_path, file_name)),
            axis=0).astype(np.float32)

        # The arrays were just read from disk and are not shared with any
        # other object, so no defensive copy is needed before the pipeline.
        results = {
            'video_feat': video_feat,
            'video_label': label,
            'video_boundary': boundary,
        }
        results = self.pipeline(results)
        return (results['video_feat'], results['video_label'],
                results['video_boundary'])

    def prepare_train(self, idx):
        """TRAIN & VALID: Prepare data for training/valid given the index."""
        return self._load_sample(idx)

    def prepare_test(self, idx):
        """TEST: Prepare the data for test given the index."""
        return self._load_sample(idx)
|
@ -0,0 +1,249 @@
|
||||
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os.path as osp
|
||||
import copy
|
||||
import random
|
||||
import numpy as np
|
||||
import sys
|
||||
import os
|
||||
import pickle
|
||||
from datetime import datetime
|
||||
from ...metrics.ava_utils import ava_evaluate_results
|
||||
from ..registry import DATASETS
|
||||
from .base import BaseDataset
|
||||
from collections import defaultdict
|
||||
|
||||
|
||||
@DATASETS.register()
class AVADataset(BaseDataset):
    """AVA dataset for spatial temporal detection.

    The dataset loads raw frames, bounding boxes, proposals and applies
    transformations to return the frame tensors and other information.

    Args:
        pipeline: Callable applied to each sample dict.
        file_path (str): CSV annotation file; each line is
            ``video_id,timestamp,x1,y1,x2,y2,label,entity_id``.
        exclude_file (str): Optional CSV of ``video_id,timestamp`` pairs to
            drop when not in test mode.
        label_file (str): Forwarded to ``ava_evaluate_results``.
        suffix (str): Frame file name template handed to the pipeline.
        proposal_file (str): Optional pickle file mapping image keys to
            proposal arrays with 4 or 5 columns.
        person_det_score_thr (float): Score threshold in [0, 1] applied to
            5-column proposals.
        num_classes (int): Length of the multi-hot label vector.
        data_prefix (str): Directory prepended to each ``video_id``.
        test_mode (bool): When False, excluded timestamps are filtered out.
        num_max_proposals (int): Maximum number of proposals kept per image.
        timestamp_start (int): Handed to the pipeline; also used for
            ``shot_info``.
        timestamp_end (int): Handed to the pipeline; also used for
            ``shot_info``.
    """

    _FPS = 30

    def __init__(self,
                 pipeline,
                 file_path=None,
                 exclude_file=None,
                 label_file=None,
                 suffix='{:05}.jpg',
                 proposal_file=None,
                 person_det_score_thr=0.9,
                 num_classes=81,
                 data_prefix=None,
                 test_mode=False,
                 num_max_proposals=1000,
                 timestamp_start=900,
                 timestamp_end=1800):
        self.custom_classes = None
        self.exclude_file = exclude_file
        self.label_file = label_file
        self.proposal_file = proposal_file
        assert 0 <= person_det_score_thr <= 1, (
            'The value of '
            'person_det_score_thr should in [0, 1]. ')
        self.person_det_score_thr = person_det_score_thr
        self.num_classes = num_classes
        self.suffix = suffix
        self.num_max_proposals = num_max_proposals
        self.timestamp_start = timestamp_start
        self.timestamp_end = timestamp_end
        # The base constructor calls load_file(), which needs the attributes
        # assigned above.
        super().__init__(
            file_path,
            pipeline,
            data_prefix,
            test_mode,
        )
        if self.proposal_file is not None:
            self.proposals = self._load(self.proposal_file)
        else:
            self.proposals = None
        if not test_mode:
            valid_indexes = self.filter_exclude_file()
            self.info = [self.info[i] for i in valid_indexes]

    def _load(self, path):
        """Unpickle and return the object stored at ``path``."""
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only use with trusted proposal files.
        with open(path, 'rb') as f:
            return pickle.load(f)

    def parse_img_record(self, img_records):
        """Merge the records of one image into per-box annotations.

        Records sharing the same ``entity_box`` are collapsed into a single
        box with a multi-hot label vector of length ``self.num_classes``.

        Args:
            img_records (list[dict]): Records with ``entity_box``, ``label``
                and ``entity_id`` keys.

        Returns:
            tuple: Stacked ``bboxes``, ``labels`` and ``entity_ids`` arrays.
        """
        bboxes, labels, entity_ids = [], [], []
        while len(img_records) > 0:
            img_record = img_records[0]
            box = img_record['entity_box']
            # Split the remaining records into those describing this box
            # and those still to be processed.
            selected_records, remaining = [], []
            for record in img_records:
                if np.array_equal(record['entity_box'], box):
                    selected_records.append(record)
                else:
                    remaining.append(record)
            img_records = remaining

            bboxes.append(box)
            valid_labels = np.array(
                [record['label'] for record in selected_records])

            label = np.zeros(self.num_classes, dtype=np.float32)
            label[valid_labels] = 1.

            labels.append(label)
            entity_ids.append(img_record['entity_id'])

        bboxes = np.stack(bboxes)
        labels = np.stack(labels)
        entity_ids = np.stack(entity_ids)
        return bboxes, labels, entity_ids

    def filter_exclude_file(self):
        """Return indexes of ``self.info`` entries not listed in the exclude file."""
        if self.exclude_file is None:
            return list(range(len(self.info)))

        # Build the exclusion set once so each sample needs a single hash
        # lookup; the context manager guarantees the file is closed.
        excluded = set()
        with open(self.exclude_file) as fin:
            for line in fin:
                line = line.strip()
                if not line:
                    continue
                video_id, timestamp = line.split(',')
                excluded.add((video_id, int(timestamp)))

        return [
            i for i, video_info in enumerate(self.info)
            if (video_info['video_id'], video_info['timestamp']) not in excluded
        ]

    def load_file(self):
        """Load index file to get video information."""
        info = []
        records_dict_by_img = defaultdict(list)
        # Identical for every record, so computed once.
        shot_info = (0, (self.timestamp_end - self.timestamp_start) *
                     self._FPS)
        with open(self.file_path, 'r') as fin:
            for line in fin:
                line_split = line.strip().split(',')

                video_id = line_split[0]
                timestamp = int(line_split[1])
                img_key = f'{video_id},{timestamp:04d}'

                entity_box = np.array(list(map(float, line_split[2:6])))
                label = int(line_split[6])
                entity_id = int(line_split[7])

                video_info = dict(video_id=video_id,
                                  timestamp=timestamp,
                                  entity_box=entity_box,
                                  label=label,
                                  entity_id=entity_id,
                                  shot_info=shot_info)
                records_dict_by_img[img_key].append(video_info)

        for img_key, img_records in records_dict_by_img.items():
            video_id, timestamp = img_key.split(',')
            bboxes, labels, entity_ids = self.parse_img_record(img_records)
            ann = dict(gt_bboxes=bboxes,
                       gt_labels=labels,
                       entity_ids=entity_ids)
            frame_dir = video_id
            if self.data_prefix is not None:
                frame_dir = osp.join(self.data_prefix, frame_dir)
            video_info = dict(frame_dir=frame_dir,
                              video_id=video_id,
                              timestamp=int(timestamp),
                              img_key=img_key,
                              shot_info=shot_info,
                              fps=self._FPS,
                              ann=ann)
            info.append(video_info)

        return info

    def prepare_train(self, idx):
        """Build sample ``idx``, run the pipeline and pad the outputs.

        Returns:
            tuple: ``imgs[0]``, ``imgs[1]``, padded proposals, padded
            gt_bboxes, padded gt_labels, padded scores, padded entity_ids,
            the image shape as an int array, ``idx``, and the five unpadded
            lengths.
        """
        results = copy.deepcopy(self.info[idx])
        img_key = results['img_key']

        results['suffix'] = self.suffix
        results['timestamp_start'] = self.timestamp_start
        results['timestamp_end'] = self.timestamp_end

        if self.proposals is not None:
            if img_key not in self.proposals:
                # No detection for this image: fall back to one full-frame
                # proposal with score 1.
                results['proposals'] = np.array([[0, 0, 1, 1]])
                results['scores'] = np.array([1])
            else:
                proposals = self.proposals[img_key]
                assert proposals.shape[-1] in [4, 5]
                if proposals.shape[-1] == 5:
                    # Cap the threshold at the best score so at least one
                    # proposal survives.
                    thr = min(self.person_det_score_thr, max(proposals[:, 4]))
                    positive_inds = (proposals[:, 4] >= thr)
                    proposals = proposals[positive_inds]
                    proposals = proposals[:self.num_max_proposals]
                    results['proposals'] = proposals[:, :4]
                    results['scores'] = proposals[:, 4]
                else:
                    proposals = proposals[:self.num_max_proposals]
                    results['proposals'] = proposals

        ann = results.pop('ann')
        results['gt_bboxes'] = ann['gt_bboxes']
        results['gt_labels'] = ann['gt_labels']
        results['entity_ids'] = ann['entity_ids']

        ret = self.pipeline(results)
        # Pad every variable-length field to a fixed size for the
        # dataloader; the true lengths are returned alongside.
        # NOTE(review): 'proposals' and 'scores' are read from the pipeline
        # output unconditionally although they are not always set above;
        # presumably the pipeline provides them — confirm.
        len_proposals = ret['proposals'].shape[0]
        len_gt_bboxes = ret['gt_bboxes'].shape[0]
        len_gt_labels = ret['gt_labels'].shape[0]
        len_scores = ret['scores'].shape[0]
        len_entity_ids = ret['entity_ids'].shape[0]
        # NOTE(review): padding raises if a field has more than 128 rows,
        # while num_max_proposals defaults to 1000 — confirm intended limits.
        padding_len = 128
        ret['proposals'] = self.my_padding_2d(ret['proposals'], padding_len)
        ret['gt_bboxes'] = self.my_padding_2d(ret['gt_bboxes'], padding_len)
        ret['gt_labels'] = self.my_padding_2d(ret['gt_labels'], padding_len)
        ret['scores'] = self.my_padding_1d(ret['scores'], padding_len)
        ret['entity_ids'] = self.my_padding_1d(ret['entity_ids'], padding_len)
        return (ret['imgs'][0], ret['imgs'][1], ret['proposals'],
                ret['gt_bboxes'], ret['gt_labels'], ret['scores'],
                ret['entity_ids'], np.array(ret['img_shape'], dtype=int), idx,
                len_proposals, len_gt_bboxes, len_gt_labels, len_scores,
                len_entity_ids)

    def my_padding_2d(self, feat, max_len):
        """Append float32 zero rows to ``feat`` until it has ``max_len`` rows."""
        feat_add = np.zeros((max_len - feat.shape[0], feat.shape[1]),
                            dtype=np.float32)
        feat_pad = np.concatenate((feat, feat_add), axis=0)
        return feat_pad

    def my_padding_1d(self, feat, max_len):
        """Append float32 zeros to ``feat`` until it has ``max_len`` entries."""
        feat_add = np.zeros((max_len - feat.shape[0]), dtype=np.float32)
        feat_pad = np.concatenate((feat, feat_add), axis=0)
        return feat_pad

    def prepare_test(self, idx):
        """Test samples are built exactly like training samples."""
        return self.prepare_train(idx)

    def evaluate(self, results):
        """Delegate evaluation of ``results`` to ``ava_evaluate_results``."""
        return ava_evaluate_results(self.info, len(self), results,
                                    self.custom_classes, self.label_file,
                                    self.file_path, self.exclude_file)
|
@ -0,0 +1,80 @@
|
||||
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os.path as osp
|
||||
import copy
|
||||
import numpy as np
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
import paddle
|
||||
from paddle.io import Dataset
|
||||
|
||||
|
||||
class BaseDataset(Dataset, ABC):
    """Common base class of all datasets.

    Subclasses must implement ``load_file`` and usually override
    ``prepare_train`` / ``prepare_test``:

    - ``load_file``: read the index file and return the sample list.
    - ``prepare_train``: build one training/validation sample.
    - ``prepare_test``: build one test sample.

    Args:
        file_path (str): index file path.
        pipeline: callable applied to each sample dict.
        data_prefix (str): directory path of the data. Default: None.
        test_mode (bool): whether to build test dataset. Default: False.
    """

    def __init__(self, file_path, pipeline, data_prefix=None, test_mode=False):
        super().__init__()
        self.file_path = file_path
        # Resolve the prefix to a real path only when it names an existing
        # directory; otherwise store it untouched (including None).
        if data_prefix is not None and osp.isdir(data_prefix):
            self.data_prefix = osp.realpath(data_prefix)
        else:
            self.data_prefix = data_prefix
        self.test_mode = test_mode
        self.pipeline = pipeline
        self.info = self.load_file()

    @abstractmethod
    def load_file(self):
        """Read the index file and return the list of samples."""
        pass

    def prepare_train(self, idx):
        """TRAIN & VALID: return ``(imgs, labels)`` for the sample at ``idx``."""
        # paddle.io.DataLoader cannot handle dict return values for now, so
        # the sample is flattened into a tuple; the label is wrapped into a
        # one-element array.
        sample = self.pipeline(copy.deepcopy(self.info[idx]))
        return sample['imgs'], np.array([sample['labels']])

    def prepare_test(self, idx):
        """TEST: return ``(imgs, labels)`` for the sample at ``idx``."""
        # Same tuple layout as prepare_train, for the same DataLoader reason.
        sample = self.pipeline(copy.deepcopy(self.info[idx]))
        return sample['imgs'], np.array([sample['labels']])

    def __len__(self):
        """Return the number of samples."""
        return len(self.info)

    def __getitem__(self, idx):
        """Return the test sample in test mode, otherwise the train sample."""
        if not self.test_mode:
            return self.prepare_train(idx)
        return self.prepare_test(idx)
|
@ -0,0 +1,72 @@
|
||||
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import copy
|
||||
import json
|
||||
|
||||
from ..registry import DATASETS
|
||||
from .base import BaseDataset
|
||||
from ...utils import get_logger
|
||||
logger = get_logger("paddlevideo")
|
||||
|
||||
|
||||
@DATASETS.register()
class BMNDataset(BaseDataset):
    """Video dataset for action localization.

    Args:
        file_path (str): Path to a JSON annotation file mapping video names
            to dicts that contain at least a ``"subset"`` entry.
        pipeline: Callable applied to each sample dict.
        subset (str): Only videos whose ``"subset"`` contains this value
            are kept.
        **kwargs: Keyword arguments for ``BaseDataset``.
    """

    def __init__(
        self,
        file_path,
        pipeline,
        subset,
        **kwargs,
    ):
        # Set before delegating: load_file() filters on self.subset.
        self.subset = subset
        super().__init__(file_path, pipeline, **kwargs)

    def load_file(self):
        """Load index file to get video information.

        Returns:
            list[dict]: One dict per selected video with ``video_name``,
            ``video_info`` and ``video_idx`` keys, sorted by name.
        """
        # Close the annotation file deterministically instead of leaving it
        # to the garbage collector.
        with open(self.file_path) as fin:
            annos = json.load(fin)

        info = [
            dict(video_name=video_name, video_info=video_anno)
            for video_name, video_anno in annos.items()
            if self.subset in video_anno["subset"]
        ]
        # Sort by video_name so that video_idx is reproducible.
        info.sort(key=lambda elem: elem['video_name'])
        for idx, elem in enumerate(info):
            elem['video_idx'] = idx
        logger.info("{} subset video numbers: {}".format(
            self.subset, len(info)))
        return info

    def prepare_train(self, idx):
        """TRAIN & VALID: Prepare data for training/valid given the index."""
        results = copy.deepcopy(self.info[idx])
        results = self.pipeline(results)
        return results['video_feat'], results['gt_iou_map'], results['gt_start'],\
            results['gt_end']

    def prepare_test(self, idx):
        """TEST: Prepare the data for test given the index."""
        results = copy.deepcopy(self.info[idx])
        results = self.pipeline(results)
        return results['video_feat'], results['gt_iou_map'], results['gt_start'], \
            results['gt_end'], results['video_idx']
|
@ -0,0 +1,189 @@
|
||||
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
import os.path as osp
|
||||
import copy
|
||||
import random
|
||||
import numpy as np
|
||||
import shutil
|
||||
from PIL import Image
|
||||
import cv2
|
||||
from paddle.io import Dataset
|
||||
|
||||
from ..registry import DATASETS
|
||||
from .base import BaseDataset
|
||||
from ...utils import get_logger
|
||||
|
||||
logger = get_logger("paddlevideo")
|
||||
|
||||
|
||||
class VOS_Test(Dataset):
    """Per-sequence dataset that yields the frames of one video.

    Args:
        image_root (str): Root directory of the frame images.
        label_root (str): Root directory of the label images.
        seq_name (str): Sub-directory name of the sequence.
        images (list[str]): Frame file names of the sequence.
        labels (list[str]): Names of the label files that are available.
        pipeline: Optional callable applied to each sample.
        rgb (bool): When True, the channel order read by OpenCV is reversed.
        resolution (int): When given, the height reported in the sample
            meta; the reported width is scaled accordingly.
    """

    def __init__(self,
                 image_root,
                 label_root,
                 seq_name,
                 images,
                 labels,
                 pipeline=None,
                 rgb=False,
                 resolution=None):
        self.image_root = image_root
        self.label_root = label_root
        self.seq_name = seq_name
        self.images = images  # image file list
        self.labels = labels
        self.obj_num = 1
        self.num_frame = len(self.images)
        self.pipeline = pipeline
        self.rgb = rgb
        self.resolution = resolution

        # obj_nums[i] is the largest label id seen in the labelled frames
        # *before* frame i.
        self.obj_nums = []
        temp_obj_num = 0
        for img_name in self.images:
            self.obj_nums.append(temp_obj_num)
            current_label_name = img_name.split('.')[0] + '.png'
            if current_label_name in self.labels:
                current_label = self.read_label(current_label_name)
                # np.unique returns sorted values, so the last one is the
                # highest object id; compute it once.
                max_label_id = np.unique(current_label)[-1]
                if temp_obj_num < max_label_id:
                    temp_obj_num = max_label_id

    def __len__(self):
        """Return the number of frames in the sequence."""
        return len(self.images)

    def read_image(self, idx):
        """Read frame ``idx`` with OpenCV as a float32 array."""
        img_name = self.images[idx]
        img_path = os.path.join(self.image_root, self.seq_name, img_name)
        img = cv2.imread(img_path)
        img = np.array(img, dtype=np.float32)
        if self.rgb:
            # Reverse the channel order of the array returned by OpenCV.
            img = img[:, :, [2, 1, 0]]
        return img

    def read_label(self, label_name):
        """Read the label image ``label_name`` as a uint8 array."""
        label_path = os.path.join(self.label_root, self.seq_name, label_name)
        # Image.open is lazy and keeps the file open; the context manager
        # closes it as soon as the pixel data has been copied out.
        with Image.open(label_path) as label:
            return np.array(label, dtype=np.uint8)

    def __getitem__(self, idx):
        """Build the sample for frame ``idx`` and run the pipeline on it."""
        img_name = self.images[idx]
        current_img = self.read_image(idx)
        height, width, _ = current_img.shape
        if self.resolution is not None:
            width = int(np.ceil(float(width) * self.resolution / float(height)))
            height = int(self.resolution)

        current_label_name = img_name.split('.')[0] + '.png'
        obj_num = self.obj_nums[idx]

        # A label entry is only added for frames listed in self.labels.
        sample = {'current_img': current_img}
        if current_label_name in self.labels:
            sample['current_label'] = self.read_label(current_label_name)

        sample['meta'] = {
            'seq_name': self.seq_name,
            'frame_num': self.num_frame,
            'obj_num': obj_num,
            'current_name': img_name,
            'height': height,
            'width': width,
            'flip': False
        }
        if self.pipeline is not None:
            sample = self.pipeline(sample)
            # The loop below treats the pipeline output as an iterable of
            # dicts; each image is converted to a numpy array. Without a
            # pipeline the raw sample dict is returned unchanged.
            for s in sample:
                s['current_img'] = np.array(s['current_img'])
        return sample
|
||||
|
||||
|
||||
@DATASETS.register()
class DavisDataset(BaseDataset):
    """Davis 2017 dataset.

    Each item is a ``VOS_Test`` dataset over the frames of one sequence.

    Args:
        file_path (str): Dataset root containing ``JPEGImages``,
            ``Annotations`` and ``ImageSets``.
        result_root (str): Directory that receives the first-frame label of
            every sequence.
        pipeline: Callable handed to each ``VOS_Test``.
        data_prefix (str): Forwarded to ``BaseDataset``. Default: None.
        test_mode (bool): Selects the ``val`` split when True, otherwise
            ``train``. Default: False.
        year (int): Sub-directory of ``ImageSets``. Default: 2017.
        rgb (bool): Forwarded to ``VOS_Test``. Default: False.
        resolution (str): Sub-directory of ``JPEGImages`` / ``Annotations``.
            Default: '480p'.
    """

    def __init__(
        self,
        file_path,
        result_root,
        pipeline,
        data_prefix=None,
        test_mode=False,
        year=2017,
        rgb=False,
        resolution='480p',
    ):
        # All attributes are needed by load_file(), which the base
        # constructor invokes.
        self.rgb = rgb
        self.result_root = result_root
        self.resolution = resolution
        self.year = year
        if test_mode:
            self.spt = 'val'
        else:
            self.spt = 'train'
        super().__init__(file_path, pipeline, data_prefix, test_mode)

    def load_file(self):
        """Resolve the data roots and read the sequence names of the split."""
        self.image_root = os.path.join(self.file_path, 'JPEGImages',
                                       self.resolution)
        self.label_root = os.path.join(self.file_path, 'Annotations',
                                       self.resolution)
        split_file = os.path.join(self.file_path, 'ImageSets', str(self.year),
                                  self.spt + '.txt')
        with open(split_file) as f:
            seq_names = [line.strip() for line in f.readlines()]
        # np.unique both de-duplicates and sorts the names.
        self.info = list(np.unique(seq_names))
        return self.info

    def prepare_test(self, idx):
        """Return a ``VOS_Test`` dataset for the sequence at ``idx``.

        As a side effect, the label of the first frame is copied into
        ``result_root`` when it is not there yet.
        """
        seq_name = self.info[idx]  # video name
        images = list(
            np.sort(os.listdir(os.path.join(self.image_root, seq_name))))
        # Only the first frame comes with a target.
        first_label = images[0].replace('jpg', 'png')
        labels = [first_label]

        result_dir = os.path.join(self.result_root, seq_name)
        result_label_path = os.path.join(result_dir, first_label)
        if not os.path.isfile(result_label_path):
            if not os.path.exists(result_dir):
                os.makedirs(result_dir)
            source_label_path = os.path.join(self.label_root, seq_name,
                                             first_label)
            shutil.copy(source_label_path, result_label_path)

        return VOS_Test(self.image_root,
                        self.label_root,
                        seq_name,
                        images,
                        labels,
                        self.pipeline,
                        rgb=self.rgb,
                        resolution=480)
|
@ -0,0 +1,80 @@
|
||||
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import copy
|
||||
import os.path as osp
|
||||
|
||||
from ..registry import DATASETS
|
||||
from .base import BaseDataset
|
||||
|
||||
|
||||
@DATASETS.register()
class FeatureDataset(BaseDataset):
    """Feature dataset for action recognition.

    The first whitespace-separated token of every index-file line is taken
    as the feature file name.

    Args:
        file_path (str): Path to the index file.
        pipeline: Callable applied to each sample dict.
        data_prefix (str): Directory joined in front of each file name.
            Default: None.
        test_mode (bool): Whether to build the test dataset. Default: False.
        suffix (str): Text appended to each file name. Default: None.
    """

    def __init__(
        self,
        file_path,
        pipeline,
        data_prefix=None,
        test_mode=False,
        suffix=None,
    ):
        # Needed by load_file(), which the base constructor calls.
        self.suffix = suffix
        super().__init__(file_path, pipeline, data_prefix, test_mode)

    def load_file(self):
        """Load index file to get video information."""
        samples = []
        with open(self.file_path, 'r') as fin:
            for raw_line in fin:
                name = raw_line.strip().split()[0]
                if self.data_prefix is not None:
                    name = osp.join(self.data_prefix, name)
                if self.suffix is not None:
                    name = name + self.suffix
                samples.append(dict(filename=name))
        return samples

    def prepare_train(self, idx):
        """TRAIN & VALID. Prepare the data for training/valid given the index."""
        out = self.pipeline(copy.deepcopy(self.info[idx]))

        # 'iou_norm' is appended to the returned tuple only when the
        # pipeline produced it.
        keys = [
            'rgb_data', 'rgb_len', 'rgb_mask', 'audio_data', 'audio_len',
            'audio_mask', 'labels'
        ]
        if 'iou_norm' in out:
            keys.append('iou_norm')
        return tuple(out[key] for key in keys)

    def prepare_test(self, idx):
        """TEST. Prepare the data for testing given the index."""
        out = self.pipeline(copy.deepcopy(self.info[idx]))

        # Same tuple layout as prepare_train.
        keys = [
            'rgb_data', 'rgb_len', 'rgb_mask', 'audio_data', 'audio_len',
            'audio_mask', 'labels'
        ]
        if 'iou_norm' in out:
            keys.append('iou_norm')
        return tuple(out[key] for key in keys)
|
@ -0,0 +1,177 @@
|
||||
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os.path as osp
|
||||
import copy
|
||||
import random
|
||||
import numpy as np
|
||||
|
||||
from ..registry import DATASETS
|
||||
from .base import BaseDataset
|
||||
from ...utils import get_logger
|
||||
|
||||
logger = get_logger("paddlevideo")
|
||||
|
||||
|
||||
@DATASETS.register()
|
||||
class FrameDataset(BaseDataset):
|
||||
"""Rawframe dataset for action recognition.
|
||||
The dataset loads raw frames from frame files, and apply specified transform operatation them.
|
||||
The indecx file is a text file with multiple lines, and each line indicates the directory of frames of a video, toatl frames of the video, and its label, which split with a whitespace.
|
||||
Example of an index file:
|
||||
|
||||
.. code-block:: txt
|
||||
|
||||
file_path-1 150 1
|
||||
file_path-2 160 1
|
||||
file_path-3 170 2
|
||||
file_path-4 180 2
|
||||
|
||||
Args:
|
||||
file_path (str): Path to the index file.
|
||||
pipeline(XXX):
|
||||
data_prefix (str): directory path of the data. Default: None.
|
||||
test_mode (bool): Whether to bulid the test dataset. Default: False.
|
||||
suffix (str): suffix of file. Default: 'img_{:05}.jpg'.
|
||||
|
||||
"""
|
||||
def __init__(self,
|
||||
file_path,
|
||||
pipeline,
|
||||
num_retries=5,
|
||||
data_prefix=None,
|
||||
test_mode=False,
|
||||
suffix='img_{:05}.jpg'):
|
||||
self.num_retries = num_retries
|
||||
self.suffix = suffix
|
||||
super().__init__(file_path, pipeline, data_prefix, test_mode)
|
||||
|
||||
def load_file(self):
|
||||
"""Load index file to get video information."""
|
||||
info = []
|
||||
with open(self.file_path, 'r') as fin:
|
||||
for line in fin:
|
||||
line_split = line.strip().split()
|
||||
frame_dir, frames_len, labels = line_split
|
||||
if self.data_prefix is not None:
|
||||
frame_dir = osp.join(self.data_prefix, frame_dir)
|
||||
info.append(
|
||||
dict(frame_dir=frame_dir,
|
||||
suffix=self.suffix,
|
||||
frames_len=frames_len,
|
||||
labels=int(labels)))
|
||||
return info
|
||||
|
||||
def prepare_train(self, idx):
|
||||
"""Prepare the frames for training/valid given index. """
|
||||
#Try to catch Exception caused by reading missing frames files
|
||||
for ir in range(self.num_retries):
|
||||
try:
|
||||
results = copy.deepcopy(self.info[idx])
|
||||
results = self.pipeline(results)
|
||||
except Exception as e:
|
||||
#logger.info(e)
|
||||
if ir < self.num_retries - 1:
|
||||
logger.info(
|
||||
"Error when loading {}, have {} trys, will try again".
|
||||
format(results['frame_dir'], ir))
|
||||
idx = random.randint(0, len(self.info) - 1)
|
||||
continue
|
||||
return results['imgs'], np.array([results['labels']])
|
||||
|
||||
def prepare_test(self, idx):
    """Prepare the frames for test given index.

    The record at ``idx`` is deep-copied and passed through
    ``self.pipeline``. If that raises, another index is drawn at random and
    the attempt is repeated, up to ``self.num_retries`` attempts in total.

    Args:
        idx (int): Index into ``self.info``.

    Returns:
        tuple: ``(results['imgs'], np.array([results['labels']]))`` of the
            first sample that went through the pipeline without error.

    Raises:
        RuntimeError: If no attempt succeeded. The last pipeline error is
            attached as ``__cause__``.
    """
    last_error = None
    # Try to catch Exception caused by reading missing frames files
    for ir in range(self.num_retries):
        try:
            results = copy.deepcopy(self.info[idx])
            results = self.pipeline(results)
        except Exception as e:
            last_error = e
            if ir < self.num_retries - 1:
                logger.info(
                    "Error when loading {}, have {} trys, will try again".
                    format(results['frame_dir'], ir))
            # Fall back to a randomly chosen sample for the next attempt.
            idx = random.randint(0, len(self.info) - 1)
            continue
        return results['imgs'], np.array([results['labels']])
    # Previously the loop simply fell through here and the method returned
    # None, which only surfaced later as an unrelated error in the caller.
    raise RuntimeError(
        "Failed to load a sample after {} attempt(s)".format(
            self.num_retries)) from last_error
|
||||
|
||||
|
||||
@DATASETS.register()
class FrameDataset_Sport(BaseDataset):
    """Frame-directory dataset for action recognition.

    The index file is a text file with one sample per line. Only the first
    whitespace-separated field of each line is read and stored as the
    sample's ``frame_dir``; any further fields on the line are ignored.

    Example of an index file:

    .. code-block:: txt

        path/000.mp4 1
        path/001.mp4 1
        path/002.mp4 2
        path/003.mp4 2

    Args:
        file_path(str): Path to the index file.
        pipeline(XXX): A sequence of data transforms.
        num_retries(int): Number of loading attempts per requested sample.
            Default: 5.
        suffix(str): Stored unchanged in every record under the key
            ``suffix``. Default: ''.
        **kwargs: Keyword arguments for ```BaseDataset```.
    """

    def __init__(self, file_path, pipeline, num_retries=5, suffix='', **kwargs):
        # NOTE(review): assigned before the base initialiser runs,
        # presumably because it calls load_file(), which reads self.suffix.
        self.num_retries = num_retries
        self.suffix = suffix
        super().__init__(file_path, pipeline, **kwargs)

    def load_file(self):
        """Load index file to get video information.

        Returns:
            list[dict]: One ``dict(frame_dir=..., suffix=...)`` per
                non-empty line of the index file.
        """
        info = []
        with open(self.file_path, 'r') as fin:
            for line in fin:
                line_split = line.strip().split()
                # Blank lines carry no sample; indexing them would raise
                # IndexError.
                if not line_split:
                    continue
                frame_dir = line_split[0]
                if self.data_prefix is not None:
                    frame_dir = osp.join(self.data_prefix, frame_dir)
                info.append(dict(frame_dir=frame_dir, suffix=self.suffix))
        return info

    def _load_with_retries(self, idx):
        """Run the pipeline on sample ``idx``, retrying on a random sample.

        Returns:
            tuple: ``(results['imgs'], np.array([results['labels']]))``.
                The records built by ``load_file`` hold no ``labels`` key,
                so the pipeline has to provide it.

        Raises:
            RuntimeError: If none of the ``self.num_retries`` attempts
                succeeded; the last error is attached as ``__cause__``.
        """
        last_error = None
        for ir in range(self.num_retries):
            try:
                results = copy.deepcopy(self.info[idx])
                results = self.pipeline(results)
            except Exception as e:
                last_error = e
                if ir < self.num_retries - 1:
                    # When the pipeline fails, `results` is still the raw
                    # record, which only has 'frame_dir' and 'suffix'.
                    # The former 'filename' lookup raised KeyError here.
                    logger.info(
                        "Error when loading {}, have {} trys, will try again".
                        format(results['frame_dir'], ir))
                idx = random.randint(0, len(self.info) - 1)
                continue
            return results['imgs'], np.array([results['labels']])
        # Falling through used to return None silently.
        raise RuntimeError(
            "Failed to load a sample after {} attempt(s)".format(
                self.num_retries)) from last_error

    def prepare_train(self, idx):
        """TRAIN & VALID. Prepare the data for training/valid given the index."""
        return self._load_with_retries(idx)

    def prepare_test(self, idx):
        """TEST. Prepare the data for test given the index."""
        return self._load_with_retries(idx)
|
@ -0,0 +1,110 @@
|
||||
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import copy
|
||||
|
||||
import os
|
||||
import numpy as np
|
||||
|
||||
from ..registry import DATASETS
|
||||
from .base import BaseDataset
|
||||
from ...utils import get_logger
|
||||
|
||||
logger = get_logger("paddlevideo")
|
||||
|
||||
|
||||
@DATASETS.register()
class MSTCNDataset(BaseDataset):
    """Video dataset for action segmentation.

    Args:
        file_path (str): Path to the index file; each line is used as the
            name of a ground-truth file inside ``gt_path``.
        pipeline: Callable applied to the dict holding ``video_feat`` and
            ``video_gt``.
        feature_path (str): Directory with the ``.npy`` feature files.
        gt_path (str): Directory with the per-video label files.
        actions_map_file_path (str): Mapping file; the first field of each
            line is the integer class id, the second the action name.
        **kwargs: Keyword arguments for ``BaseDataset``.

    NOTE: all three text files are split on ``'\\n'`` and the final element
    is dropped, so each file is expected to end with a newline.
    """

    def __init__(
        self,
        file_path,
        pipeline,
        feature_path,
        gt_path,
        actions_map_file_path,
        **kwargs,
    ):
        super().__init__(file_path, pipeline, **kwargs)
        self.gt_path = gt_path
        self.actions_map_file_path = actions_map_file_path
        self.feature_path = feature_path

        # actions dict generate
        with open(self.actions_map_file_path, 'r') as file_ptr:
            actions = file_ptr.read().split('\n')[:-1]
        self.actions_dict = dict()
        for a in actions:
            fields = a.split()
            self.actions_dict[fields[1]] = int(fields[0])

        self.num_classes = len(self.actions_dict.keys())

    def load_file(self):
        """Load index file to get video information.

        Returns:
            list[str]: The lines of the index file.
        """
        with open(self.file_path, 'r') as file_ptr:
            info = file_ptr.read().split('\n')[:-1]
        return info

    def _load_sample(self, idx, label_dtype):
        """Load features and frame labels of sample ``idx`` and run the pipeline.

        Args:
            idx (int): Index into ``self.info``.
            label_dtype (str): dtype of the label array that is built.

        Returns:
            tuple: ``(results['video_feat'], results['video_gt'])``.
        """
        video_name = self.info[idx]
        # load video feature
        file_name = video_name.split('.')[0] + ".npy"
        feat_file_path = os.path.join(self.feature_path, file_name)
        # TODO: check path
        video_feat = np.load(feat_file_path)

        # load label; the context manager closes the handle, which the
        # previous code left open on every call.
        target_file_path = os.path.join(self.gt_path, video_name)
        with open(target_file_path, 'r') as file_ptr:
            content = file_ptr.read().split('\n')[:-1]

        # One label per frame, limited to whichever is shorter: the second
        # axis of the feature array or the label file.
        classes = np.zeros(min(np.shape(video_feat)[1], len(content)),
                           dtype=label_dtype)
        for i in range(len(classes)):
            classes[i] = self.actions_dict[content[i]]

        results = {
            'video_feat': copy.deepcopy(video_feat),
            'video_gt': copy.deepcopy(classes),
        }
        results = self.pipeline(results)
        return results['video_feat'], results['video_gt']

    def prepare_train(self, idx):
        """TRAIN & VALID: Prepare data for training/valid given the index."""
        return self._load_sample(idx, 'int64')

    def prepare_test(self, idx):
        """TEST: Prepare the data for test given the index."""
        # NOTE(review): the test path has always built its labels with
        # NumPy's default float dtype, unlike the int64 used for training;
        # that is preserved here -- confirm whether it is intentional.
        return self._load_sample(idx, 'float64')
|
@ -0,0 +1,220 @@
|
||||
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os.path as osp
|
||||
import copy
|
||||
import random
|
||||
import numpy as np
|
||||
try:
|
||||
import lmdb
|
||||
except ImportError as e:
|
||||
print(
|
||||
f"Warning! {e}, [lmdb] package and it's dependencies is required for ActBERT."
|
||||
)
|
||||
import pickle
|
||||
try:
|
||||
from paddlenlp.transformers import BertTokenizer
|
||||
except ImportError as e:
|
||||
print(
|
||||
f"Warning! {e}, [paddlenlp] package and it's dependencies is required for ActBERT."
|
||||
)
|
||||
from ..registry import DATASETS
|
||||
from .base import BaseDataset
|
||||
from ...utils import get_logger
|
||||
|
||||
logger = get_logger("paddlevideo")
|
||||
|
||||
|
||||
@DATASETS.register()
class MSRVTTDataset(BaseDataset):
    """MSR-VTT dataset for text-video clip retrieval.

    Every line of the index file is split on ``','``: field 0 is used as
    the video id and field 1 as the caption. Visual and action features are
    read from an LMDB database at ``features_path`` and are all loaded into
    memory when the dataset is constructed.

    Args:
        file_path (str): Path to the index file.
        pipeline: Forwarded to the base-class initialiser.
        features_path (str): Path passed to ``lmdb.open``.
        bert_model (str): Name passed to ``BertTokenizer.from_pretrained``.
        padding_index (int): Token id used to pad captions. Default: 0.
        max_seq_length (int): Length captions are padded to. Default: 36.
        max_region_num (int): Regions kept per video. Default: 36.
        max_action_num (int): Action-feature rows per video. Default: 5.
        vision_feature_dim (int): Width of a region feature. Default: 2048.
        action_feature_dim (int): Width of an action feature. Default: 2048.
        spatials_dim (int): Width of a location row. Default: 5.
        data_prefix (str): Forwarded to the base-class initialiser.
        test_mode (bool): Forwarded to the base-class initialiser.
    """

    def __init__(
        self,
        file_path,
        pipeline,
        features_path,
        bert_model="bert-base-uncased",
        padding_index=0,
        max_seq_length=36,
        max_region_num=36,
        max_action_num=5,
        vision_feature_dim=2048,
        action_feature_dim=2048,
        spatials_dim=5,
        data_prefix=None,
        test_mode=False,
    ):
        self.features_path = features_path
        self.bert_model = bert_model
        self.padding_index = padding_index
        self.max_seq_length = max_seq_length
        self.max_region_num = max_region_num
        self._max_action_num = max_action_num
        self.vision_feature_dim = vision_feature_dim
        self.action_feature_dim = action_feature_dim
        self.spatials_dim = spatials_dim
        self._tokenizer = BertTokenizer.from_pretrained(bert_model,
                                                        do_lower_case=True)
        super().__init__(file_path, pipeline, data_prefix, test_mode)
        self.tokenize()
        self.gen_feature()

    def load_file(self):
        """Load index file to get video information.

        Fills ``self.image_entries`` and ``self.caption_entries`` and opens
        the LMDB environment. Nothing is returned.
        """
        with open(self.file_path) as fin:
            self.image_entries = []
            self.caption_entries = []
            for line in fin.readlines():
                line = line.strip()
                fields = line.split(',')
                vid_id = fields[0]
                self.image_entries.append(vid_id)
                self.caption_entries.append({
                    "caption": fields[1],
                    "vid_id": vid_id
                })
        self.env = lmdb.open(self.features_path)

    def tokenize(self):
        """Add ``token``, ``input_mask`` and ``segment_ids`` to each caption entry.

        NOTE: captions longer than ``max_seq_length`` are not truncated.
        """
        for entry in self.caption_entries:
            tokens = ["[CLS]"]
            tokens.extend(self._tokenizer.tokenize(entry["caption"]))
            tokens.append("[SEP]")
            tokens = self._tokenizer.convert_tokens_to_ids(tokens)

            segment_ids = [0] * len(tokens)
            input_mask = [1] * len(tokens)

            if len(tokens) < self.max_seq_length:
                pad_len = self.max_seq_length - len(tokens)
                tokens = tokens + [self.padding_index] * pad_len
                # Padding positions must be 0 in the mask and the segment
                # ids whatever the padding token id is; they used to be
                # filled with padding_index, which is only right when it
                # equals 0.
                input_mask += [0] * pad_len
                segment_ids += [0] * pad_len

            entry["token"] = np.array(tokens).astype('int64')
            entry["input_mask"] = np.array(input_mask)
            entry["segment_ids"] = np.array(segment_ids).astype('int64')

    def get_image_feature(self, video_id):
        """Read one video's features from LMDB.

        Returns:
            tuple: ``(features, num_boxes, image_location, action_features)``.
                ``features`` and ``image_location`` each carry one extra
                leading row: the mean of all region features and the
                constant location ``[0, 0, 1, 1, 1]``; ``num_boxes``
                counts that extra row.
        """
        video_id = str(video_id).encode()
        with self.env.begin(write=False) as txn:
            # NOTE(review): pickle.loads executes arbitrary code from the
            # payload; only use feature databases from a trusted source.
            item = pickle.loads(txn.get(video_id))

        image_h = int(item["image_h"])
        image_w = int(item["image_w"])

        features = item["features"].reshape(-1, self.vision_feature_dim)
        boxes = item["boxes"].reshape(-1, 4)

        num_boxes = features.shape[0]
        g_feat = np.sum(features, axis=0) / num_boxes
        num_boxes = num_boxes + 1
        features = np.concatenate(
            [np.expand_dims(g_feat, axis=0), features], axis=0)

        action_features = item["action_features"].reshape(
            -1, self.action_feature_dim)

        image_location = np.zeros((boxes.shape[0], self.spatials_dim),
                                  dtype=np.float32)
        image_location[:, :4] = boxes
        # Column 4: box area as a fraction of the image area, computed
        # before the coordinates are normalised below.
        image_location[:, 4] = (
            (image_location[:, 3] - image_location[:, 1]) *
            (image_location[:, 2] - image_location[:, 0]) /
            (float(image_w) * float(image_h)))

        image_location[:, 0] = image_location[:, 0] / float(image_w)
        image_location[:, 1] = image_location[:, 1] / float(image_h)
        image_location[:, 2] = image_location[:, 2] / float(image_w)
        image_location[:, 3] = image_location[:, 3] / float(image_h)

        g_location = np.array([0, 0, 1, 1, 1])
        image_location = np.concatenate(
            [np.expand_dims(g_location, axis=0), image_location], axis=0)
        return features, num_boxes, image_location, action_features

    def gen_feature(self):
        """Load the features of every video into zero-padded arrays."""
        num_inst = len(self.image_entries)
        self.features_all = np.zeros(
            (num_inst, self.max_region_num, self.vision_feature_dim))
        self.action_features_all = np.zeros(
            (num_inst, self._max_action_num, self.action_feature_dim))
        self.spatials_all = np.zeros(
            (num_inst, self.max_region_num, self.spatials_dim))
        self.image_mask_all = np.zeros((num_inst, self.max_region_num))
        self.action_mask_all = np.zeros((num_inst, self._max_action_num))

        for i, image_id in enumerate(self.image_entries):
            features, num_boxes, boxes, action_features = self.get_image_feature(
                image_id)

            mix_num_boxes = min(int(num_boxes), self.max_region_num)

            # The target arrays start out as zeros, so writing the kept
            # rows is all the padding that is needed.
            self.features_all[i, :mix_num_boxes] = features[:mix_num_boxes]
            self.spatials_all[i, :mix_num_boxes] = boxes[:mix_num_boxes]
            self.image_mask_all[i, :mix_num_boxes] = 1

            x = action_features.shape[0]
            self.action_features_all[i][:x] = action_features[:]
            # Every action slot is marked valid, as before (the old padding
            # loop for this mask could never run).
            self.action_mask_all[i] = 1

        self.features_all = self.features_all.astype("float32")
        self.action_features_all = self.action_features_all.astype("float32")
        self.image_mask_all = self.image_mask_all.astype("int64")
        self.action_mask_all = self.action_mask_all.astype("int64")
        self.spatials_all = self.spatials_all.astype("float32")

    def prepare_train(self, idx):
        """Not implemented; returns None."""
        pass

    def prepare_test(self, idx):
        """Return caption ``idx`` together with the features of all videos.

        The last element is a 0/1 vector with one entry per video in
        ``self.image_entries``; it is 1 where the video id equals the
        caption's ``vid_id``.
        """
        entry = self.caption_entries[idx]
        caption = entry["token"]
        input_mask = entry["input_mask"]
        segment_ids = entry["segment_ids"]

        # Sized from the gallery instead of a hard-coded 1000, so index
        # files with a different number of videos work too.
        target_all = np.zeros(len(self.image_entries))
        for i, image_id in enumerate(self.image_entries):
            if image_id == entry["vid_id"]:
                target_all[i] = 1

        return (
            caption,
            self.action_features_all,
            self.features_all,
            self.spatials_all,
            segment_ids,
            input_mask,
            self.image_mask_all,
            self.action_mask_all,
            target_all,
        )

    def __len__(self):
        """Return the number of captions."""
        return len(self.caption_entries)
|
@ -0,0 +1,62 @@
|
||||
# Copyright Niantic 2019. Patent Pending. All rights reserved.
|
||||
#
|
||||
# This software is licensed under the terms of the Monodepth2 licence
|
||||
# which allows for non-commercial use only, the full terms of which are made
|
||||
# available in the LICENSE file.
|
||||
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import copy
|
||||
from os import path as osp
|
||||
|
||||
from PIL import Image
|
||||
|
||||
from ..registry import DATASETS
|
||||
from .base import BaseDataset
|
||||
|
||||
|
||||
def pil_loader(path):
    """Open the image at ``path`` and return it converted to RGB."""
    # The file is opened explicitly to avoid a ResourceWarning
    # (https://github.com/python-pillow/Pillow/issues/835)
    with open(path, 'rb') as handle:
        with Image.open(handle) as source:
            rgb_image = source.convert('RGB')
    return rgb_image
|
||||
|
||||
|
||||
@DATASETS.register()
class MonoDataset(BaseDataset):
    """Dataset whose index file lists one image path (without suffix) per line.

    Args:
        file_path (str): Path to the index file.
        data_prefix (str): Stored in every record under ``data_path``.
        pipeline: Callable applied to each record.
        num_retries (int): Stored on the instance. Default: 0.
        suffix (str): Appended to every index line to form the file name.
            Default: '.png'.
        **kwargs: Further keyword arguments for ``BaseDataset``.
    """

    def __init__(self,
                 file_path,
                 data_prefix,
                 pipeline,
                 num_retries=0,
                 suffix='.png',
                 **kwargs):
        self.num_retries = num_retries
        self.suffix = suffix
        super().__init__(file_path, pipeline, data_prefix, **kwargs)

    def load_file(self):
        """Build one record per index line.

        The second ``'/'``-separated component of each line is parsed as
        the integer ``frame_index``.
        """
        records = []
        with open(self.file_path, 'r') as f:
            for raw_line in f:
                entry = raw_line.strip()
                filename = entry + self.suffix
                records.append({
                    'data_path': self.data_prefix,
                    'filename': filename,
                    'folder': osp.dirname(filename),
                    'frame_index': int(entry.split('/')[1]),
                })
        return records

    def prepare_train(self, idx):
        """Run the pipeline on record ``idx`` and tag its images with ``idx``."""
        sample = self.pipeline(copy.deepcopy(self.info[idx]))
        sample['imgs']['idx'] = idx
        return sample['imgs'], sample['day_or_night']

    def prepare_test(self, idx):
        """Run the pipeline on record ``idx``."""
        sample = self.pipeline(copy.deepcopy(self.info[idx]))
        return sample['imgs'], sample['day_or_night']
|
@ -0,0 +1,78 @@
|
||||
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os.path as osp
|
||||
import copy
|
||||
import random
|
||||
import numpy as np
|
||||
import pickle
|
||||
|
||||
from ..registry import DATASETS
|
||||
from .base import BaseDataset
|
||||
from ...utils import get_logger
|
||||
|
||||
logger = get_logger("paddlevideo")
|
||||
|
||||
|
||||
@DATASETS.register()
class SkeletonDataset(BaseDataset):
    """
    Skeleton dataset for action recognition.
    The data array is loaded from ``file_path`` with ``np.load``; labels,
    when given, are loaded from ``label_path``.
    Args:
        file_path (str): Path to the data file.
        pipeline(obj): Define the pipeline of data preprocessing.
        label_path (str): Path to the label file; a name ending in 'npy'
            is read with ``np.load``, one ending in 'pkl' is unpickled as
            a ``(sample_name, label)`` pair. Default: None.
        test_mode (bool): Whether to build the test dataset. Default: False.
    """

    def __init__(self, file_path, pipeline, label_path=None, test_mode=False):
        self.label_path = label_path
        super().__init__(file_path, pipeline, test_mode=test_mode)

    def load_file(self):
        """Load feature file to get skeleton information."""
        logger.info("Loading data, it will take some moment...")
        self.data = np.load(self.file_path)

        label_path = self.label_path
        if not label_path:
            logger.info(
                "Label path not provided when test_mode={}, here just output predictions."
                .format(self.test_mode))
        elif label_path.endswith('npy'):
            self.label = np.load(label_path)
        elif label_path.endswith('pkl'):
            # NOTE(review): unpickling runs arbitrary code from the file;
            # only load label files from a trusted source.
            with open(label_path, 'rb') as f:
                _, self.label = pickle.load(f)
        # NOTE: any other extension leaves self.label unset.

        logger.info("Data Loaded!")
        return self.data  # used for __len__

    def prepare_train(self, idx):
        """Prepare the feature for training/valid given index. """
        sample = {
            'data': copy.deepcopy(self.data[idx]),
            'label': copy.deepcopy(self.label[idx]),
        }
        sample = self.pipeline(sample)
        return sample['data'], sample['label']

    def prepare_test(self, idx):
        """Prepare the feature for test given index.

        Returns ``(data, label)`` when a label path was given, otherwise a
        one-element list holding only the data.
        """
        sample = {'data': copy.deepcopy(self.data[idx])}
        if not self.label_path:
            return [self.pipeline(sample)['data']]
        sample['label'] = copy.deepcopy(self.label[idx])
        sample = self.pipeline(sample)
        return sample['data'], sample['label']
|
@ -0,0 +1,143 @@
|
||||
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os.path as osp
|
||||
import copy
|
||||
import random
|
||||
import numpy as np
|
||||
|
||||
from ..registry import DATASETS
|
||||
from .base import BaseDataset
|
||||
from ...utils import get_logger
|
||||
|
||||
logger = get_logger("paddlevideo")
|
||||
|
||||
@DATASETS.register()
class SFVideoDataset(BaseDataset):
    """Video dataset for action recognition
    The dataset loads raw videos and apply specified transforms on them.

    The index file is a file with multiple lines, and each line indicates
    a sample video with the filepath and label, which are split with a whitespace.
    Example of an index file:

    .. code-block:: txt

        path/000.mp4 1
        path/001.mp4 1
        path/002.mp4 2
        path/003.mp4 2

    Args:
        file_path(str): Path to the index file.
        pipeline(XXX): A sequence of data transforms.
        num_ensemble_views(int): temporal segment when multi-crop test
        num_spatial_crops(int): spatial crop number when multi-crop test
        num_retries(int): Number of loading attempts per requested sample.
        num_samples_precise_bn(int): When not None, caps the value
            reported by ``__len__``.
        **kwargs: Keyword arguments for ```BaseDataset```.

    """

    def __init__(
        self,
        file_path,
        pipeline,
        num_ensemble_views=1,
        num_spatial_crops=1,
        num_retries=5,
        num_samples_precise_bn=None,
        **kwargs,
    ):
        self.num_ensemble_views = num_ensemble_views
        self.num_spatial_crops = num_spatial_crops
        self.num_retries = num_retries
        self.num_samples_precise_bn = num_samples_precise_bn
        super().__init__(file_path, pipeline, **kwargs)
        # set random seed (NOTE: this reseeds the process-wide `random`
        # and NumPy generators, not just this dataset)
        random.seed(0)
        np.random.seed(0)

    def load_file(self):
        """Load index file to get video information.

        Each video yields ``num_ensemble_views * num_spatial_crops``
        records, one per (temporal, spatial) sample index pair.
        """
        info = []
        with open(self.file_path, 'r') as fin:
            for line in fin:
                line_split = line.strip().split()
                # Blank lines carry no sample; unpacking them would raise
                # ValueError.
                if not line_split:
                    continue
                filename, labels = line_split
                if self.data_prefix is not None:
                    filename = osp.join(self.data_prefix, filename)
                for tidx in range(self.num_ensemble_views):
                    for sidx in range(self.num_spatial_crops):
                        info.append(
                            dict(
                                filename=filename,
                                labels=int(labels),
                                temporal_sample_index=tidx,
                                spatial_sample_index=sidx,
                                temporal_num_clips=self.num_ensemble_views,
                                spatial_num_clips=self.num_spatial_crops,
                            ))
        return info

    def prepare_train(self, idx):
        """TRAIN & VALID. Prepare the data for training given the index.

        Args:
            idx (int | tuple): Sample index, or an
                ``(index, short_cycle_idx)`` pair for the multi-grid short
                cycle; the second element is then stored in the record
                under ``short_cycle_idx`` before the pipeline runs.

        Raises:
            RuntimeError: If none of the ``self.num_retries`` attempts
                succeeded; the last error is attached as ``__cause__``.
        """
        short_cycle = False
        if isinstance(idx, tuple):
            idx, short_cycle_idx = idx
            short_cycle = True

        last_error = None
        # Try to catch Exception caused by reading corrupted video file
        for ir in range(self.num_retries):
            try:
                results = copy.deepcopy(self.info[idx])
                # Multi-grid short cycle
                if short_cycle:
                    results['short_cycle_idx'] = short_cycle_idx
                results = self.pipeline(results)
            except Exception as e:
                last_error = e
                if ir < self.num_retries - 1:
                    logger.info(
                        "Error when loading {}, have {} trys, will try again".
                        format(results['filename'], ir))
                idx = random.randint(0, len(self.info) - 1)
                continue

            return results['imgs'][0], results['imgs'][1], np.array(
                [results['labels']])
        # Falling through used to return None silently.
        raise RuntimeError(
            "Failed to load a sample after {} attempt(s)".format(
                self.num_retries)) from last_error

    def prepare_test(self, idx):
        """TEST. Prepare the data for test given the index.

        The returned tuple ends with ``np.array([idx])``; after a failed
        attempt ``idx`` is the randomly drawn replacement index.

        Raises:
            RuntimeError: If none of the ``self.num_retries`` attempts
                succeeded; the last error is attached as ``__cause__``.
        """
        last_error = None
        # Try to catch Exception caused by reading corrupted video file
        for ir in range(self.num_retries):
            try:
                results = copy.deepcopy(self.info[idx])
                results = self.pipeline(results)
            except Exception as e:
                last_error = e
                logger.info(e)
                if ir < self.num_retries - 1:
                    logger.info(
                        "Error when loading {}, have {} trys, will try again".
                        format(results['filename'], ir))
                idx = random.randint(0, len(self.info) - 1)
                continue
            return results['imgs'][0], results['imgs'][1], np.array(
                [results['labels']]), np.array([idx])
        # Falling through used to return None silently.
        raise RuntimeError(
            "Failed to load a sample after {} attempt(s)".format(
                self.num_retries)) from last_error

    def __len__(self):
        """get the size of the dataset."""
        if self.num_samples_precise_bn is None:
            return len(self.info)
        # NOTE(review): every call reshuffles self.info in place as a side
        # effect -- presumably so that the capped prefix is a random
        # subset; confirm this is intended.
        random.shuffle(self.info)
        return min(self.num_samples_precise_bn, len(self.info))
|
@ -0,0 +1,89 @@
|
||||
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os.path as osp
|
||||
import copy
|
||||
import random
|
||||
import numpy as np
|
||||
import pickle
|
||||
|
||||
import paddle
|
||||
from paddle.io import Dataset
|
||||
|
||||
from ..registry import DATASETS
|
||||
from .base import BaseDataset
|
||||
from ...utils import get_logger
|
||||
|
||||
logger = get_logger("paddlevideo")
|
||||
|
||||
|
||||
@DATASETS.register()
class UCF101SkeletonDataset(BaseDataset):
    """
    Skeleton dataset for action recognition.
    Annotations are read from a pickle file and each one is handed to the
    pipeline together with a modality tag and a start index.
    Args:
        file_path (str): Path to the ``.pkl`` annotation file.
        pipeline(obj): Define the pipeline of data preprocessing.
        split: Key into the annotation file's ``split`` table; when falsy
            the unpickled object is used as the annotation list directly.
        repeat_times (int): Factor applied to the reported dataset length.
        test_mode (bool): Whether to build the test dataset. Default: False.
    """

    def __init__(self,
                 file_path,
                 pipeline,
                 split,
                 repeat_times,
                 test_mode=False):
        self.split = split
        self.repeat_times = repeat_times
        super().__init__(file_path, pipeline, test_mode=test_mode)
        self._ori_len = len(self.info)
        self.start_index = 0
        self.modality = "Pose"

    def load_file(self):
        """Load annotation file to get video information."""
        assert self.file_path.endswith('.pkl')
        return self.load_pkl_annotations()

    def load_pkl_annotations(self):
        """Unpickle the annotation file and, if a split is set, filter by it."""
        # NOTE(review): unpickling runs arbitrary code from the file; only
        # load annotation files from a trusted source.
        with open(self.file_path, "rb") as f:
            payload = pickle.load(f)

        if not self.split:
            return payload

        split_table, annotations = payload['split'], payload['annotations']
        # Annotations are keyed by 'filename' when present, else 'frame_dir'.
        if 'filename' in annotations[0]:
            identifier = 'filename'
        else:
            identifier = 'frame_dir'
        wanted = split_table[self.split]
        return [item for item in annotations if item[identifier] in wanted]

    def _build_sample(self, idx):
        """Copy annotation ``idx`` (wrapped to the original length) and run the pipeline."""
        sample = copy.deepcopy(self.info[idx % self._ori_len])
        sample['modality'] = self.modality
        sample['start_index'] = self.start_index
        return self.pipeline(sample)

    def prepare_train(self, idx):
        """Prepare the frames for training given the index."""
        return self._build_sample(idx)

    def prepare_test(self, idx):
        """Prepare the frames for testing given the index."""
        return self._build_sample(idx)

    def __len__(self):
        """get the size of the dataset."""
        return self.repeat_times * len(self.info)
|
@ -0,0 +1,76 @@
|
||||
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import copy
|
||||
import numpy as np
|
||||
|
||||
from ..registry import DATASETS
|
||||
from .base import BaseDataset
|
||||
from ...utils import get_logger
|
||||
|
||||
logger = get_logger("paddlevideo")
|
||||
|
||||
|
||||
@DATASETS.register()
class UCF24Dataset(BaseDataset):
    """Dataset for YOWO
    Each line of the index file is the path of a key-frame label file,
    e.g. 'data/ucf24/labels/class_name/video_name/key_frame.txt'. The image
    path is derived from it by replacing 'txt' with 'jpg' and 'labels' with
    'rgb-images'.

    Args:
        file_path(str): Path to the index file.
        pipeline(XXX): A sequence of data transforms.
        num_retries(int): Stored on the instance. Default: 5.
        **kwargs: Keyword arguments for ```BaseDataset```.
    """

    def __init__(self, file_path, pipeline, num_retries=5, **kwargs):
        self.num_retries = num_retries
        super().__init__(file_path, pipeline, **kwargs)

    def load_file(self):
        """Load index file to get video information."""
        with open(self.file_path, 'r') as fin:
            label_paths = [raw.strip() for raw in fin.readlines()]
        # key frame path: swap the extension and the top-level folder name
        return [
            dict(filename=label_path.replace('txt', 'jpg').replace(
                'labels', 'rgb-images')) for label_path in label_paths
        ]

    def _run_pipeline(self, idx):
        """Run the pipeline on record ``idx`` and derive its frame identifier.

        The identifier joins path components 3, 4 and 5 of the file name
        (with 'jpg' replaced by 'txt') using underscores.
        """
        sample = self.pipeline(copy.deepcopy(self.info[idx]))
        parts = sample['filename'].replace('jpg', 'txt').split('/')
        frame_index = '_'.join((parts[3], parts[4], parts[5]))
        return sample['imgs'], np.array([sample['labels']]), frame_index

    def prepare_train(self, idx):
        """TRAIN & VALID. Prepare the data for training/valid given the index."""
        return self._run_pipeline(idx)

    def prepare_test(self, idx):
        """TEST. Prepare the data for test given the index."""
        return self._run_pipeline(idx)
|
@ -0,0 +1,95 @@
|
||||
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os.path as osp
|
||||
import copy
|
||||
import random
|
||||
import numpy as np
|
||||
|
||||
from ..registry import DATASETS
|
||||
from .base import BaseDataset
|
||||
from ...utils import get_logger
|
||||
|
||||
logger = get_logger("paddlevideo")
|
||||
|
||||
|
||||
@DATASETS.register()
class VideoDataset(BaseDataset):
    """Video dataset for action recognition.

    The dataset loads raw videos and applies the specified transforms to them.
    The index file is a text file with one sample per line; each line holds
    the video file path and its integer label, separated by whitespace.

    Example of an index file:

    .. code-block:: txt

        path/000.mp4 1
        path/001.mp4 1
        path/002.mp4 2
        path/003.mp4 2

    Args:
        file_path (str): Path to the index file.
        pipeline (XXX): A sequence of data transforms.
        num_retries (int): Maximum number of load attempts per requested
            sample before giving up. Default: 5.
        suffix (str): Text appended to every file name read from the index
            file (e.g. a file extension). Default: ''.
        **kwargs: Keyword arguments for ``BaseDataset``.
    """

    def __init__(self, file_path, pipeline, num_retries=5, suffix='', **kwargs):
        # Attributes are set before the base constructor runs so they already
        # exist for anything the base class triggers during initialisation.
        self.num_retries = num_retries
        self.suffix = suffix
        super().__init__(file_path, pipeline, **kwargs)

    def load_file(self):
        """Load index file to get video information.

        Returns:
            list[dict]: One ``{'filename': str, 'labels': int}`` record per
            non-blank line of the index file.
        """
        info = []
        with open(self.file_path, 'r') as fin:
            for line in fin:
                line_split = line.strip().split()
                if not line_split:
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue
                filename, labels = line_split
                #TODO(hj): Required suffix format: may mp4/avi/wmv
                filename = filename + self.suffix
                if self.data_prefix is not None:
                    filename = osp.join(self.data_prefix, filename)
                info.append(dict(filename=filename, labels=int(labels)))
        return info

    def _load_with_retries(self, idx):
        """Run the pipeline on sample ``idx``, retrying on failure.

        When the pipeline raises (e.g. on a corrupted video file), a random
        replacement sample is drawn and the load is attempted again, up to
        ``self.num_retries`` attempts in total.

        Raises:
            RuntimeError: If every attempt failed; the last pipeline error is
                chained as the cause.
        """
        last_error = None
        for ir in range(self.num_retries):
            # Read the name up front so the log message below never depends
            # on a variable that the failing code was supposed to assign.
            filename = self.info[idx]['filename']
            try:
                results = self.pipeline(copy.deepcopy(self.info[idx]))
            except Exception as e:
                last_error = e
                if ir < self.num_retries - 1:
                    logger.info(
                        "Error when loading {}, have {} trys, will try again".
                        format(filename, ir))
                idx = random.randint(0, len(self.info) - 1)
                continue
            return results['imgs'], np.array([results['labels']])

        # Surface the failure explicitly: silently returning None would only
        # fail later, far away from the real cause.
        raise RuntimeError(
            'Failed to load a valid sample after {} attempts'.format(
                self.num_retries)) from last_error

    def prepare_train(self, idx):
        """TRAIN & VALID. Prepare the data for training/valid given the index.

        Returns:
            tuple: ``(imgs, labels)`` with ``labels`` wrapped in a length-1
            numpy array.
        """
        return self._load_with_retries(idx)

    def prepare_test(self, idx):
        """TEST. Prepare the data for test given the index.

        Returns:
            tuple: ``(imgs, labels)`` with ``labels`` wrapped in a length-1
            numpy array.
        """
        return self._load_with_retries(idx)
|
@ -0,0 +1,56 @@
|
||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from .anet_pipeline import GetMatchMap, GetVideoLabel, LoadFeat
|
||||
from .augmentations import (CenterCrop, ColorJitter, GroupRandomFlip,
|
||||
GroupResize, Image2Array, JitterScale, MultiCrop,
|
||||
Normalization, PackOutput, RandomCrop, RandomFlip,
|
||||
RandomResizedCrop, Scale, TenCrop, ToArray,
|
||||
UniformCrop, RandomGamma, MultiCenterCrop,
|
||||
RandomBrightness, RandomHue, RandomSaturation, YowoAug)
|
||||
from .augmentations_ava import *
|
||||
from .compose import Compose
|
||||
from .decode import FeatureDecoder, FrameDecoder, VideoDecoder, ActionFeatureDecoder
|
||||
from .decode_image import ImageDecoder
|
||||
from .decode_sampler import DecodeSampler
|
||||
from .mix import Cutmix, Mixup, VideoMix
|
||||
from .multimodal import FeaturePadding, RandomCap, RandomMask, Tokenize
|
||||
from .sample import Sampler, SamplerPkl
|
||||
from .sample_ava import *
|
||||
from .segmentation import MultiNorm, MultiRestrictSize
|
||||
from .skeleton_pipeline import AutoPadding, Iden, SkeletonNorm
|
||||
from .skeleton_pipeline import SketeonCropSample, SketeonModalityTransform, RandomRotation
|
||||
from .skeleton_pipeline import (UniformSampleFrames, PoseDecode, PoseCompact,
|
||||
RandomResizedCrop_V2, Flip_V2, CenterCrop_V2,
|
||||
GeneratePoseTarget, FormatShape, Collect)
|
||||
from .decode_sampler_MRI import SFMRI_DecodeSampler
|
||||
from .segmentation_pipline import SegmentationSampler
|
||||
from .sample_ucf24 import SamplerUCF24
|
||||
|
||||
# Public API of the pipelines package. Every name appears exactly once, and
# every explicitly imported transform is exported.
__all__ = [
    'ImageDecoder', 'RandomMask', 'UniformCrop', 'SkeletonNorm', 'Tokenize',
    'Sampler', 'FeatureDecoder', 'DecodeSampler', 'TenCrop', 'Compose',
    'AutoPadding', 'Normalization', 'Mixup', 'Image2Array', 'Scale',
    'GroupResize', 'VideoDecoder', 'FrameDecoder', 'PackOutput',
    'ActionFeatureDecoder', 'GetVideoLabel', 'Cutmix', 'CenterCrop',
    'RandomCrop', 'LoadFeat', 'RandomCap', 'JitterScale', 'Iden', 'VideoMix',
    'ColorJitter', 'RandomFlip', 'ToArray', 'FeaturePadding', 'GetMatchMap',
    'GroupRandomFlip', 'MultiCrop', 'SFMRI_DecodeSampler', 'MultiRestrictSize',
    'MultiNorm', 'RandomResizedCrop', 'SamplerPkl', 'SegmentationSampler',
    'SketeonCropSample', 'SketeonModalityTransform', 'RandomRotation',
    'RandomGamma', 'MultiCenterCrop', 'RandomBrightness', 'RandomHue',
    'RandomSaturation', 'UniformSampleFrames', 'PoseDecode', 'PoseCompact',
    'Resize', 'RandomResizedCrop_V2', 'Flip_V2', 'CenterCrop_V2',
    'GeneratePoseTarget', 'FormatShape', 'Collect', 'SamplerUCF24', 'YowoAug'
]
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,150 @@
|
||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
import numpy as np
|
||||
from ..registry import PIPELINES
|
||||
"""pipeline ops for Activity Net.
|
||||
"""
|
||||
|
||||
|
||||
@PIPELINES.register()
class LoadFeat(object):
    """Load a pre-extracted video feature stored as a ``.npy`` file.

    Reads key "video_name" from results and adds key "video_feat", the
    transposed feature array cast to float32.

    Args:
        feat_path (str): Directory that contains ``<video_name>.npy`` files.
    """

    def __init__(self, feat_path):
        self.feat_path = feat_path

    def __call__(self, results):
        """Attach the feature of ``results['video_name']`` to ``results``."""
        npy_path = os.path.join(self.feat_path,
                                results['video_name'] + ".npy")
        #TODO: check path
        results['video_feat'] = np.load(npy_path).T.astype("float32")
        return results
|
||||
|
||||
|
||||
@PIPELINES.register()
class GetMatchMap(object):
    """Generate the table of candidate temporal windows and the anchors.

    Added keys in results are "match_map", "anchor_xmin" and "anchor_xmax".

    Args:
        tscale (int): Number of temporal positions; the step between two
            positions is ``1 / tscale``.
    """

    def __init__(self, tscale):
        self.tscale = tscale
        self.tgap = 1. / self.tscale

    def __call__(self, results):
        """Fill ``results`` with the window table and the anchor boundaries."""
        # windows[start][duration - 1] == [xmin, xmax] of the window that
        # begins at position ``start`` and spans ``duration`` steps.
        windows = []
        for start in range(self.tscale):
            xmin = self.tgap * start
            windows.append([[xmin, xmin + self.tgap * duration]
                            for duration in range(1, self.tscale + 1)])

        # Re-order to (duration, start, 2), then flatten to a list of windows.
        match_map = np.array(windows).transpose(1, 0, 2).reshape(-1, 2)

        results['match_map'] = match_map
        results['anchor_xmin'] = [
            self.tgap * pos for pos in range(self.tscale)
        ]
        results['anchor_xmax'] = [
            self.tgap * pos for pos in range(1, self.tscale + 1)
        ]
        return results
|
||||
|
||||
|
||||
@PIPELINES.register()
class GetVideoLabel(object):
    """Build the ground-truth IoU map and start/end scores of a video.

    Required keys in results are "video_info", "match_map", "anchor_xmin"
    and "anchor_xmax"; added keys are "gt_iou_map", "gt_start" and "gt_end".

    Args:
        tscale (int): Number of temporal positions.
        dscale (int): Number of duration steps; used as the first dimension
            when the per-annotation IoU vector is reshaped.
        datatype (str): dtype of the produced arrays. Default: "float32".
    """

    def __init__(self, tscale, dscale, datatype="float32"):
        self.tscale = tscale
        self.dscale = dscale
        self.tgap = 1. / self.tscale
        self.datatype = datatype

    def iou_with_anchors(self, anchors_min, anchors_max, box_min, box_max):
        """Compute jaccard score between a box and the anchors.
        """
        overlap = np.maximum(
            np.minimum(anchors_max, box_max) -
            np.maximum(anchors_min, box_min), 0.)
        union = (anchors_max - anchors_min) - overlap + box_max - box_min
        return np.divide(overlap, union)

    def ioa_with_anchors(self, anchors_min, anchors_max, box_min, box_max):
        """Compute the intersection of boxes with an anchor, relative to the
        anchor length.
        """
        anchor_len = anchors_max - anchors_min
        overlap = np.maximum(
            np.minimum(anchors_max, box_max) -
            np.maximum(anchors_min, box_min), 0.)
        return np.divide(overlap, anchor_len)

    def _best_anchor_scores(self, anchor_xmin, anchor_xmax, box_lo, box_hi):
        """For every anchor, return the best overlap with any boundary box."""
        return [
            np.max(
                self.ioa_with_anchors(anchor_xmin[pos], anchor_xmax[pos],
                                      box_lo, box_hi))
            for pos in range(len(anchor_xmin))
        ]

    def __call__(self, results):
        """Compute the ground-truth targets and store them in ``results``."""
        video_info = results['video_info']
        match_map = results['match_map']
        anchor_xmin = results['anchor_xmin']
        anchor_xmax = results['anchor_xmax']

        duration = video_info['duration_second']
        annotations = video_info['annotations']

        segments = []
        iou_maps = []
        for ann in annotations:
            # Normalise the segment by the video duration, clamped to [0, 1].
            seg_start = max(min(1, ann["segment"][0] / duration), 0)
            seg_end = max(min(1, ann["segment"][1] / duration), 0)
            segments.append([seg_start, seg_end])
            ious = self.iou_with_anchors(match_map[:, 0], match_map[:, 1],
                                         seg_start, seg_end)
            iou_maps.append(np.reshape(ious, [self.dscale, self.tscale]))
        # Per cell, keep the best IoU over all annotations.
        gt_iou_map = np.array(iou_maps).max(axis=0)

        segments = np.array(segments)
        seg_starts = segments[:, 0]
        seg_ends = segments[:, 1]

        # Boundary regions: a window of 3 temporal steps centred on each
        # annotated start / end point.
        half_width = 3 * self.tgap / 2
        gt_start = np.array(
            self._best_anchor_scores(anchor_xmin, anchor_xmax,
                                     seg_starts - half_width,
                                     seg_starts + half_width))
        gt_end = np.array(
            self._best_anchor_scores(anchor_xmin, anchor_xmax,
                                     seg_ends - half_width,
                                     seg_ends + half_width))

        results['gt_iou_map'] = gt_iou_map.astype(self.datatype)
        results['gt_start'] = gt_start.astype(self.datatype)
        results['gt_end'] = gt_end.astype(self.datatype)
        return results
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,749 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import random
|
||||
import numpy as np
|
||||
import math
|
||||
from PIL import Image
|
||||
from ..registry import PIPELINES
|
||||
from collections.abc import Sequence
|
||||
import cv2
|
||||
|
||||
# Interpolation name -> Pillow resampling constant (the constant is the
# upper-cased name on the ``Image`` module).
pillow_interp_codes = {
    interp_name: getattr(Image, interp_name.upper())
    for interp_name in ('nearest', 'bilinear', 'bicubic', 'box', 'lanczos',
                        'hamming')
}

# Interpolation name -> OpenCV interpolation flag.
cv2_interp_codes = dict(
    nearest=cv2.INTER_NEAREST,
    bilinear=cv2.INTER_LINEAR,
    bicubic=cv2.INTER_CUBIC,
    area=cv2.INTER_AREA,
    lanczos=cv2.INTER_LANCZOS4,
)
|
||||
|
||||
|
||||
def _init_lazy_if_proper(results, lazy):
    """Prepare ``results`` for a lazy or an eager operation.

    Ensures "img_shape" exists (taken from the first entry of "imgs" when
    missing). In lazy mode the "lazy" bookkeeping dict is created on first
    use; in eager mode the function asserts that no lazy state is pending.

    Keys written into the "lazy" dict are "original_shape", "crop_bbox",
    "flip", "flip_direction" and "interpolation".

    Args:
        results (dict): A dict stores data pipeline result.
        lazy (bool): Determine whether to apply lazy operation.
    """
    if 'img_shape' not in results:
        results['img_shape'] = results['imgs'][0].shape[:2]

    if not lazy:
        # An eager op must not run while lazy state is still pending.
        assert 'lazy' not in results, 'Use Fuse after lazy operations'
        return

    if 'lazy' in results:
        return

    height, width = results['img_shape']
    results['lazy'] = {
        'original_shape': results['img_shape'],
        'crop_bbox': np.array([0, 0, width, height], dtype=np.float32),
        'flip': False,
        'flip_direction': None,
        'interpolation': None,
    }
|
||||
|
||||
|
||||
def _scale_size(size, scale):
    """Multiply a (w, h) size by a ratio, rounding half up.

    Args:
        size (tuple[int]): (w, h).
        scale (float): Scaling factor.

    Returns:
        tuple[int]: scaled size.
    """
    width, height = size
    factor = float(scale)
    # Adding 0.5 before truncation rounds to the nearest integer.
    return tuple(int(edge * factor + 0.5) for edge in (width, height))
|
||||
|
||||
|
||||
def rescale_size(old_size, scale, return_scale=False):
    """Work out the size an image should be rescaled to.

    Args:
        old_size (tuple[int]): The old size (w, h) of image.
        scale (float | tuple[int]): The scaling factor or maximum size.
            A number is used directly as the factor; a tuple of two values
            bounds the long and the short edge, and the largest factor that
            respects both bounds is used.
        return_scale (bool): Whether to return the scaling factor besides the
            rescaled image size.

    Returns:
        tuple[int]: The new rescaled image size, or ``(new_size,
        scale_factor)`` when ``return_scale`` is set.

    Raises:
        ValueError: If a numeric ``scale`` is not positive.
        TypeError: If ``scale`` is neither a number nor a tuple.
    """
    w, h = old_size

    if isinstance(scale, tuple):
        long_bound, short_bound = max(scale), min(scale)
        scale_factor = min(long_bound / max(h, w), short_bound / min(h, w))
    elif isinstance(scale, (float, int)):
        if scale <= 0:
            raise ValueError(f'Invalid scale {scale}, must be positive.')
        scale_factor = scale
    else:
        raise TypeError(
            f'Scale must be a number or tuple of int, but got {type(scale)}')

    new_size = _scale_size((w, h), scale_factor)
    return (new_size, scale_factor) if return_scale else new_size
|
||||
|
||||
|
||||
def imresize(img,
             size,
             return_scale=False,
             interpolation='bilinear',
             out=None,
             backend=None):
    """Resize image to a given size.

    Args:
        img (ndarray): The input image.
        size (tuple[int]): Target size (w, h).
        return_scale (bool): Whether to also return ``w_scale`` and
            ``h_scale``.
        interpolation (str): Key into the interpolation table of the chosen
            backend.
        out (ndarray): Output destination, forwarded to ``cv2.resize`` only.
        backend (str | None): 'cv2' or 'pillow'; None selects 'cv2'.

    Returns:
        ndarray | tuple: The resized image, or ``(resized_img, w_scale,
        h_scale)`` when ``return_scale`` is set.

    Raises:
        ValueError: If ``backend`` is not supported.
    """
    src_h, src_w = img.shape[:2]
    backend = 'cv2' if backend is None else backend
    if backend not in ['cv2', 'pillow']:
        raise ValueError(f'backend: {backend} is not supported for resize.'
                         f"Supported backends are 'cv2', 'pillow'")

    if backend == 'cv2':
        resized_img = cv2.resize(
            img, size, dst=out, interpolation=cv2_interp_codes[interpolation])
    else:
        assert img.dtype == np.uint8, 'Pillow backend only support uint8 type'
        resampled = Image.fromarray(img).resize(
            size, pillow_interp_codes[interpolation])
        resized_img = np.array(resampled)

    if return_scale:
        return resized_img, size[0] / src_w, size[1] / src_h
    return resized_img
|
||||
|
||||
|
||||
@PIPELINES.register()
class EntityBoxRescale:
    """Rescale the entity boxes and proposals by a (w, h) scale factor.

    "gt_bboxes" is rescaled when present in results; "proposals" is rescaled
    when present and not None. Both are multiplied by the scale factor
    repeated twice, i.e. one factor per box coordinate.

    Args:
        scale_factor (np.ndarray): The scale factor used for entity_box
            rescaling.
    """

    def __init__(self, scale_factor):
        self.scale_factor = scale_factor

    def __call__(self, results):
        """Scale the boxes stored in ``results`` and return ``results``."""
        # One factor per coordinate: the 2-element factor repeated twice.
        box_scale = np.concatenate([self.scale_factor, self.scale_factor])

        if 'gt_bboxes' in results:
            results['gt_bboxes'] = results['gt_bboxes'] * box_scale

        proposals = results.get('proposals')
        if proposals is not None:
            assert proposals.shape[1] == 4, (
                'proposals shape should be in '
                f'(n, 4), but got {proposals.shape}')
            results['proposals'] = proposals * box_scale

        return results

    def __repr__(self):
        return f'{self.__class__.__name__}(scale_factor={self.scale_factor})'
|
||||
|
||||
|
||||
@PIPELINES.register()
class EntityBoxCrop:
    """Move entity boxes and proposals into the frame of a cropped image.

    Required keys are "proposals", "gt_bboxes"; "gt_bboxes" is replaced and,
    when "proposals" is not None, so is "proposals". Coordinates are shifted
    by the crop origin and clipped to the crop extent.

    Args:
        crop_bbox(np.ndarray | None): The bbox used to crop the original image.
    """

    def __init__(self, crop_bbox):
        self.crop_bbox = crop_bbox

    @staticmethod
    def _to_crop_frame(boxes, x1, y1, crop_w, crop_h):
        """Shift ``boxes`` by the crop origin and clip them to the crop."""
        assert boxes.shape[-1] == 4
        moved = boxes.copy()
        moved[..., 0::2] = np.clip(boxes[..., 0::2] - x1, 0, crop_w - 1)
        moved[..., 1::2] = np.clip(boxes[..., 1::2] - y1, 0, crop_h - 1)
        return moved

    def __call__(self, results):
        """Crop the boxes stored in ``results`` and return ``results``."""
        proposals = results['proposals']
        gt_bboxes = results['gt_bboxes']

        if self.crop_bbox is None:
            return results

        x1, y1, x2, y2 = self.crop_bbox
        crop_w, crop_h = x2 - x1, y2 - y1

        results['gt_bboxes'] = self._to_crop_frame(gt_bboxes, x1, y1, crop_w,
                                                   crop_h)
        if proposals is not None:
            results['proposals'] = self._to_crop_frame(proposals, x1, y1,
                                                       crop_w, crop_h)
        return results

    def __repr__(self):
        return f'{self.__class__.__name__}(crop_bbox={self.crop_bbox})'
|
||||
|
||||
|
||||
@PIPELINES.register()
class EntityBoxFlip:
    """Mirror the entity boxes and proposals horizontally.

    The x coordinates of every box are reflected about the image width and
    swapped, so the shape is preserved. Only the horizontal flip is
    supported. Required keys are "proposals", "gt_bboxes"; both keys are
    written back ("proposals" stays None when it was None).

    Args:
        img_shape (tuple[int]): The img shape, (h, w).
    """

    def __init__(self, img_shape):
        self.img_shape = img_shape

    @staticmethod
    def _mirror_x(boxes, img_w):
        """Return a copy of ``boxes`` with x1/x2 reflected and swapped."""
        assert boxes.shape[-1] == 4
        mirrored = boxes.copy()
        mirrored[..., 0::4] = img_w - boxes[..., 2::4] - 1
        mirrored[..., 2::4] = img_w - boxes[..., 0::4] - 1
        return mirrored

    def __call__(self, results):
        """Flip the boxes stored in ``results`` and return ``results``."""
        proposals = results['proposals']
        gt_bboxes = results['gt_bboxes']
        img_h, img_w = self.img_shape

        flipped_gt = self._mirror_x(gt_bboxes, img_w)
        flipped_proposals = (None if proposals is None else self._mirror_x(
            proposals, img_w))

        results['proposals'] = flipped_proposals
        results['gt_bboxes'] = flipped_gt
        return results

    def __repr__(self):
        return f'{self.__class__.__name__}(img_shape={self.img_shape})'
|
||||
|
||||
|
||||
@PIPELINES.register()
class Resize:
    """Resize images to a specific size.

    Required keys are "imgs", "img_shape", "modality", added or modified
    keys are "imgs", "img_shape", "keep_ratio", "scale_factor", "lazy",
    "resize_size". Required keys in "lazy" is None, added or modified key is
    "interpolation".

    Args:
        scale (float | Tuple[int] | str): If keep_ratio is True, it serves as
            scaling factor or maximum size:
            If it is a float number, the image will be rescaled by this
            factor, else if it is a tuple of 2 integers, the image will
            be rescaled as large as possible within the scale.
            Otherwise, it serves as (w, h) of output size.
            A string is evaluated to obtain one of the above.
        keep_ratio (bool): If set to True, Images will be resized without
            changing the aspect ratio. Otherwise, it will resize images to a
            given size. Default: True.
        interpolation (str): Algorithm used for interpolation:
            "nearest" | "bilinear". Default: "bilinear".
        lazy (bool): Determine whether to apply lazy operation. Default: False.

    Raises:
        ValueError: If a float ``scale`` is not positive.
        TypeError: If ``scale`` is neither a float nor a tuple.
    """

    def __init__(self,
                 scale,
                 keep_ratio=True,
                 interpolation='bilinear',
                 lazy=False):
        if isinstance(scale, str):
            # NOTE(review): eval() executes arbitrary code. This is only safe
            # while the string comes from a trusted config file; consider
            # ast.literal_eval if the accepted syntax allows it.
            scale = eval(scale)
        if isinstance(scale, float):
            if scale <= 0:
                raise ValueError(f'Invalid scale {scale}, must be positive.')
        elif isinstance(scale, tuple):
            max_long_edge = max(scale)
            max_short_edge = min(scale)
            if max_short_edge == -1:
                # assign np.inf to long edge for rescaling short edge later.
                scale = (np.inf, max_long_edge)
        else:
            raise TypeError(
                f'Scale must be float or tuple of int, but got {type(scale)}')
        self.scale = scale
        self.keep_ratio = keep_ratio
        self.interpolation = interpolation
        self.lazy = lazy

    def __call__(self, results):
        """Performs the Resize augmentation.

        Args:
            results (dict): The resulting dict to be modified and passed
                to the next transform in pipeline.
        """

        _init_lazy_if_proper(results, self.lazy)

        if 'scale_factor' not in results:
            results['scale_factor'] = np.array([1, 1], dtype=np.float32)
        img_h, img_w = results['img_shape']

        if self.keep_ratio:
            new_w, new_h = rescale_size((img_w, img_h), self.scale)
        else:
            new_w, new_h = self.scale

        # Kept on the instance because it is also handed to EntityBoxRescale.
        self.scale_factor = np.array([new_w / img_w, new_h / img_h],
                                     dtype=np.float32)
        results['img_shape'] = (new_h, new_w)
        results['keep_ratio'] = self.keep_ratio
        # Accumulate with the factors applied by earlier resize steps.
        results['scale_factor'] = results['scale_factor'] * self.scale_factor

        if not self.lazy:
            if 'imgs' in results:
                results['imgs'] = [
                    imresize(
                        img, (new_w, new_h), interpolation=self.interpolation)
                    for img in results['imgs']
                ]
            if 'keypoint' in results:
                results['keypoint'] = results['keypoint'] * self.scale_factor
        else:
            lazyop = results['lazy']
            if lazyop['flip']:
                raise NotImplementedError('Put Flip at last for now')
            lazyop['interpolation'] = self.interpolation

        # Entity boxes cannot be rescaled lazily. Only enforce that when there
        # actually are boxes to rescale, so that lazy resizing of plain image
        # samples keeps working.
        has_boxes = ('gt_bboxes' in results
                     or results.get('proposals') is not None)
        if has_boxes:
            assert not self.lazy
            entity_box_rescale = EntityBoxRescale(self.scale_factor)
            results = entity_box_rescale(results)

        return results

    def __repr__(self):
        repr_str = (f'{self.__class__.__name__}('
                    f'scale={self.scale}, keep_ratio={self.keep_ratio}, '
                    f'interpolation={self.interpolation}, '
                    f'lazy={self.lazy})')
        return repr_str
|
||||
|
||||
|
||||
@PIPELINES.register()
class RandomRescale:
    """Resize images so that the short edge gets a random length drawn from a
    closed range. The aspect ratio is kept.

    Added or modified keys are those written by ``Resize`` plus "short_edge".

    Args:
        scale_range (str): Expression that evaluates to the (min, max) range
            of the short edge length; both bounds are included.
        interpolation (str): Algorithm used for interpolation.
            Default: "bilinear".
    """

    def __init__(self, scale_range, interpolation='bilinear'):
        # NOTE(review): eval() executes arbitrary code; only safe while the
        # string comes from a trusted config file.
        self.scale_range = eval(scale_range)

        assert len(self.scale_range) == 2
        assert self.scale_range[0] < self.scale_range[1]
        assert np.all([bound > 0 for bound in self.scale_range])

        self.keep_ratio = True
        self.interpolation = interpolation

    def __call__(self, results):
        """Performs the Resize augmentation.

        Args:
            results (dict): The resulting dict to be modified and passed
                to the next transform in pipeline.
        """
        low, high = self.scale_range[0], self.scale_range[1]
        # randint excludes the upper bound, hence the +1.
        short_edge = np.random.randint(low, high + 1)

        # (-1, n) asks Resize to fit the short edge to n.
        resizer = Resize((-1, short_edge),
                         keep_ratio=True,
                         interpolation=self.interpolation,
                         lazy=False)
        results = resizer(results)
        results['short_edge'] = short_edge
        return results

    def __repr__(self):
        low, high = self.scale_range[0], self.scale_range[1]
        return (f'{self.__class__.__name__}('
                f'scale_range=({low}, {high}), '
                f'interpolation={self.interpolation})')
|
||||
|
||||
|
||||
@PIPELINES.register()
class Rescale:
    """Resize images with the aspect ratio kept, using a fixed scale.

    Required keys are "imgs", "img_shape", "modality", added or modified
    keys are "imgs", "img_shape", "keep_ratio", "scale_factor", "resize_size",
    "short_edge".

    Args:
        scale_range (str): Expression that evaluates to the scale handed to
            ``Resize``.
        interpolation (str): Algorithm used for interpolation:
            "nearest" | "bilinear". Default: "bilinear".
    """

    def __init__(self, scale_range, interpolation='bilinear'):
        # NOTE(review): eval() executes arbitrary code; only safe while the
        # string comes from a trusted config file.
        self.scale_range = eval(scale_range)
        self.keep_ratio = True
        self.interpolation = interpolation

    def __call__(self, results):
        """Performs the Resize augmentation.

        Args:
            results (dict): The resulting dict to be modified and passed
                to the next transform in pipeline.
        """
        resizer = Resize(
            self.scale_range,
            keep_ratio=True,
            interpolation=self.interpolation,
            lazy=False)
        return resizer(results)

    def __repr__(self):
        low, high = self.scale_range[0], self.scale_range[1]
        return (f'{self.__class__.__name__}('
                f'scale_range=({low}, {high}), '
                f'interpolation={self.interpolation})')
|
||||
|
||||
|
||||
@PIPELINES.register()
|
||||
class RandomCrop_v2:
|
||||
"""Vanilla square random crop that specifics the output size.
|
||||
|
||||
Required keys in results are "imgs" and "img_shape", added or
|
||||
modified keys are "imgs", "lazy"; Required keys in "lazy" are "flip",
|
||||
"crop_bbox", added or modified key is "crop_bbox".
|
||||
|
||||
Args:
|
||||
size (int): The output size of the images.
|
||||
lazy (bool): Determine whether to apply lazy operation. Default: False.
|
||||
"""
|
||||
|
||||
def __init__(self, size, lazy=False):
|
||||
if not isinstance(size, int):
|
||||
raise TypeError(f'Size must be an int, but got {type(size)}')
|
||||
self.size = size
|
||||
self.lazy = lazy
|
||||
|
||||
def __call__(self, results):
|
||||
"""Performs the RandomCrop augmentation.
|
||||
|
||||
Args:
|
||||
results (dict): The resulting dict to be modified and passed
|
||||
to the next transform in pipeline.
|
||||
"""
|
||||
_init_lazy_if_proper(results, self.lazy)
|
||||
|
||||
img_h, img_w = results['img_shape']
|
||||
assert self.size <= img_h and self.size <= img_w
|
||||
|
||||
y_offset = 0
|
||||
x_offset = 0
|
||||
if img_h > self.size:
|
||||
y_offset = int(np.random.randint(0, img_h - self.size))
|
||||
if img_w > self.size:
|
||||
x_offset = int(np.random.randint(0, img_w - self.size))
|
||||
if 'crop_quadruple' not in results:
|
||||
results['crop_quadruple'] = np.array(
|
||||
[0, 0, 1, 1], # x, y, w, h
|
||||
dtype=np.float32)
|
||||
|
||||
x_ratio, y_ratio = x_offset / img_w, y_offset / img_h
|
||||
w_ratio, h_ratio = self.size / img_w, self.size / img_h
|
||||
|
||||
old_crop_quadruple = results['crop_quadruple']
|
||||
old_x_ratio, old_y_ratio = old_crop_quadruple[0], old_crop_quadruple[1]
|
||||
old_w_ratio, old_h_ratio = old_crop_quadruple[2], old_crop_quadruple[3]
|
||||
new_crop_quadruple = [
|
||||
old_x_ratio + x_ratio * old_w_ratio,
|
||||
old_y_ratio + y_ratio * old_h_ratio, w_ratio * old_w_ratio,
|
||||
h_ratio * old_x_ratio
|
||||
]
|
||||
results['crop_quadruple'] = np.array(
|
||||
new_crop_quadruple, dtype=np.float32)
|
||||
|
||||
new_h, new_w = self.size, self.size
|
||||
|
||||
results['crop_bbox'] = np.array(
|
||||
[x_offset, y_offset, x_offset + new_w, y_offset + new_h])
|
||||
results['img_shape'] = (new_h, new_w)
|
||||
|
||||
if not self.lazy:
|
||||
results['imgs'] = [
|
||||
img[y_offset:y_offset + new_h, x_offset:x_offset + new_w]
|
||||
for img in results['imgs']
|
||||
]
|
||||
else:
|
||||
lazyop = results['lazy']
|
||||
if lazyop['flip']:
|
||||
raise NotImplementedError('Put Flip at last for now')
|
||||
|
||||
# record crop_bbox in lazyop dict to ensure only crop once in Fuse
|
||||
lazy_left, lazy_top, lazy_right, lazy_bottom = lazyop['crop_bbox']
|
||||
left = x_offset * (lazy_right - lazy_left) / img_w
|
||||
right = (x_offset + new_w) * (lazy_right - lazy_left) / img_w
|
||||
top = y_offset * (lazy_bottom - lazy_top) / img_h
|
||||
bottom = (y_offset + new_h) * (lazy_bottom - lazy_top) / img_h
|
||||
lazyop['crop_bbox'] = np.array(
|
||||
[(lazy_left + left), (lazy_top + top), (lazy_left + right),
|
||||
(lazy_top + bottom)],
|
||||
dtype=np.float32)
|
||||
|
||||
# Process entity boxes
|
||||
if 'gt_bboxes' in results:
|
||||
assert not self.lazy
|
||||
entity_box_crop = EntityBoxCrop(results['crop_bbox'])
|
||||
results = entity_box_crop(results)
|
||||
|
||||
return results
|
||||
|
||||
def __repr__(self):
    """Return ``ClassName(size=..., lazy=...)`` built from the instance.

    Returns:
        str: The class name followed by the ``size`` and ``lazy``
            attributes of this object.
    """
    cls_name = self.__class__.__name__
    return f'{cls_name}(size={self.size}, lazy={self.lazy})'
|
||||
|
||||
|
||||
def imflip_(img, direction='horizontal'):
    """Flip an image in place along the requested direction.

    Args:
        img (ndarray): Image to be flipped. It is also handed to
            ``cv2.flip`` as the destination buffer.
        direction (str): The flip direction, one of "horizontal",
            "vertical" or "diagonal".

    Returns:
        ndarray: The value returned by ``cv2.flip`` (the flipped image).

    Raises:
        AssertionError: If ``direction`` is not one of the three options.
    """
    assert direction in ['horizontal', 'vertical', 'diagonal']
    # Flip codes handed to cv2.flip: 1 for "horizontal", 0 for "vertical";
    # every other direction uses -1, as in the original if/elif/else chain.
    flip_codes = {'horizontal': 1, 'vertical': 0}
    return cv2.flip(img, flip_codes.get(direction, -1), img)
|
||||
|
||||
|
||||
def iminvert(img):
    """Negate an image by subtracting it from a constant 255 image.

    Args:
        img (ndarray): Image to be inverted.

    Returns:
        ndarray: ``255 - img`` element-wise, where the constant array is
            created with ``np.full_like`` and therefore shares the shape
            and dtype of ``img``. The input is not modified.
    """
    white = np.full_like(img, 255)
    return white - img
|
||||
|
||||
|
||||
@PIPELINES.register()
class Flip:
    """Flip the input images with a probability.

    Each call draws one random number; when it is below ``flip_ratio`` every
    image in ``results['imgs']`` is flipped in place with ``imflip_``. In lazy
    mode no image is touched here: the decision is only recorded in
    ``results['lazy']``.

    Keys read: "imgs" (non-lazy mode), "lazy" (lazy mode) and, when
    "gt_bboxes" is present and a flip happens, "img_shape".
    Keys written: "flip", "flip_direction", "imgs" (non-lazy mode without a
    flip), and "flip" / "flip_direction" inside "lazy" (lazy mode).

    The Flip augmentation should be placed after any cropping / reshaping
    augmentations, to make sure crop_quadruple is calculated properly.

    Args:
        flip_ratio (float): Probability of implementing flip. Default: 0.5.
        direction (str): Flip imgs horizontally or vertically. Options are
            "horizontal" | "vertical". Default: "horizontal".
        lazy (bool): Determine whether to apply lazy operation. Default: False.

    Raises:
        ValueError: If ``direction`` is not a supported direction.
    """
    _directions = ['horizontal', 'vertical']

    def __init__(self, flip_ratio=0.5, direction='horizontal', lazy=False):
        if direction not in self._directions:
            raise ValueError(f'Direction {direction} is not supported. '
                             f'Currently support ones are {self._directions}')
        self.flip_ratio = flip_ratio
        self.direction = direction
        self.lazy = lazy

    def __call__(self, results):
        """Performs the Flip augmentation.

        Args:
            results (dict): The resulting dict to be modified and passed
                to the next transform in pipeline.

        Returns:
            dict: The updated ``results``.

        Raises:
            NotImplementedError: In lazy mode, if a flip was already
                recorded in ``results['lazy']``.
        """
        _init_lazy_if_proper(results, self.lazy)
        flip = np.random.rand() < self.flip_ratio

        results['flip'] = flip
        results['flip_direction'] = self.direction

        if not self.lazy:
            if flip:
                # imflip_ writes into each image buffer, so the container
                # holding the images does not need to be rebuilt.
                for img in results['imgs']:
                    imflip_(img, self.direction)
            else:
                results['imgs'] = list(results['imgs'])
        else:
            lazyop = results['lazy']
            if lazyop['flip']:
                raise NotImplementedError('Use one Flip please')
            lazyop['flip'] = flip
            lazyop['flip_direction'] = self.direction

        if 'gt_bboxes' in results and flip:
            # Box flipping is only wired up for eager, horizontal flips.
            assert not self.lazy and self.direction == 'horizontal'
            entity_box_flip = EntityBoxFlip(results['img_shape'])
            results = entity_box_flip(results)

        return results

    def __repr__(self):
        repr_str = (
            f'{self.__class__.__name__}('
            f'flip_ratio={self.flip_ratio}, direction={self.direction}, '
            f'lazy={self.lazy})')
        return repr_str
|
||||
|
||||
|
||||
def imnormalize_(img, mean, std, to_rgb=True):
    """Normalize an image in place: optional BGR->RGB, subtract mean, scale.

    Args:
        img (ndarray): Image to be normalized. Must not be uint8; it is
            overwritten with the result.
        mean (ndarray): The mean to be used for normalize.
        std (ndarray): The std to be used for normalize.
        to_rgb (bool): Whether to run ``cv2.COLOR_BGR2RGB`` on the image
            before normalizing.

    Returns:
        ndarray: The same ``img`` object after normalization.

    Raises:
        AssertionError: If ``img`` has dtype uint8.
    """
    # cv2 inplace normalization does not accept uint8
    assert img.dtype != np.uint8
    mean_row = np.float64(mean.reshape(1, -1))
    # Multiply by the reciprocal instead of dividing by std.
    inv_std_row = 1 / np.float64(std.reshape(1, -1))
    if to_rgb:
        cv2.cvtColor(img, cv2.COLOR_BGR2RGB, dst=img)
    cv2.subtract(img, mean_row, dst=img)
    cv2.multiply(img, inv_std_row, dst=img)
    return img
|
||||
|
||||
|
||||
@PIPELINES.register()
class Normalize:
    """Normalize images with the given mean and std value.

    ``__call__`` copies every image of ``results['imgs']`` into one float32
    array of shape (n, h, w, c), normalizes each image in place with
    ``imnormalize_`` and stores the array back under "imgs". It also writes
    "img_norm_cfg".

    Args:
        mean (Sequence[float] | np.ndarray): Mean values of different
            channels.
        std (Sequence[float] | np.ndarray): Std values of different channels.
        to_bgr (bool): Forwarded to ``imnormalize_`` as its channel-swap
            flag. Default: False.
        adjust_magnitude (bool): Stored and reported by ``__repr__``; the
            ``__call__`` in this class does not read it. Default: False.

    Raises:
        TypeError: If ``mean`` or ``std`` is not a list, tuple or np.ndarray
            (any other ``collections.abc.Sequence`` is accepted as well).
    """

    def __init__(self, mean, std, to_bgr=False, adjust_magnitude=False):
        # np.ndarray is not registered as a collections.abc.Sequence, so it
        # has to be allowed explicitly to match the error message below.
        if not isinstance(mean, (Sequence, np.ndarray)):
            raise TypeError(
                f'Mean must be list, tuple or np.ndarray, but got {type(mean)}')

        if not isinstance(std, (Sequence, np.ndarray)):
            raise TypeError(
                f'Std must be list, tuple or np.ndarray, but got {type(std)}')

        self.mean = np.array(mean, dtype=np.float32)
        self.std = np.array(std, dtype=np.float32)
        self.to_bgr = to_bgr
        self.adjust_magnitude = adjust_magnitude

    def __call__(self, results):
        """Normalize ``results['imgs']`` and record the normalization config.

        Args:
            results (dict): Must contain a non-empty "imgs" sequence whose
                first element has a 3-dimensional ``shape``.

        Returns:
            dict: ``results`` with "imgs" replaced by a float32 array and
                "img_norm_cfg" added.
        """
        n = len(results['imgs'])
        h, w, c = results['imgs'][0].shape
        # Stack into a float32 buffer first: imnormalize_ refuses uint8 input.
        imgs = np.empty((n, h, w, c), dtype=np.float32)
        for i, img in enumerate(results['imgs']):
            imgs[i] = img

        # Iterating yields views, so the in-place normalization updates imgs.
        for img in imgs:
            imnormalize_(img, self.mean, self.std, self.to_bgr)

        results['imgs'] = imgs
        results['img_norm_cfg'] = dict(
            mean=self.mean, std=self.std, to_bgr=self.to_bgr)

        return results

    def __repr__(self):
        repr_str = (f'{self.__class__.__name__}('
                    f'mean={self.mean}, '
                    f'std={self.std}, '
                    f'to_bgr={self.to_bgr}, '
                    f'adjust_magnitude={self.adjust_magnitude})')
        return repr_str
|
@ -0,0 +1,76 @@
|
||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from collections.abc import Sequence
|
||||
from ..registry import PIPELINES
|
||||
import traceback
|
||||
from ...utils import build
|
||||
from ...utils import get_logger
|
||||
|
||||
|
||||
@PIPELINES.register()
class Compose(object):
    """
    Composes several pipelines(include decode func, sample func, and transforms) together.

    Note: To deal with ```list``` type cfg temporarily, like:

        transform:
            - Crop:  # A list
                attribute: 10
            - Resize: # A list
                attribute: 20

    every key of list will pass as the key name to build a module.
    XXX: will be improved in the future.

    Args:
        pipelines (dict): Mapping whose values are the steps to compose.
            Each value is either a dict config (built through ``build``),
            a list of single-key dicts in the old format shown above, or
            an already callable object.
    Returns:
        A compose object which is callable, __call__ for this Compose
        object will call each given :attr:`transforms` sequentially.
    Raises:
        TypeError: If a value is neither a dict, a list nor callable.
    """
    def __init__(self, pipelines):
        #assert isinstance(pipelines, Sequence)
        self.pipelines = []
        for p in pipelines.values():
            if isinstance(p, dict):
                p = build(p, PIPELINES)
                self.pipelines.append(p)
            elif isinstance(p, list):
                for t in p:
                    #XXX: to deal with old format cfg, ugly code here!
                    # The first key names the module; the values carry its
                    # keyword arguments (None means "no arguments").
                    temp_dict = dict(name=list(t.keys())[0])
                    for all_sub_t in t.values():
                        if all_sub_t is not None:
                            temp_dict.update(all_sub_t)

                    t = build(temp_dict, PIPELINES)
                    self.pipelines.append(t)
            elif callable(p):
                self.pipelines.append(p)
            else:
                raise TypeError(f'pipelines must be callable, a dict or a '
                                f'list, but got {type(p)}')

    def __call__(self, data):
        """Run ``data`` through every step in order and return the result.

        A failing step is logged together with its stack trace and the
        exception is re-raised unchanged.
        """
        for p in self.pipelines:
            try:
                data = p(data)
            except Exception as e:
                stack_info = traceback.format_exc()
                logger = get_logger("paddlevideo")
                logger.info("fail to perform transform [{}] with error: "
                            "{} and stack:\n{}".format(p, e, str(stack_info)))
                # Bare raise keeps the original traceback intact.
                raise
        return data
|
@ -0,0 +1,348 @@
|
||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import numpy as np
|
||||
try:
|
||||
import av
|
||||
except ImportError as e:
|
||||
print(
|
||||
f"Warning! {e}, [av] package and it's dependencies is required for TimeSformer and other models."
|
||||
)
|
||||
import cv2
|
||||
import pickle
|
||||
import decord as de
|
||||
import math
|
||||
import random
|
||||
from ..registry import PIPELINES
|
||||
|
||||
|
||||
def get_start_end_idx(video_size, clip_size, clip_idx, num_clips):
    """Pick the start and end index of a clip inside a video.

    Args:
        video_size: Number of frames available.
        clip_size: Length of the clip, in frames (may be fractional).
        clip_idx: ``-1`` draws the start uniformly at random from
            ``[0, max(video_size - clip_size, 0)]``; any other value places
            the clip at ``slack * clip_idx / num_clips``.
        num_clips: Divisor used for the non-random placement.

    Returns:
        tuple: ``(start_idx, end_idx)`` with
            ``end_idx == start_idx + clip_size - 1``. Both may be floats.
    """
    # Room left for moving the clip; zero when the video is shorter than it.
    slack = max(video_size - clip_size, 0)
    first = (random.uniform(0, slack)
             if clip_idx == -1 else slack * clip_idx / num_clips)
    return first, first + clip_size - 1
|
||||
|
||||
|
||||
@PIPELINES.register()
class VideoDecoder(object):
    """
    Decode a video file to frames with the 'cv2', 'decord' or 'pyav' backend.

    Args:
        backend (str): Decoder to use: 'cv2', 'decord' or 'pyav'.
        mode (str): 'train', 'valid' or 'test'; only read by 'pyav'.
        sampling_rate (int): Only read by 'pyav'; together with ``num_seg``
            and ``target_fps`` it determines the clip size.
        num_seg (int): Only read by 'pyav' (see ``sampling_rate``).
        num_clips (int): Stored on the instance; the 'pyav' branch always
            uses 1.
        target_fps (int): Only read by 'pyav' (see ``sampling_rate``).
    """
    def __init__(self,
                 backend='cv2',
                 mode='train',
                 sampling_rate=32,
                 num_seg=8,
                 num_clips=1,
                 target_fps=30):

        self.backend = backend
        # params below only for TimeSformer
        self.mode = mode
        self.sampling_rate = sampling_rate
        self.num_seg = num_seg
        self.num_clips = num_clips
        self.target_fps = target_fps

    def __call__(self, results):
        """
        Decode ``results['filename']`` and store the frames in ``results``.

        Writes "format", "backend", "frames" and "frames_len"; the 'pyav'
        backend additionally writes "start_idx" and "end_idx".

        return:
            The updated ``results`` dict. "frames" is a list of arrays for
            'cv2' and 'pyav', and the ``decord.VideoReader`` for 'decord'.
        Raises:
            NotImplementedError: For an unknown backend, or an unknown
                ``mode`` with the 'pyav' backend.
        """
        file_path = results['filename']
        results['format'] = 'video'
        results['backend'] = self.backend

        if self.backend == 'cv2':
            cap = cv2.VideoCapture(file_path)
            try:
                videolen = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
                sampledFrames = []
                for _ in range(videolen):
                    ret, frame = cap.read()
                    # maybe first frame is empty
                    if not ret:
                        continue
                    # Reverse the channel axis of the decoded frame.
                    sampledFrames.append(frame[:, :, ::-1])
            finally:
                # Always give the capture handle back, even on failure.
                cap.release()
            results['frames'] = sampledFrames
            results['frames_len'] = len(sampledFrames)

        elif self.backend == 'decord':
            container = de.VideoReader(file_path)
            frames_len = len(container)
            results['frames'] = container
            results['frames_len'] = frames_len

        elif self.backend == 'pyav':  # for TimeSformer
            if self.mode in ["train", "valid"]:
                clip_idx = -1
            elif self.mode in ["test"]:
                clip_idx = 0
            else:
                raise NotImplementedError

            container = av.open(file_path)
            try:
                num_clips = 1  # always be 1

                # decode process
                video_stream = container.streams.video[0]
                fps = float(video_stream.average_rate)
                frames_length = video_stream.frames
                duration = video_stream.duration
                clip_sz = (self.sampling_rate * self.num_seg /
                           self.target_fps * fps)

                if duration is None or not frames_length:
                    # If failed to fetch the decoding information (missing
                    # duration or a zero frame count), decode the entire
                    # video instead of dividing by zero below.
                    decode_all_video = True
                    video_start_pts, video_end_pts = 0, math.inf
                else:
                    decode_all_video = False
                    start_idx, end_idx = get_start_end_idx(
                        frames_length, clip_sz, clip_idx, num_clips)
                    timebase = duration / frames_length
                    video_start_pts = int(start_idx * timebase)
                    video_end_pts = int(end_idx * timebase)

                # Seek a little before the window and decode forward.
                margin = 1024
                seek_offset = max(video_start_pts - margin, 0)
                container.seek(seek_offset,
                               any_frame=False,
                               backward=True,
                               stream=video_stream)
                tmp_frames = {}
                for frame in container.decode(**{"video": 0}):
                    if frame.pts < video_start_pts:
                        continue
                    tmp_frames[frame.pts] = frame
                    if frame.pts > video_end_pts:
                        # Keep one frame past the window, then stop.
                        break
                video_frames = [tmp_frames[pts] for pts in sorted(tmp_frames)]
            finally:
                container.close()

            frames = [frame.to_rgb().to_ndarray() for frame in video_frames]

            start_idx, end_idx = get_start_end_idx(
                len(frames),  # frame_len
                clip_sz,
                # If decode all video, -1 in train and valid, 0 in test;
                # else, always 0 in train, valid and test, as the clip-size
                # window was already selected while decoding.
                clip_idx if decode_all_video else 0,
                1)
            results['frames'] = frames
            results['frames_len'] = len(frames)
            results['start_idx'] = start_idx
            results['end_idx'] = end_idx
        else:
            raise NotImplementedError
        return results
|
||||
|
||||
|
||||
@PIPELINES.register()
class FrameDecoder(object):
    """Pipeline step that only tags a sample as frame-based data."""

    def __init__(self):
        """Nothing to configure."""

    def __call__(self, results):
        """Set ``results['format']`` to ``'frame'`` and return ``results``."""
        results['format'] = 'frame'
        return results
|
||||
|
||||
|
||||
@PIPELINES.register()
class MRIDecoder(object):
    """Pipeline step that only tags a sample as MRI data."""

    def __init__(self):
        """Nothing to configure."""

    def __call__(self, results):
        """Set ``results['format']`` to ``'MRI'`` and return ``results``."""
        results['format'] = 'MRI'
        return results
|
||||
|
||||
|
||||
@PIPELINES.register()
class FeatureDecoder(object):
    """
    Perform feature decode operations.e.g.youtube8m

    Args:
        num_classes (int): Length of the one-hot label vector.
        max_len (int): Number of rows every feature is zero-padded to.
        has_label (bool): Whether the record carries a "label" entry.
    """
    def __init__(self, num_classes, max_len=512, has_label=True):
        self.max_len = max_len
        self.num_classes = num_classes
        self.has_label = has_label

    @staticmethod
    def _field(record, key):
        """Return ``record[key]``, falling back to the bytes form of the key."""
        return record[key] if key in record else record[key.encode()]

    def __call__(self, results):
        """
        Load a pickled feature record, dequantize and pad it.

        Writes "labels" (when ``has_label``), and for each of the prefixes
        "rgb_" and "audio_" the keys "<prefix>len", "<prefix>data" and
        "<prefix>mask".

        return:
            The updated ``results`` dict.
        """
        #1. load pkl
        #2. parse to rgb/audio/
        #3. padding

        filepath = results['filename']
        # NOTE(security): pickle can execute arbitrary code while loading;
        # only feed this decoder files from a trusted source.
        with open(filepath, 'rb') as f:
            record = pickle.load(f, encoding='bytes')

        nframes = self._field(record, 'nframes')
        rgb = self._field(record, 'feature').astype(float)
        audio = self._field(record, 'audio').astype(float)
        if self.has_label:
            label = self._field(record, 'label')
            one_hot_label = self.make_one_hot(label, self.num_classes)

        rgb = rgb[0:nframes, :]
        audio = audio[0:nframes, :]

        rgb = self.dequantize(rgb,
                              max_quantized_value=2.,
                              min_quantized_value=-2.)
        audio = self.dequantize(audio,
                                max_quantized_value=2,
                                min_quantized_value=-2)

        if self.has_label:
            results['labels'] = one_hot_label.astype("float32")

        for prefix, feat in (("rgb_", rgb), ("audio_", audio)):
            results[prefix + 'len'] = feat.shape[0]
            #feat pad step 1. padding
            feat_add = np.zeros((self.max_len - feat.shape[0], feat.shape[1]),
                                dtype=np.float32)
            feat_pad = np.concatenate((feat, feat_add), axis=0)
            results[prefix + 'data'] = feat_pad.astype("float32")
            #feat pad step 2. mask: ones over real rows, zeros over padding
            feat_mask_origin = np.ones(feat.shape, dtype=np.float32)
            feat_mask = np.concatenate((feat_mask_origin, feat_add), axis=0)
            results[prefix + 'mask'] = feat_mask.astype("float32")

        return results

    def dequantize(self,
                   feat_vector,
                   max_quantized_value=2.,
                   min_quantized_value=-2.):
        """
        Dequantize the feature from the byte format to the float format

        Computes ``feat_vector * scalar + bias`` with
        ``scalar = range / 255`` and ``bias = range / 512 + min``, where
        ``range = max_quantized_value - min_quantized_value``.
        """

        assert max_quantized_value > min_quantized_value
        quantized_range = max_quantized_value - min_quantized_value
        scalar = quantized_range / 255.0
        bias = (quantized_range / 512.0) + min_quantized_value

        return feat_vector * scalar + bias

    def make_one_hot(self, label, dim=3862):
        """Return a float vector of length ``dim`` with 1 at every index in ``label``."""
        one_hot_label = np.zeros(dim, dtype=float)
        for ind in label:
            one_hot_label[int(ind)] = 1
        return one_hot_label
|
||||
|
||||
|
||||
@PIPELINES.register()
class ActionFeatureDecoder(object):
    """
    Perform feature decode operations on footballaction

    Args:
        num_classes (int): Stored on the instance; not read by ``__call__``.
        max_len (int): Number of rows every feature is zero-padded to.
        has_label (bool): Stored on the instance; not read by ``__call__``.
    """
    def __init__(self, num_classes, max_len=512, has_label=True):
        self.max_len = max_len
        self.num_classes = num_classes
        self.has_label = has_label

    def __call__(self, results):
        """
        Load a pickled feature record and pad its image and audio features.

        Writes "labels", "iou_norm", and for each of the prefixes "rgb_"
        and "audio_" the keys "<prefix>len", "<prefix>data" and
        "<prefix>mask".

        return:
            The updated ``results`` dict.
        """
        #1. load pkl
        #2. parse to rgb/audio/
        #3. padding

        filepath = results['filename']
        # NOTE(security): pickle can execute arbitrary code while loading;
        # only feed this decoder files from a trusted source.
        with open(filepath, 'rb') as f:
            pkl_data = pickle.load(f, encoding='bytes')

        rgb = pkl_data['image_feature'].astype(float)
        audio = pkl_data['audio_feature'].astype(float)
        label_id_info = pkl_data['label_info']
        label_cls = [label_id_info['label']]
        label_one = int(label_cls[0])
        # NOTE(review): label_cls is built as a one-element list just above,
        # so this branch cannot run as written - confirm the intended input.
        if len(label_cls) > 1:
            label_index = random.randint(0, 1)
            label_one = int(label_cls[label_index])
        iou_norm = float(label_id_info['norm_iou'])
        results['labels'] = np.array([label_one])
        results['iou_norm'] = iou_norm

        for prefix, feat in (("rgb_", rgb), ("audio_", audio)):
            results[prefix + 'len'] = feat.shape[0]
            #feat pad step 1. padding
            feat_add = np.zeros((self.max_len - feat.shape[0], feat.shape[1]),
                                dtype=np.float32)
            feat_pad = np.concatenate((feat, feat_add), axis=0)
            results[prefix + 'data'] = feat_pad.astype("float32")
            #feat pad step 2. mask: ones over real rows, zeros over padding
            feat_mask_origin = np.ones(feat.shape, dtype=np.float32)
            feat_mask = np.concatenate((feat_mask_origin, feat_add), axis=0)
            results[prefix + 'mask'] = feat_mask.astype("float32")

        return results
|
@ -0,0 +1,206 @@
|
||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
|
||||
import numpy as np
|
||||
import PIL.Image as pil
|
||||
|
||||
try:
|
||||
import skimage.transform
|
||||
except ImportError as e:
|
||||
print(
|
||||
f"Warning! {e}, [scikit-image] package and it's dependencies is required for ADDS."
|
||||
)
|
||||
from PIL import Image
|
||||
|
||||
from ..registry import PIPELINES
|
||||
|
||||
|
||||
@PIPELINES.register()
class ImageDecoder(object):
    """Decode Image

    Loads the colour images of a sample (and, for validation samples, the
    depth ground truth) into ``results['imgs']``.

    Args:
        dataset (str): Dataset layout name used to build file paths
            ("kitti", "kitti_odom" or "kitti_depth" for images).
        frame_idxs (list): Frame offsets to load; the entry "s" selects the
            opposite side of the current frame.
        num_scales (int): Copied into ``results['num_scales']``.
        side_map (dict): Maps a side key to the value formatted into
            image directory names.
        full_res_shape (tuple): Target size for the depth ground truth.
        img_ext (str): File extension appended to frame file names.
        backend (str): Copied into ``results['backend']``.
    """
    def __init__(self,
                 dataset,
                 frame_idxs,
                 num_scales,
                 side_map,
                 full_res_shape,
                 img_ext,
                 backend='cv2'):
        self.backend = backend
        self.dataset = dataset
        self.frame_idxs = frame_idxs
        self.num_scales = num_scales
        self.side_map = side_map
        self.full_res_shape = full_res_shape
        self.img_ext = img_ext

    def _pil_loader(self, path):
        """Open ``path`` with PIL and return it converted to RGB."""
        with open(path, 'rb') as f:
            with Image.open(f) as img:
                return img.convert('RGB')

    def get_color(self, folder, frame_index, side):
        """Load the RGB image of one frame for ``self.dataset``."""
        color = self._pil_loader(
            self.get_image_path(self.dataset, folder, frame_index, side))
        return color

    def get_image_path(self, dataset, folder, frame_index, side):
        """Build the image path for a frame.

        Raises:
            ValueError: If ``dataset`` is not a known layout name.
        """
        if dataset == "kitti":
            f_str = "{:010d}{}".format(frame_index, self.img_ext)
            image_path = os.path.join(self.data_path, folder, f_str)
        elif dataset == "kitti_odom":
            f_str = "{:06d}{}".format(frame_index, self.img_ext)
            image_path = os.path.join(self.data_path,
                                      "sequences/{:02d}".format(int(folder)),
                                      "image_{}".format(self.side_map[side]),
                                      f_str)
        elif dataset == "kitti_depth":
            f_str = "{:010d}{}".format(frame_index, self.img_ext)
            image_path = os.path.join(
                self.data_path, folder,
                "image_0{}/data".format(self.side_map[side]), f_str)
        else:
            # Previously fell through to an unbound local variable.
            raise ValueError("Unsupported dataset: {}".format(dataset))

        return image_path

    def get_depth(self, dataset, folder, frame_index, side):
        """Load the depth ground truth of a frame as a float array.

        For the "kitti_depth" layout the PNG is resized with PIL and divided
        by 256. Any other ``dataset`` reads ``<folder>_gt``, requires values
        above 255, divides by 256, keeps rows 160..799 and resizes with
        ``skimage.transform.resize``.
        """
        # "kitii_depth" is the historical misspelling; keep accepting it so
        # existing configs continue to work.
        if dataset in ("kitti_depth", "kitii_depth"):
            f_str = "{:010d}.png".format(frame_index)
            depth_path = os.path.join(
                self.data_path, folder,
                "proj_depth/groundtruth/image_0{}".format(self.side_map[side]),
                f_str)

            depth_gt = pil.open(depth_path)
            depth_gt = depth_gt.resize(self.full_res_shape, pil.NEAREST)
            depth_gt = np.array(depth_gt).astype(np.float32) / 256

        else:
            f_str = "{:010d}{}".format(frame_index, self.img_ext)
            depth_path = os.path.join(self.data_path, folder + '_gt', f_str)

            img_file = Image.open(depth_path)
            depth_png = np.array(img_file, dtype=int)
            img_file.close()
            # make sure we have a proper 16bit depth map here.. not 8bit!
            assert np.max(depth_png) > 255, \
                "np.max(depth_png)={}, path={}".format(np.max(depth_png), depth_path)

            # np.float was an alias of the builtin float (float64) and no
            # longer exists in current NumPy releases.
            depth_gt = depth_png.astype(np.float64) / 256.

            depth_gt = depth_gt[160:960 - 160, :]

            depth_gt = skimage.transform.resize(depth_gt,
                                                self.full_res_shape[::-1],
                                                order=0,
                                                preserve_range=True,
                                                mode='constant')

        return depth_gt

    def __call__(self, results):
        """
        Load the images of one sample into ``results['imgs']``.

        In 'infer' mode only ``results['filename']`` is opened. Otherwise
        the mode is derived from the second '_'-separated token of
        ``results['folder']``: "train" loads each frame from two folders
        ("color" and "color_n"), "val" loads "color" plus "depth_gt".

        return:
            The updated ``results`` dict; "imgs" is a dict keyed by
            ``(name, frame_idx, -1)`` tuples and optionally "depth_gt".
        """
        if results.get('mode', None) == 'infer':
            imgs = {}
            imgs[("color", 0,
                  -1)] = Image.open(results["filename"]).convert("RGB")
            results['imgs'] = imgs
            return results

        self.data_path = results['data_path']
        results['backend'] = self.backend

        imgs = {}

        results['frame_idxs'] = self.frame_idxs
        results['num_scales'] = self.num_scales

        file_name = results['filename']
        folder = results['folder']
        frame_index = results['frame_index']
        line = file_name.split('/')
        istrain = folder.split('_')[1]
        if 'mode' not in results:
            results['mode'] = istrain
        results['day_or_night'] = folder.split('_')[0]

        if istrain == "train":
            if folder[0] == 'd':
                folder2 = folder + '_fake_night'
                flag = 0
            else:
                # Swap so that `folder` is the "_fake_day" folder and
                # `folder2` is the original one.
                folder, folder2 = folder + '_fake_day', folder
                flag = 1

            if len(line) == 3:
                side = line[2]
            else:
                side = None

            results['side'] = side

            for i in self.frame_idxs:

                if i == "s":
                    other_side = {"r": "l", "l": "r"}[side]
                    imgs[("color", i,
                          -1)] = self.get_color(folder, frame_index, other_side)
                    imgs[("color_n", i,
                          -1)] = self.get_color(folder2, frame_index,
                                                other_side)
                else:
                    imgs[("color", i,
                          -1)] = self.get_color(folder, frame_index + i, side)
                    imgs[("color_n", i,
                          -1)] = self.get_color(folder2, frame_index + i, side)

            # NOTE(review): `folder` only ever gains a suffix above, so its
            # second token is still "train" and this branch looks
            # unreachable - confirm before relying on it.
            istrain = folder.split('_')[1]
            if istrain != 'train':
                # get_depth needs the dataset name as its first argument.
                if flag:
                    depth_gt = self.get_depth(self.dataset, folder2,
                                              frame_index, side)
                else:
                    depth_gt = self.get_depth(self.dataset, folder,
                                              frame_index, side)
                imgs["depth_gt"] = np.expand_dims(depth_gt, 0)
        elif istrain == 'val':
            if len(line) == 3:
                side = line[2]
            else:
                side = None

            for i in self.frame_idxs:
                if i == "s":
                    other_side = {"r": "l", "l": "r"}[side]
                    imgs[("color", i,
                          -1)] = self.get_color(folder, frame_index, other_side)
                else:

                    imgs[("color", i,
                          -1)] = self.get_color(folder, frame_index + i, side)

            depth_gt = self.get_depth(self.dataset, folder, frame_index, side)
            imgs["depth_gt"] = np.expand_dims(depth_gt, 0)
        results['imgs'] = imgs

        return results
|
@ -0,0 +1,93 @@
|
||||
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import random
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
import decord as de
|
||||
from ..registry import PIPELINES
|
||||
|
||||
|
||||
@PIPELINES.register()
class DecodeSampler(object):
    """
    We use 'decord' for decode and sampling, which is faster than opencv.
    This is used in slowfast model.
    Args:
        num_frames(int): the number of frames we want to sample.
        sampling_rate(int): sampling rate for video data.
        default_sampling_rate(int): lower bound of the sampling rate drawn
            at random when ``results['short_cycle_idx']`` is truthy.
        target_fps(int): desired fps, default 30
        test_mode(bool): whether test or train/valid. In slowfast, we use multicrop when test.
    """
    def __init__(self,
                 num_frames,
                 sampling_rate,
                 default_sampling_rate=2,
                 target_fps=30,
                 test_mode=False):
        self.num_frames = num_frames
        self.orig_sampling_rate = self.sampling_rate = sampling_rate
        self.default_sampling_rate = default_sampling_rate
        self.target_fps = target_fps
        self.test_mode = test_mode

    def get_start_end_idx(self, video_size, clip_size, clip_idx,
                          temporal_num_clips):
        """Return ``(start_idx, end_idx)`` of a clip; random unless test_mode."""
        delta = max(video_size - clip_size, 0)
        if not self.test_mode:
            # Random temporal sampling.
            start_idx = random.uniform(0, delta)
        else:
            # Uniformly sample the clip with the given index.
            start_idx = delta * clip_idx / temporal_num_clips
        end_idx = start_idx + clip_size - 1
        return start_idx, end_idx

    def __call__(self, results):
        """
        Decode ``results['filename']`` with decord and sample ``num_frames``
        frames from one clip.

        Reads "filename", "temporal_sample_index", "temporal_num_clips" and
        the optional "short_cycle_idx"; writes "imgs".

        return:
            The updated ``results`` dict; "imgs" is a list of PIL RGB images.
        """
        short_cycle_idx = results.get('short_cycle_idx')
        if short_cycle_idx:
            self.sampling_rate = random.randint(self.default_sampling_rate,
                                                self.orig_sampling_rate)

        filepath = results['filename']
        temporal_sample_index = results['temporal_sample_index']
        temporal_num_clips = results['temporal_num_clips']

        vr = de.VideoReader(filepath)
        videolen = len(vr)

        # The clip size below is expressed in source-video frames, so the
        # source frame rate is required (it was previously commented out,
        # which left `fps` undefined).
        fps = vr.get_avg_fps()
        clip_size = self.num_frames * self.sampling_rate * fps / self.target_fps

        start_idx, end_idx = self.get_start_end_idx(videolen, clip_size,
                                                    temporal_sample_index,
                                                    temporal_num_clips)
        index = np.linspace(start_idx, end_idx, self.num_frames).astype("int64")
        # Valid frame indices are 0 .. videolen - 1.
        index = np.clip(index, 0, videolen - 1)

        frames_select = vr.get_batch(index)  #1 for buffer

        # dearray_to_img
        np_frames = frames_select.asnumpy()
        frames_select_list = [
            Image.fromarray(imgbuf, mode='RGB') for imgbuf in np_frames
        ]
        results['imgs'] = frames_select_list
        return results
|
@ -0,0 +1,224 @@
|
||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
import random
|
||||
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
try:
|
||||
import SimpleITK as sitk
|
||||
except ImportError as e:
|
||||
print(
|
||||
f"Warning! {e}, [SimpleITK] package and it's dependencies is required for PP-Care."
|
||||
)
|
||||
import cv2
|
||||
|
||||
from ..registry import PIPELINES
|
||||
|
||||
|
||||
@PIPELINES.register()
|
||||
class SFMRI_DecodeSampler(object):
    """
    Sample two lists of slice indices and decode them from an MRI volume.

    One index list is sized by ``num_seg[0]`` (the ``_s`` list) and one by
    ``num_seg[1]`` (the ``_f`` list) -- presumably the slow and fast pathways
    of a SlowFast-style model; confirm against the model config. The selected
    slices are read with SimpleITK and resized to 224x224 with OpenCV.

    Args:
        num_seg(sequence of int): number of segments for the two index lists;
            ``num_seg[0]`` sizes the first list and ``num_seg[1]`` the second.
        seg_len(int): number of sampled frames in each segment.
        valid_mode(bool): True or False. When False, random offsets are drawn.
        select_left: Whether to select the frame to the left in the middle when the sampling interval is even in the test mode.
        dense_sample(bool): sample with a stride derived from a 64-frame window.
        linspace_sample(bool): sample evenly spaced indices with ``np.linspace``.
    Returns:
        results: the input dict with ``results['imgs'] = [imgs_s, imgs_f]``.

    NOTE(review): the dense-sample branch produces 1-based indices (``+ 1``)
    while ``_get`` indexes the volume array directly; verify that this is the
    intended convention for MRI input.
    """
    def __init__(self,
                 num_seg,
                 seg_len,
                 valid_mode=False,
                 select_left=False,
                 dense_sample=False,
                 linspace_sample=False):
        self.num_seg = num_seg
        self.seg_len = seg_len
        self.valid_mode = valid_mode
        self.select_left = select_left
        self.dense_sample = dense_sample
        self.linspace_sample = linspace_sample

    def _get(self, frames_idx_s, frames_idx_f, results):
        """Read the volume at ``results['frame_dir']`` and gather the slices.

        Each selected slice is resized to 224x224; the two lists are stored in
        ``results['imgs']`` as ``[imgs_s, imgs_f]``.
        """
        frame_dir = results['frame_dir']
        volume = sitk.GetArrayFromImage(sitk.ReadImage(frame_dir))
        imgs_s = [cv2.resize(volume[idx], (224, 224)) for idx in frames_idx_s]
        imgs_f = [cv2.resize(volume[idx], (224, 224)) for idx in frames_idx_f]
        results['imgs'] = [imgs_s, imgs_f]
        return results

    @staticmethod
    def _dense_offsets(start_idx, num_seg, frames_len):
        """Return ``num_seg`` strided, wrapped, 1-based offsets from ``start_idx``."""
        t_stride = 64 // num_seg
        return [(idx * t_stride + start_idx) % frames_len + 1
                for idx in range(num_seg)]

    def _segment_indices(self, num_seg, average_dur):
        """Return ``seg_len`` consecutive indices for each of ``num_seg`` segments."""
        frames_idx = []
        for i in range(num_seg):
            if average_dur >= self.seg_len:
                if not self.valid_mode:
                    # Training: random start inside the segment.
                    idx = random.randint(0, average_dur - self.seg_len)
                else:
                    # Validation: deterministic middle of the segment.
                    idx = (average_dur - 1) // 2
                idx += i * average_dur
            elif average_dur >= 1:
                idx = i * average_dur
            else:
                idx = i
            frames_idx.extend(range(idx, idx + self.seg_len))
        return frames_idx

    def _sample_indices(self, results):
        """Compute the two index lists ``(frames_idx_s, frames_idx_f)``.

        Reads ``results['frames_len']`` and, for linspace sampling, the
        optional ``results['start_idx']`` / ``results['end_idx']``.
        """
        frames_len = int(results['frames_len'])
        num_s, num_f = self.num_seg[0], self.num_seg[1]
        average_dur1 = int(frames_len / num_s)
        average_dur2 = int(frames_len / num_f)

        if self.linspace_sample:
            if 'start_idx' in results and 'end_idx' in results:
                first, last = results['start_idx'], results['end_idx']
            else:
                first, last = 0, frames_len - 1
            offsets_s = np.linspace(first, last, num_s)
            offsets_f = np.linspace(first, last, num_f)
            offsets_s = np.clip(offsets_s, 0, frames_len - 1).astype(np.int64)
            offsets_f = np.clip(offsets_f, 0, frames_len - 1).astype(np.int64)
            return list(offsets_s), list(offsets_f)

        if not self.select_left:
            if self.dense_sample:  # For ppTSM
                sample_pos = max(1, 1 + frames_len - 64)
                if not self.valid_mode:  # train: one random start
                    start_idx = 0 if sample_pos == 1 else np.random.randint(
                        0, sample_pos - 1)
                    starts = [start_idx]
                else:  # valid: ten evenly spaced starts
                    starts = np.linspace(0, sample_pos - 1, num=10,
                                         dtype=int).tolist()
                frames_idx_s = [
                    offset for start_idx in starts for offset in
                    self._dense_offsets(start_idx, num_s, frames_len)
                ]
                frames_idx_f = [
                    offset for start_idx in starts for offset in
                    self._dense_offsets(start_idx, num_f, frames_len)
                ]
            else:
                frames_idx_s = self._segment_indices(num_s, average_dur1)
                frames_idx_f = self._segment_indices(num_f, average_dur2)
            return frames_idx_s, frames_idx_f

        # for TSM
        if not self.valid_mode:
            # Both durations must be positive: np.random.randint(0, ...)
            # raises ValueError.
            if average_dur1 > 0 and average_dur2 > 0:
                offsets_s = np.multiply(list(range(num_s)),
                                        average_dur1) + np.random.randint(
                                            average_dur1, size=num_s)
                offsets_f = np.multiply(list(range(num_f)),
                                        average_dur2) + np.random.randint(
                                            average_dur2, size=num_f)
            elif frames_len > num_f:
                offsets_s = np.sort(np.random.randint(frames_len, size=num_s))
                offsets_f = np.sort(np.random.randint(frames_len, size=num_f))
            else:
                # Integer dtype: these values are used as array indices.
                offsets_s = np.zeros(shape=(num_s, ), dtype=np.int64)
                offsets_f = np.zeros(shape=(num_f, ), dtype=np.int64)
        else:
            if frames_len > num_f:
                average_dur_float_s = frames_len / num_s
                offsets_s = np.array([
                    int(average_dur_float_s / 2.0 + average_dur_float_s * x)
                    for x in range(num_s)
                ])
                average_dur_float_f = frames_len / num_f
                offsets_f = np.array([
                    int(average_dur_float_f / 2.0 + average_dur_float_f * x)
                    for x in range(num_f)
                ])
            else:
                # Integer dtype: these values are used as array indices.
                offsets_s = np.zeros(shape=(num_s, ), dtype=np.int64)
                offsets_f = np.zeros(shape=(num_f, ), dtype=np.int64)

        return list(offsets_s), list(offsets_f)

    def __call__(self, results):
        """
        Args:
            results: dict providing ``frames_len`` (length of frames) and
                ``frame_dir`` (path of the volume to read).
        return:
            results with the decoded slices stored under ``imgs``.
        """
        frames_idx_s, frames_idx_f = self._sample_indices(results)
        return self._get(frames_idx_s, frames_idx_f, results)
|
@ -0,0 +1,116 @@
|
||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import numpy as np
|
||||
|
||||
from ..registry import PIPELINES
|
||||
|
||||
|
||||
@PIPELINES.register()
|
||||
class Mixup(object):
    """
    Mixup operator: blend each sample of a batch with a randomly chosen
    partner from the same batch.
    Args:
        alpha(float): alpha value, used for both parameters of the Beta
            distribution the mixing weight is drawn from.
    """
    def __init__(self, alpha=0.2):
        assert alpha > 0., \
            'parameter alpha[%f] should > 0.0' % (alpha)
        self.alpha = alpha

    def __call__(self, batch):
        """Return a list of ``(mixed_img, label, partner_label, lam)`` tuples."""
        images, targets = zip(*batch)
        images = np.array(images)
        targets = np.array(targets)

        batch_size = len(batch)
        # Draw the partner permutation first, then the mixing weight.
        partner = np.random.permutation(batch_size)
        lam = np.random.beta(self.alpha, self.alpha)

        mixed = lam * images + (1 - lam) * images[partner]
        weights = np.full(batch_size, lam, dtype=np.float32)
        return list(zip(mixed, targets, targets[partner], weights))
|
||||
|
||||
|
||||
@PIPELINES.register()
|
||||
class Cutmix(object):
    """ Cutmix operator: paste a random box from a partner sample into each
    sample of the batch.
    Args:
        alpha(float): alpha value, used for both parameters of the Beta
            distribution the initial mixing weight is drawn from.
    """
    def __init__(self, alpha=0.2):
        assert alpha > 0., \
            'parameter alpha[%f] should > 0.0' % (alpha)
        self.alpha = alpha

    def rand_bbox(self, size, lam):
        """ Draw a random box ``(bbx1, bby1, bbx2, bby2)``.

        ``size[2]`` and ``size[3]`` bound the two box axes; each box side is
        ``sqrt(1 - lam)`` times the corresponding extent, clipped to the
        valid range.
        """
        w = size[2]
        h = size[3]
        cut_rat = np.sqrt(1. - lam)
        # Built-in int(): the np.int alias was removed in NumPy 1.24.
        cut_w = int(w * cut_rat)
        cut_h = int(h * cut_rat)

        # uniform
        cx = np.random.randint(w)
        cy = np.random.randint(h)

        bbx1 = np.clip(cx - cut_w // 2, 0, w)
        bby1 = np.clip(cy - cut_h // 2, 0, h)
        bbx2 = np.clip(cx + cut_w // 2, 0, w)
        bby2 = np.clip(cy + cut_h // 2, 0, h)

        return bbx1, bby1, bbx2, bby2

    def __call__(self, batch):
        """Return a list of ``(img, label, partner_label, lam)`` tuples.

        ``lam`` is recomputed from the area of the box actually pasted.
        """
        imgs, labels = list(zip(*batch))
        imgs = np.array(imgs)
        labels = np.array(labels)

        bs = len(batch)
        idx = np.random.permutation(bs)
        lam = np.random.beta(self.alpha, self.alpha)

        bbx1, bby1, bbx2, bby2 = self.rand_bbox(imgs.shape, lam)
        imgs[:, :, bbx1:bbx2, bby1:bby2] = imgs[idx, :, bbx1:bbx2, bby1:bby2]
        # Clipping may shrink the box, so derive lam from the real area.
        lam = 1 - (float(bbx2 - bbx1) * (bby2 - bby1) /
                   (imgs.shape[-2] * imgs.shape[-1]))
        lams = np.array([lam] * bs, dtype=np.float32)

        return list(zip(imgs, labels, labels[idx], lams))
|
||||
|
||||
|
||||
@PIPELINES.register()
|
||||
class VideoMix(object):
    """
    VideoMix operator: apply Cutmix or Mixup to a batch, chosen at random.
    Args:
        cutmix_prob(float): prob choose cutmix
        mixup_alpha(float): alpha for mixup aug
        cutmix_alpha(float): alpha for cutmix aug
    """
    def __init__(self, cutmix_prob=0.5, mixup_alpha=0.2, cutmix_alpha=1.0):
        assert cutmix_prob > 0., \
            'parameter cutmix_prob[%f] should > 0.0' % (cutmix_prob)
        assert mixup_alpha > 0., \
            'parameter mixup_alpha[%f] should > 0.0' % (mixup_alpha)
        assert cutmix_alpha > 0., \
            'parameter cutmix_alpha[%f] should > 0.0' % (cutmix_alpha)
        self.cutmix_prob = cutmix_prob
        self.mixup = Mixup(mixup_alpha)
        self.cutmix = Cutmix(cutmix_alpha)

    def __call__(self, batch):
        """Augment ``batch`` with cutmix (probability ``cutmix_prob``) or mixup."""
        use_cutmix = np.random.random() < self.cutmix_prob
        augment = self.cutmix if use_cutmix else self.mixup
        return augment(batch)
|
@ -0,0 +1,380 @@
|
||||
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import random
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
# import decord as de
|
||||
import copy
|
||||
import json
|
||||
from ..registry import PIPELINES
|
||||
|
||||
try:
|
||||
from paddlenlp.transformers import BertTokenizer
|
||||
except ImportError as e:
|
||||
print(
|
||||
f"Warning! {e}, [paddlenlp] package and it's dependencies is required for ActBERT."
|
||||
)
|
||||
|
||||
|
||||
@PIPELINES.register()
|
||||
class FeaturePadding(object):
    """
    Padding feature to target shape.

    Region features are zero-padded to ``max_region_num`` rows and action
    features to ``max_action_num`` rows.
    """
    def __init__(self, max_region_num=36, max_action_num=5):
        self.max_region_num = max_region_num
        self.max_action_num = max_action_num

    def __call__(self, results):
        """
        Padding feature.

        Unpacks the 11-item ``results['feature']`` tuple and returns a new
        dict holding the padded arrays, the caption, the counts and the
        tokenizer.
        """
        packed = results['feature']
        tokenizer = results['tokenizer']
        (region_feat_raw, region_target_raw, region_loc_raw, num_boxes,
         image_h, image_w, image_id, caption, action_feat_raw,
         action_target_raw, num_actions) = packed

        region_rows = self.max_region_num
        action_rows = self.max_action_num
        image_feature = np.zeros((region_rows, 2048), dtype=np.float32)
        image_target = np.zeros((region_rows, 1601), dtype=np.float32)
        image_location = np.zeros((region_rows, 5), dtype=np.float32)
        action_feature = np.zeros((action_rows, 2048), dtype=np.float32)
        action_target = np.zeros((action_rows, ), dtype=np.int64)

        num_boxes = int(num_boxes)
        image_feature[:num_boxes] = region_feat_raw
        image_target[:num_boxes] = region_target_raw
        image_location[:num_boxes, :4] = region_loc_raw

        # Column 4: box area as a fraction of the image area, computed from
        # the still-unnormalised corner coordinates.
        span_13 = image_location[:, 3] - image_location[:, 1]
        span_02 = image_location[:, 2] - image_location[:, 0]
        image_location[:, 4] = span_13 * span_02 / (float(image_w) *
                                                    float(image_h))

        # Columns 0 and 2 are scaled by the width, 1 and 3 by the height.
        for column, extent in ((0, image_w), (1, image_h), (2, image_w),
                               (3, image_h)):
            image_location[:, column] /= float(extent)

        num_actions = int(num_actions)
        action_feature[:num_actions] = action_feat_raw
        action_target[:num_actions] = action_target_raw

        return dict(image_feat=image_feature,
                    image_target=image_target,
                    caption=caption,
                    image_loc=image_location,
                    num_boxes=int(num_boxes),
                    action_feat=action_feature,
                    action_target=action_target,
                    num_actions=int(num_actions),
                    tokenizer=tokenizer)
|
||||
|
||||
|
||||
@PIPELINES.register()
|
||||
class RandomCap(object):
    """Keep the sample's own caption or swap in a random one, and label it."""
    def __init__(self, caption_path):
        """
        Random Caption for NSP task

        Args:
            caption_path: path of a JSON file whose top-level iterable yields
                the pool of candidate caption strings.
        """
        self.caption_path = caption_path

    def select_caption(self, caption):
        """Pick one of the ``'!'``-separated alternatives in ``caption``."""
        return random.choice(caption.split('!'))

    def get_random_caption(self, all_captions):
        """Pick a random entry of ``all_captions`` and one of its alternatives.

        Raises:
            ValueError: if ``all_captions`` is empty.
        """
        if not all_captions:
            raise ValueError('caption pool loaded from %s is empty' %
                             self.caption_path)
        return self.select_caption(random.choice(all_captions))

    def random_cap(self, caption, all_captions):
        """Return ``(caption, label)``.

        With probability 0.5 the caption is kept and ``label`` is 0;
        otherwise a random caption replaces it and ``label`` is 1.
        """
        if random.random() > 0.5:
            label = 0
        else:
            caption = self.get_random_caption(all_captions)
            label = 1
        return caption, label

    def __call__(self, results):
        """Rewrite ``results['caption']`` and set ``results['is_next']``."""
        caption = results['caption']
        # Context manager: the original left the file handle open.
        with open(self.caption_path, 'r') as caption_file:
            all_captions = list(json.load(caption_file))
        caption = self.select_caption(caption)
        caption, label = self.random_cap(caption, all_captions)
        results['caption'] = caption
        results['is_next'] = label
        return results
|
||||
|
||||
|
||||
@PIPELINES.register()
|
||||
class Tokenize(object):
    """Replace the caption string with its token list."""
    def __init__(self, ):
        """
        Tokenize caption
        """
        pass

    def __call__(self, results):
        """Tokenize ``results['caption']`` with ``results['tokenizer']`` in place."""
        text = results['caption']
        tok = results['tokenizer']
        results['caption'] = tok.tokenize(text)
        return results
|
||||
|
||||
|
||||
@PIPELINES.register()
|
||||
class RandomMask(object):
    """Randomly mask caption tokens, image regions and action features, then
    pad everything to fixed lengths and pack it into ``results['features']``.

    Args:
        max_seq_length(int): padded length of the token sequence, including
            the ``[CLS]`` and ``[SEP]`` markers.
        max_action_length(int): padded length of the action mask/labels.
        max_region_length(int): padded length of the region mask/labels.
    """
    def __init__(self,
                 max_seq_length=36,
                 max_action_length=5,
                 max_region_length=36):
        self.max_seq_length = max_seq_length
        self.max_action_length = max_action_length
        self.max_region_length = max_region_length

    def get_image_global_feature(self, image_feat, image_loc, image_mask):
        """Prepend a global entry to the region features, locations and mask.

        The global feature is the sum of ``image_feat`` over axis 0 divided
        by the sum of ``image_mask``; its location is ``[0, 0, 1, 1, 1]`` and
        its mask value is 1.
        """
        g_image_feat = np.sum(image_feat, axis=0) / np.sum(
            image_mask, axis=0, keepdims=True)
        image_feat = np.concatenate(
            [np.expand_dims(g_image_feat, axis=0), image_feat],
            axis=0).astype("float32")

        g_image_loc = np.array([0, 0, 1, 1, 1]).astype("float32")
        image_loc = np.concatenate(
            [np.expand_dims(g_image_loc, axis=0), image_loc], axis=0)

        g_image_mask = np.array([1])
        image_mask = np.concatenate([g_image_mask, image_mask], axis=0)

        return image_feat, image_loc, image_mask

    def _truncate_seq_pair(self, tokens_b, max_length):
        """Truncate ``tokens_b`` in place to at most ``max_length`` items.

        Items are dropped from the end. A negative ``max_length`` empties the
        list.
        """
        del tokens_b[max(max_length, 0):]

    def random_word(self, tokens, tokenizer):
        """
        Masking some random tokens for Language Model task with probabilities as in the original BERT paper.
        Args:
            tokens: list of str, tokenized sentence. Modified in place.
            tokenizer: Tokenizer, object used for tokenization (its ``vocab``
                and ``vocab_size`` are read here)
        Return:
            (list of str, list of int), masked tokens and related labels for LM prediction
        """
        output_label = []

        for i, token in enumerate(tokens):
            prob = random.random()
            # mask token with 15% probability
            if prob < 0.15:
                prob /= 0.15

                # 80% randomly change token to mask token
                if prob < 0.8:
                    tokens[i] = "[MASK]"

                # 10% randomly change token to random token
                elif prob < 0.9:
                    # randrange excludes vocab_size; the inclusive
                    # random.randint(0, vocab_size) could index one past
                    # the end of the vocabulary.
                    tok = tokenizer.vocab.idx_to_token[random.randrange(
                        tokenizer.vocab_size)]
                    tokens[i] = tok

                # rest 10% randomly keep current token
                # append current token to output (we will predict these later)
                try:
                    output_label.append(tokenizer.vocab[token])
                except KeyError:
                    # For unknown words (should not occur with BPE vocab)
                    output_label.append(tokenizer.vocab["[UNK]"])
                    print(
                        "Cannot find token '{}' in vocab. Using [UNK] insetad".
                        format(token))
            else:
                # no masking token (will be ignored by loss function later)
                output_label.append(-1)

        return tokens, output_label

    def random_region(self, image_feat, image_loc, num_boxes):
        """Zero out random region features in place and return their labels.

        Each of the first ``num_boxes`` regions is selected with probability
        0.15; 90% of the selected regions are zeroed. Selected regions get
        label 1, all others -1.
        """
        output_label = []

        for i in range(num_boxes):
            prob = random.random()
            # mask region with 15% probability
            if prob < 0.15:
                prob /= 0.15

                # 90% randomly zero the region feature
                if prob < 0.9:
                    image_feat[i] = 0

                # rest 10% randomly keep current region
                # mark the region as selected (we will predict these later)
                output_label.append(1)
            else:
                # no masking (will be ignored by loss function later)
                output_label.append(-1)

        return image_feat, image_loc, output_label

    def random_action(self, action_feat, action_target, num_actions):
        """Zero out random action features in place and return their labels.

        Each of the first ``num_actions`` actions is selected with
        probability 0.15; 90% of the selected ones are zeroed. Selected
        actions are labelled with ``action_target[i]``, all others with -1.
        """
        output_label = []

        for i in range(num_actions):
            prob = random.random()
            # mask action with 15% probability
            if prob < 0.15:
                prob /= 0.15

                # 90% randomly zero the action feature
                if prob < 0.9:
                    action_feat[i] = 0

                # rest 10% randomly keep current action
                # append current target to output (we will predict these later)
                output_label.append(action_target[i])
            else:
                # no masking (will be ignored by loss function later)
                output_label.append(-1)

        return action_feat, output_label

    def __call__(self, results):
        """Mask, pad and pack the sample; stores a 13-item list in
        ``results['features']`` and returns ``results``.

        ``results['caption']`` (a token list) is truncated and masked in
        place.
        """
        caption = results['caption']
        tokenizer = results['tokenizer']
        image_feat = results['image_feat']
        image_loc = results['image_loc']
        num_boxes = results['num_boxes']
        action_feat = results['action_feat']
        action_target = results['action_target']
        num_actions = results['num_actions']
        is_next = results['is_next']
        image_target = results['image_target']

        # Leave room for [CLS] and [SEP].
        self._truncate_seq_pair(caption, self.max_seq_length - 2)
        caption, caption_label = self.random_word(caption, tokenizer)

        image_feat, image_loc, image_label = self.random_region(
            image_feat, image_loc, num_boxes)
        action_feat, action_label = self.random_action(action_feat,
                                                       action_target,
                                                       num_actions)

        # concatenate lm labels and account for CLS, SEP
        lm_label_ids = [-1] + caption_label + [-1]

        # Single-sequence BERT convention:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids: 0     0   0   0  0     0 0
        tokens = ["[CLS]"] + list(caption) + ["[SEP]"]
        segment_ids = [0] * len(tokens)
        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real tokens are attended to.
        input_mask = [1] * (len(input_ids))
        image_mask = [1] * (num_boxes)
        action_mask = [1] * (num_actions)

        # Zero-pad up to the visual sequence length.
        while len(image_mask) < self.max_region_length:
            image_mask.append(0)
            image_label.append(-1)
        while len(action_mask) < self.max_action_length:
            action_mask.append(0)
            action_label.append(-1)

        # Zero-pad up to the sequence length.
        while len(input_ids) < self.max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)
            lm_label_ids.append(-1)

        assert len(input_ids) == self.max_seq_length
        assert len(input_mask) == self.max_seq_length
        assert len(segment_ids) == self.max_seq_length
        assert len(lm_label_ids) == self.max_seq_length
        assert len(image_mask) == self.max_region_length
        assert len(image_label) == self.max_region_length
        assert len(action_mask) == self.max_action_length
        assert len(action_label) == self.max_action_length

        image_feat, image_loc, image_mask = self.get_image_global_feature(
            image_feat, image_loc, np.array(image_mask))
        features = [
            np.array(input_ids),
            action_feat,
            image_feat,
            image_loc,
            np.array(segment_ids),
            np.array(input_mask),
            image_mask,
            np.array(action_mask),
            np.array(lm_label_ids),
            np.array(action_label),
            np.array(is_next),
            np.array(image_label),
            image_target,
        ]
        results['features'] = features
        return results
|
@ -0,0 +1,382 @@
|
||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
import random
|
||||
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
try:
|
||||
import SimpleITK as sitk
|
||||
except ImportError as e:
|
||||
print(
|
||||
f"Warning! {e}, [SimpleITK] package and it's dependencies is required for PP-Care."
|
||||
)
|
||||
import cv2
|
||||
|
||||
from ..registry import PIPELINES
|
||||
|
||||
try:
|
||||
import cPickle as pickle
|
||||
from cStringIO import StringIO
|
||||
except ImportError:
|
||||
import pickle
|
||||
from io import BytesIO
|
||||
|
||||
|
||||
@PIPELINES.register()
|
||||
class Sampler(object):
|
||||
"""
|
||||
Sample frames id.
|
||||
NOTE: Use PIL to read image here, has diff with CV2
|
||||
Args:
|
||||
num_seg(int): number of segments.
|
||||
seg_len(int): number of sampled frames in each segment.
|
||||
valid_mode(bool): True or False.
|
||||
select_left: Whether to select the frame to the left in the middle when the sampling interval is even in the test mode.
|
||||
Returns:
|
||||
frames_idx: the index of sampled #frames.
|
||||
"""
|
||||
def __init__(self,
|
||||
num_seg,
|
||||
seg_len,
|
||||
frame_interval=None,
|
||||
valid_mode=False,
|
||||
select_left=False,
|
||||
dense_sample=False,
|
||||
linspace_sample=False,
|
||||
use_pil=True):
|
||||
self.num_seg = num_seg
|
||||
self.seg_len = seg_len
|
||||
self.frame_interval = frame_interval
|
||||
self.valid_mode = valid_mode
|
||||
self.select_left = select_left
|
||||
self.dense_sample = dense_sample
|
||||
self.linspace_sample = linspace_sample
|
||||
self.use_pil = use_pil
|
||||
|
||||
def _get(self, frames_idx, results):
|
||||
data_format = results['format']
|
||||
|
||||
if data_format == "frame":
|
||||
frame_dir = results['frame_dir']
|
||||
imgs = []
|
||||
for idx in frames_idx:
|
||||
img = Image.open(
|
||||
os.path.join(frame_dir,
|
||||
results['suffix'].format(idx))).convert('RGB')
|
||||
imgs.append(img)
|
||||
|
||||
elif data_format == "MRI":
|
||||
frame_dir = results['frame_dir']
|
||||
imgs = []
|
||||
MRI = sitk.GetArrayFromImage(sitk.ReadImage(frame_dir))
|
||||
for idx in frames_idx:
|
||||
item = MRI[idx]
|
||||
item = cv2.resize(item, (224, 224))
|
||||
imgs.append(item)
|
||||
|
||||
elif data_format == "video":
|
||||
if results['backend'] == 'cv2':
|
||||
frames = np.array(results['frames'])
|
||||
imgs = []
|
||||
for idx in frames_idx:
|
||||
imgbuf = frames[idx]
|
||||
img = Image.fromarray(imgbuf, mode='RGB')
|
||||
imgs.append(img)
|
||||
elif results['backend'] == 'decord':
|
||||
container = results['frames']
|
||||
if self.use_pil:
|
||||
frames_select = container.get_batch(frames_idx)
|
||||
# dearray_to_img
|
||||
np_frames = frames_select.asnumpy()
|
||||
imgs = []
|
||||
for i in range(np_frames.shape[0]):
|
||||
imgbuf = np_frames[i]
|
||||
imgs.append(Image.fromarray(imgbuf, mode='RGB'))
|
||||
else:
|
||||
if frames_idx.ndim != 1:
|
||||
frames_idx = np.squeeze(frames_idx)
|
||||
frame_dict = {
|
||||
idx: container[idx].asnumpy()
|
||||
for idx in np.unique(frames_idx)
|
||||
}
|
||||
imgs = [frame_dict[idx] for idx in frames_idx]
|
||||
elif results['backend'] == 'pyav':
|
||||
imgs = []
|
||||
frames = np.array(results['frames'])
|
||||
for idx in frames_idx:
|
||||
if self.dense_sample:
|
||||
idx = idx - 1
|
||||
imgbuf = frames[idx]
|
||||
imgs.append(imgbuf)
|
||||
imgs = np.stack(imgs) # thwc
|
||||
else:
|
||||
raise NotImplementedError
|
||||
else:
|
||||
raise NotImplementedError
|
||||
results['imgs'] = imgs
|
||||
return results
|
||||
|
||||
def _get_train_clips(self, num_frames):
|
||||
ori_seg_len = self.seg_len * self.frame_interval
|
||||
avg_interval = (num_frames - ori_seg_len + 1) // self.num_seg
|
||||
|
||||
if avg_interval > 0:
|
||||
base_offsets = np.arange(self.num_seg) * avg_interval
|
||||
clip_offsets = base_offsets + np.random.randint(avg_interval,
|
||||
size=self.num_seg)
|
||||
elif num_frames > max(self.num_seg, ori_seg_len):
|
||||
clip_offsets = np.sort(
|
||||
np.random.randint(num_frames - ori_seg_len + 1,
|
||||
size=self.num_seg))
|
||||
elif avg_interval == 0:
|
||||
ratio = (num_frames - ori_seg_len + 1.0) / self.num_seg
|
||||
clip_offsets = np.around(np.arange(self.num_seg) * ratio)
|
||||
else:
|
||||
clip_offsets = np.zeros((self.num_seg, ), dtype=np.int)
|
||||
return clip_offsets
|
||||
|
||||
def _get_test_clips(self, num_frames):
|
||||
ori_seg_len = self.seg_len * self.frame_interval
|
||||
avg_interval = (num_frames - ori_seg_len + 1) / float(self.num_seg)
|
||||
if num_frames > ori_seg_len - 1:
|
||||
base_offsets = np.arange(self.num_seg) * avg_interval
|
||||
clip_offsets = (base_offsets + avg_interval / 2.0).astype(np.int)
|
||||
else:
|
||||
clip_offsets = np.zeros((self.num_seg, ), dtype=np.int)
|
||||
return clip_offsets
|
||||
|
||||
def __call__(self, results):
    """Compute the frame indices to sample and load them via ``self._get``.

    Args:
        results (dict): must provide ``'frames_len'`` and ``'format'``
            (``'video'``, ``'frame'`` or, on some paths, ``'MRI'``).
            ``'start_idx'``/``'end_idx'`` are used by linspace sampling
            when both are present.

    Returns:
        Whatever ``self._get(frames_idx, results)`` returns.

    Raises:
        NotImplementedError: if ``results['format']`` is not handled by the
            sampling strategy in use.
    """
    frames_len = int(results['frames_len'])
    frames_idx = []
    # Strategy 1: clip sampling with a fixed frame interval.
    if self.frame_interval is not None:
        assert isinstance(self.frame_interval, int)
        if not self.valid_mode:
            offsets = self._get_train_clips(frames_len)
        else:
            offsets = self._get_test_clips(frames_len)

        # Expand every clip start into seg_len indices spaced frame_interval
        # apart, then flatten.
        offsets = offsets[:, None] + np.arange(
            self.seg_len)[None, :] * self.frame_interval
        offsets = np.concatenate(offsets)

        # Wrap indices that run past the end of the video.
        offsets = offsets.reshape((-1, self.seg_len))
        offsets = np.mod(offsets, frames_len)
        offsets = np.concatenate(offsets)

        if results['format'] == 'video':
            frames_idx = offsets
        elif results['format'] == 'frame':
            # 'frame' indices are shifted by one (presumably 1-based file
            # names -- confirm against the frame loader).
            frames_idx = list(offsets + 1)
        else:
            raise NotImplementedError

        return self._get(frames_idx, results)

    # Strategy 2: num_seg evenly spaced indices over the (sub)range.
    if self.linspace_sample:
        if 'start_idx' in results and 'end_idx' in results:
            offsets = np.linspace(results['start_idx'], results['end_idx'],
                                  self.num_seg)
        else:
            offsets = np.linspace(0, frames_len - 1, self.num_seg)
        offsets = np.clip(offsets, 0, frames_len - 1).astype(np.int64)
        if results['format'] == 'video':
            frames_idx = list(offsets)
            frames_idx = [x % frames_len for x in frames_idx]
        elif results['format'] == 'frame':
            frames_idx = list(offsets + 1)

        elif results['format'] == 'MRI':
            frames_idx = list(offsets)

        else:
            raise NotImplementedError
        return self._get(frames_idx, results)

    # Strategy 3: segment based sampling; average_dur is the segment length.
    average_dur = int(frames_len / self.num_seg)
    if not self.select_left:
        if self.dense_sample:  # For ppTSM
            # Dense sampling picks num_seg frames with a fixed stride from a
            # 64-frame window; the resulting indices are 1-based.
            if not self.valid_mode:  # train
                sample_pos = max(1, 1 + frames_len - 64)
                t_stride = 64 // self.num_seg
                # np.random.randint excludes the upper bound.
                start_idx = 0 if sample_pos == 1 else np.random.randint(
                    0, sample_pos - 1)
                offsets = [(idx * t_stride + start_idx) % frames_len + 1
                           for idx in range(self.num_seg)]
                frames_idx = offsets
            else:
                sample_pos = max(1, 1 + frames_len - 64)
                t_stride = 64 // self.num_seg
                # Validation uses 10 evenly spaced window starts, so
                # 10 * num_seg indices are produced.
                start_list = np.linspace(0,
                                         sample_pos - 1,
                                         num=10,
                                         dtype=int)
                offsets = []
                for start_idx in start_list.tolist():
                    offsets += [
                        (idx * t_stride + start_idx) % frames_len + 1
                        for idx in range(self.num_seg)
                    ]
                frames_idx = offsets
        else:
            for i in range(self.num_seg):
                idx = 0
                if not self.valid_mode:
                    # Train: random start inside the segment when it is long
                    # enough, otherwise the segment start (or just i).
                    if average_dur >= self.seg_len:
                        idx = random.randint(0, average_dur - self.seg_len)
                        idx += i * average_dur
                    elif average_dur >= 1:
                        idx += i * average_dur
                    else:
                        idx = i
                else:
                    # Valid: deterministic start near the segment centre.
                    if average_dur >= self.seg_len:
                        idx = (average_dur - 1) // 2
                        idx += i * average_dur
                    elif average_dur >= 1:
                        idx += i * average_dur
                    else:
                        idx = i
                for jj in range(idx, idx + self.seg_len):
                    if results['format'] == 'video':
                        frames_idx.append(int(jj % frames_len))
                    elif results['format'] == 'frame':
                        frames_idx.append(jj + 1)

                    elif results['format'] == 'MRI':
                        frames_idx.append(jj)
                    else:
                        raise NotImplementedError
        return self._get(frames_idx, results)

    else:  # for TSM
        if not self.valid_mode:
            if average_dur > 0:
                # One random offset inside each segment.
                offsets = np.multiply(list(range(self.num_seg)),
                                      average_dur) + np.random.randint(
                                          average_dur, size=self.num_seg)
            elif frames_len > self.num_seg:
                offsets = np.sort(
                    np.random.randint(frames_len, size=self.num_seg))
            else:
                # NOTE(review): np.zeros defaults to float64, so these
                # offsets are floats -- confirm the loader accepts them.
                offsets = np.zeros(shape=(self.num_seg, ))
        else:
            if frames_len > self.num_seg:
                # Centre of each (fractional) segment, truncated.
                average_dur_float = frames_len / self.num_seg
                offsets = np.array([
                    int(average_dur_float / 2.0 + average_dur_float * x)
                    for x in range(self.num_seg)
                ])
            else:
                offsets = np.zeros(shape=(self.num_seg, ))

        if results['format'] == 'video':
            frames_idx = list(offsets)
            frames_idx = [x % frames_len for x in frames_idx]
        elif results['format'] == 'frame':
            frames_idx = list(offsets + 1)

        elif results['format'] == 'MRI':
            frames_idx = list(offsets)

        else:
            raise NotImplementedError

        return self._get(frames_idx, results)
|
||||
|
||||
|
||||
@PIPELINES.register()
class SamplerPkl(object):
    """Sample and decode frames stored inside a pickle file.

    NOTE: Use PIL to read image here, has diff with CV2

    Args:
        num_seg(int): number of segments.
        seg_len(int): number of sampled frames in each segment.
        backend(str): ``'pillow'`` keeps frames as PIL images; any other
            value converts every frame to a numpy array.
        valid_mode(bool): False samples randomly (train), True samples
            deterministically (valid).
    """
    def __init__(self, num_seg, seg_len, backend='pillow', valid_mode=False):
        self.num_seg = num_seg
        self.seg_len = seg_len
        self.valid_mode = valid_mode
        self.backend = backend

    def _get(self, buf):
        """Decode one encoded image buffer into an RGB image."""
        if isinstance(buf, str):
            img = Image.open(StringIO(buf))
        else:
            img = Image.open(BytesIO(buf))
        img = img.convert('RGB')
        if self.backend != 'pillow':
            img = np.array(img)
        return img

    def __call__(self, results):
        """Load the pickle named by ``results['frame_dir']`` and sample it.

        The pickle must unpack to ``(video_name, label, frames)``. Writes
        ``'labels'``, ``'frames_len'``, ``'backend'`` and ``'imgs'`` into
        ``results`` and returns it.
        """
        filename = results['frame_dir']
        # SECURITY: pickle.load executes arbitrary code from the file; only
        # use this op on trusted data.
        # The context manager closes the handle even if unpickling fails.
        with open(filename, 'rb') as f:
            data_loaded = pickle.load(f, encoding='bytes')
        video_name, label, frames = data_loaded
        if isinstance(label, dict):
            # The key is Chinese for "action type".
            label = label['动作类型']
            results['labels'] = label
        elif len(label) == 1:
            results['labels'] = int(label[0])
        else:
            # With several labels, pick one of the first two at random.
            results['labels'] = int(label[0]) if random.random() < 0.5 else int(
                label[1])
        results['frames_len'] = len(frames)
        frames_len = results['frames_len']
        average_dur = int(int(frames_len) / self.num_seg)
        imgs = []
        for i in range(self.num_seg):
            idx = 0
            if not self.valid_mode:
                # Train: random start inside the segment when it is long
                # enough, otherwise the segment start (or just i).
                if average_dur >= self.seg_len:
                    idx = random.randint(0, average_dur - self.seg_len)
                    idx += i * average_dur
                elif average_dur >= 1:
                    idx += i * average_dur
                else:
                    idx = i
            else:
                # Valid: deterministic start near the segment centre.
                if average_dur >= self.seg_len:
                    idx = (average_dur - 1) // 2
                    idx += i * average_dur
                elif average_dur >= 1:
                    idx += i * average_dur
                else:
                    idx = i

            for jj in range(idx, idx + self.seg_len):
                # Wrap around so short videos still yield seg_len frames.
                imgbuf = frames[int(jj % results['frames_len'])]
                img = self._get(imgbuf)
                imgs.append(img)
        results['backend'] = self.backend
        results['imgs'] = imgs

        return results
|
@ -0,0 +1,375 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import random
|
||||
from PIL import Image
|
||||
from ..registry import PIPELINES
|
||||
import os
|
||||
import numpy as np
|
||||
import io
|
||||
import os.path as osp
|
||||
from abc import ABCMeta, abstractmethod
|
||||
import cv2
|
||||
from cv2 import IMREAD_COLOR, IMREAD_GRAYSCALE, IMREAD_UNCHANGED
|
||||
import inspect
|
||||
|
||||
# Name of the image decoding backend used by this module.
imread_backend = 'cv2'
# Maps the string flags accepted by the decoding helpers to the OpenCV
# imread constants imported above.
imread_flags = {
    'color': IMREAD_COLOR,
    'grayscale': IMREAD_GRAYSCALE,
    'unchanged': IMREAD_UNCHANGED
}
|
||||
|
||||
|
||||
@PIPELINES.register()
class SampleFrames:
    """Sample frames from the video.

    Args:
        clip_len (int): number of frames in each clip.
        frame_interval (int): distance between two sampled frames.
        num_clips (int): number of clips to sample.
        temporal_jitter (bool): add a random per-frame offset in
            ``[0, frame_interval)``.
        twice_sample (bool): in test mode, additionally return clips that
            start at the interval boundaries.
        out_of_bound_opt (str): ``'loop'`` wraps indices past the end,
            ``'repeat_last'`` replaces them with the last valid index.
        test_mode (bool): use deterministic sampling.
    """

    def __init__(self,
                 clip_len,
                 frame_interval=1,
                 num_clips=1,
                 temporal_jitter=False,
                 twice_sample=False,
                 out_of_bound_opt='loop',
                 test_mode=False):
        self.clip_len = clip_len
        self.frame_interval = frame_interval
        self.num_clips = num_clips
        self.temporal_jitter = temporal_jitter
        self.twice_sample = twice_sample
        self.out_of_bound_opt = out_of_bound_opt
        self.test_mode = test_mode
        assert self.out_of_bound_opt in ['loop', 'repeat_last']

    def _get_train_clips(self, num_frames):
        """Get clip offsets in train mode. """
        ori_clip_len = self.clip_len * self.frame_interval
        avg_interval = (num_frames - ori_clip_len + 1) // self.num_clips
        if avg_interval > 0:
            # One random start inside each equally sized interval.
            base_offsets = np.arange(self.num_clips) * avg_interval
            clip_offsets = base_offsets + np.random.randint(
                avg_interval, size=self.num_clips)
        elif num_frames > max(self.num_clips, ori_clip_len):
            clip_offsets = np.sort(
                np.random.randint(
                    num_frames - ori_clip_len + 1, size=self.num_clips))
        elif avg_interval == 0:
            ratio = (num_frames - ori_clip_len + 1.0) / self.num_clips
            clip_offsets = np.around(np.arange(self.num_clips) * ratio)
        else:
            # ``np.int`` was removed in NumPy 1.24; the builtin ``int`` is
            # the exact equivalent.
            clip_offsets = np.zeros((self.num_clips, ), dtype=int)
        return clip_offsets

    def _get_test_clips(self, num_frames):
        """Get clip offsets in test mode. """
        ori_clip_len = self.clip_len * self.frame_interval
        avg_interval = (num_frames - ori_clip_len + 1) / float(self.num_clips)
        if num_frames > ori_clip_len - 1:
            base_offsets = np.arange(self.num_clips) * avg_interval
            clip_offsets = (base_offsets + avg_interval / 2.0).astype(int)
            if self.twice_sample:
                clip_offsets = np.concatenate([clip_offsets, base_offsets])
        else:
            clip_offsets = np.zeros((self.num_clips, ), dtype=int)
        return clip_offsets

    def _sample_clips(self, num_frames):
        """Choose clip offsets for the video in a given mode. """
        if self.test_mode:
            clip_offsets = self._get_test_clips(num_frames)
        else:
            clip_offsets = self._get_train_clips(num_frames)
        return clip_offsets

    def __call__(self, results):
        """Perform the SampleFrames loading.

        Reads ``results['total_frames']`` and ``results['start_index']``;
        writes ``'frame_inds'``, ``'clip_len'``, ``'frame_interval'`` and
        ``'num_clips'`` and returns ``results``.
        """
        total_frames = results['total_frames']
        clip_offsets = self._sample_clips(total_frames)
        # Expand every clip start into clip_len indices, then flatten.
        frame_inds = clip_offsets[:, None] + np.arange(
            self.clip_len)[None, :] * self.frame_interval
        frame_inds = np.concatenate(frame_inds)
        if self.temporal_jitter:
            perframe_offsets = np.random.randint(
                self.frame_interval, size=len(frame_inds))
            frame_inds += perframe_offsets
        frame_inds = frame_inds.reshape((-1, self.clip_len))
        if self.out_of_bound_opt == 'loop':
            frame_inds = np.mod(frame_inds, total_frames)
        elif self.out_of_bound_opt == 'repeat_last':
            # Replace out-of-range indices with the largest valid index of
            # the same clip.
            safe_inds = frame_inds < total_frames
            unsafe_inds = 1 - safe_inds
            last_ind = np.max(safe_inds * frame_inds, axis=1)
            new_inds = (safe_inds * frame_inds + (unsafe_inds.T * last_ind).T)
            frame_inds = new_inds
        else:
            raise ValueError('Illegal out_of_bound option.')
        start_index = results['start_index']
        frame_inds = np.concatenate(frame_inds) + start_index
        results['frame_inds'] = frame_inds.astype(int)
        results['clip_len'] = self.clip_len
        results['frame_interval'] = self.frame_interval
        results['num_clips'] = self.num_clips
        return results

    def __repr__(self):
        repr_str = (f'{self.__class__.__name__}('
                    f'clip_len={self.clip_len}, '
                    f'frame_interval={self.frame_interval}, '
                    f'num_clips={self.num_clips}, '
                    f'temporal_jitter={self.temporal_jitter}, '
                    f'twice_sample={self.twice_sample}, '
                    f'out_of_bound_opt={self.out_of_bound_opt}, '
                    f'test_mode={self.test_mode})')
        return repr_str
|
||||
|
||||
class BaseStorageBackend(metaclass=ABCMeta):
    """Interface every storage backend has to implement. """

    @abstractmethod
    def get(self, filepath):
        """Return the content stored at ``filepath``."""

    @abstractmethod
    def get_text(self, filepath):
        """Return the content stored at ``filepath`` as text."""
|
||||
|
||||
class HardDiskBackend(BaseStorageBackend):
    """Storage backend that reads files from the local file system."""

    def get(self, filepath):
        """Return the raw bytes of the file at ``filepath``."""
        with open(str(filepath), 'rb') as stream:
            return stream.read()

    def get_text(self, filepath):
        """Return the content of the file at ``filepath`` as ``str``."""
        with open(str(filepath), 'r') as stream:
            return stream.read()
|
||||
|
||||
class FileClient:
    """Facade that forwards file reads to a named storage backend. """

    # Backend name -> backend class; extended through ``register_backend``.
    _backends = {
        'disk': HardDiskBackend,
    }

    def __init__(self, backend='disk', **kwargs):
        """Instantiate the backend registered as ``backend`` with ``kwargs``."""
        if backend not in self._backends:
            raise ValueError(
                f'Backend {backend} is not supported. Currently supported ones'
                f' are {list(self._backends.keys())}')
        self.backend = backend
        backend_cls = self._backends[backend]
        self.client = backend_cls(**kwargs)

    @classmethod
    def _register_backend(cls, name, backend, force=False):
        """Validate ``backend`` and store it under ``name``."""
        if not isinstance(name, str):
            raise TypeError('the backend name should be a string, '
                            f'but got {type(name)}')
        if not inspect.isclass(backend):
            raise TypeError(
                f'backend should be a class but got {type(backend)}')
        if not issubclass(backend, BaseStorageBackend):
            raise TypeError(
                f'backend {backend} is not a subclass of BaseStorageBackend')
        already_known = name in cls._backends
        if already_known and not force:
            raise KeyError(
                f'{name} is already registered as a storage backend, '
                'add "force=True" if you want to override it')

        cls._backends[name] = backend

    @classmethod
    def register_backend(cls, name, backend=None, force=False):
        """Register a backend to FileClient.

        Called with ``backend`` it registers immediately and returns None;
        called without it, it returns a class decorator that registers the
        decorated class.
        """
        if backend is None:

            def _register(backend_cls):
                cls._register_backend(name, backend_cls, force=force)
                return backend_cls

            return _register

        cls._register_backend(name, backend, force=force)
        return

    def get(self, filepath):
        """Delegate to the backend's ``get``."""
        return self.client.get(filepath)

    def get_text(self, filepath):
        """Delegate to the backend's ``get_text``."""
        return self.client.get_text(filepath)
|
||||
|
||||
@PIPELINES.register()
class RawFrameDecode:
    """Read the frame files selected by ``frame_inds`` and decode them. """

    def __init__(self, io_backend='disk', decoding_backend='cv2', **kwargs):
        self.io_backend = io_backend
        self.decoding_backend = decoding_backend
        self.kwargs = kwargs
        # Created lazily on the first call.
        self.file_client = None

    def _pillow2array(self, img, flag='color', channel_order='bgr'):
        """Turn a pillow image into a numpy array.

        ``flag`` is ``'color'``, ``'grayscale'`` or ``'unchanged'``;
        ``channel_order`` is ``'rgb'`` or ``'bgr'`` (case-insensitive).
        Raises ValueError for any other value.
        """
        order = channel_order.lower()
        if order not in ['rgb', 'bgr']:
            raise ValueError('channel order must be either "rgb" or "bgr"')

        if flag == 'unchanged':
            pixels = np.array(img)
            has_color = pixels.ndim >= 3 and pixels.shape[2] >= 3
            if has_color:
                # Swap the first three channels: RGB -> BGR.
                pixels[:, :, :3] = pixels[:, :, (2, 1, 0)]
            return pixels

        if img.mode == 'LA':
            # The default conversion of 'LA' paints the canvas black, which
            # can hide black foreground objects, so the image is pasted
            # onto a canvas filled with the colour (124, 117, 104) instead.
            with_alpha = img.convert('RGBA')
            img = Image.new('RGB', with_alpha.size, (124, 117, 104))
            img.paste(with_alpha, mask=with_alpha.split()[3])  # 3 is alpha
        elif img.mode != 'RGB':
            img = img.convert('RGB')

        if flag == 'color':
            pixels = np.array(img)
            if order != 'rgb':
                pixels = pixels[:, :, ::-1]  # RGB to BGR
            return pixels
        if flag == 'grayscale':
            return np.array(img.convert('L'))
        raise ValueError(
            'flag must be "color", "grayscale" or "unchanged", '
            f'but got {flag}')

    def _imfrombytes(self, content, flag='color', channel_order='bgr'):
        """Decode an encoded image held in ``content`` with OpenCV."""
        raw = np.frombuffer(content, np.uint8)
        if isinstance(flag, str):
            flag = imread_flags[flag]
        decoded = cv2.imdecode(raw, flag)
        wants_rgb = flag == IMREAD_COLOR and channel_order == 'rgb'
        if wants_rgb:
            # Third argument is the destination: convert in place.
            cv2.cvtColor(decoded, cv2.COLOR_BGR2RGB, decoded)
        return decoded

    def __call__(self, results):
        """Perform the ``RawFrameDecode`` to pick frames given indices.

        Args:
            results (dict): The resulting dict to be modified and passed
                to the next transform in pipeline.
        """
        directory = results['frame_dir']
        suffix = results['suffix']

        if self.file_client is None:
            self.file_client = FileClient(self.io_backend, **self.kwargs)

        if results['frame_inds'].ndim != 1:
            results['frame_inds'] = np.squeeze(results['frame_inds'])

        offset = results.get('offset', 0)

        imgs = list()
        for raw_idx in results['frame_inds']:
            frame_path = osp.join(directory, suffix.format(raw_idx + offset))
            # Read the image file as raw bytes.
            encoded = self.file_client.get(frame_path)
            # Get frame with channel order RGB directly.
            imgs.append(self._imfrombytes(encoded, channel_order='rgb'))

        results['imgs'] = imgs
        first_shape = imgs[0].shape[:2]
        results['original_shape'] = first_shape
        results['img_shape'] = first_shape

        # we resize the gt_bboxes and proposals to their real scale
        h, w = results['img_shape']
        scale_factor = np.array([w, h, w, h])
        if 'gt_bboxes' in results:
            results['gt_bboxes'] = (results['gt_bboxes'] *
                                    scale_factor).astype(np.float32)
        if 'proposals' in results and results['proposals'] is not None:
            results['proposals'] = (results['proposals'] *
                                    scale_factor).astype(np.float32)
        return results

    def __repr__(self):
        return (f'{self.__class__.__name__}('
                f'io_backend={self.io_backend}, '
                f'decoding_backend={self.decoding_backend})')
|
||||
|
||||
@PIPELINES.register()
class SampleAVAFrames(SampleFrames):
    """Sample one clip of frames centred on an annotated timestamp.

    Args:
        clip_len (int): number of frames in the clip.
        frame_interval (int): distance between two sampled frames.
        test_mode (bool): stored on the instance; not consulted by the
            sampling code in this class.
    """

    def __init__(self, clip_len, frame_interval=2, test_mode=False):

        super().__init__(clip_len, frame_interval, test_mode=test_mode)

    def _get_clips(self, center_index, skip_offsets, shot_info):
        """Return jittered frame indices around ``center_index``.

        Indices are clipped to ``[shot_info[0], shot_info[1] - 1]``.
        """
        start = center_index - (self.clip_len // 2) * self.frame_interval
        end = center_index + ((self.clip_len + 1) // 2) * self.frame_interval
        frame_inds = list(range(start, end, self.frame_interval))
        # list + ndarray is evaluated by numpy as an element-wise sum.
        frame_inds = frame_inds + skip_offsets
        frame_inds = np.clip(frame_inds, shot_info[0], shot_info[1] - 1)

        return frame_inds

    def __call__(self, results):
        """Write ``'frame_inds'`` and clip metadata into ``results``.

        Reads ``'fps'``, ``'timestamp'``, ``'timestamp_start'`` and
        ``'shot_info'`` from ``results``.
        """
        fps = results['fps']
        timestamp = results['timestamp']
        timestamp_start = results['timestamp_start']
        shot_info = results['shot_info']

        # delta = (timestamp - timestamp_start) is how many seconds this
        # frame is from the start of the 15-minute video.
        # center_index = fps * delta is how many frames that is.
        # The "+ 1" is possibly there to avoid negative indices during the
        # later sampling (the original author was unsure as well).
        # Frames are then sampled before and after center_index.
        center_index = fps * (timestamp - timestamp_start) + 1

        skip_offsets = np.random.randint(
            -self.frame_interval // 2, (self.frame_interval + 1) // 2,
            size=self.clip_len)
        frame_inds = self._get_clips(center_index, skip_offsets, shot_info)

        # ``np.int`` was removed in NumPy 1.24; the builtin ``int`` is the
        # exact equivalent.
        results['frame_inds'] = np.array(frame_inds, dtype=int)
        results['clip_len'] = self.clip_len
        results['frame_interval'] = self.frame_interval
        results['num_clips'] = 1
        results['crop_quadruple'] = np.array([0, 0, 1, 1], dtype=np.float32)
        return results

    def __repr__(self):
        repr_str = (f'{self.__class__.__name__}('
                    f'clip_len={self.clip_len}, '
                    f'frame_interval={self.frame_interval}, '
                    f'test_mode={self.test_mode})')
        return repr_str
|
||||
|
@ -0,0 +1,69 @@
|
||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
import random
|
||||
|
||||
from PIL import Image
|
||||
|
||||
from ..registry import PIPELINES
|
||||
|
||||
|
||||
@PIPELINES.register()
class SamplerUCF24(object):
    """
    Load the clip of frames that ends at a key frame.
    NOTE: Use PIL to read image here, has diff with CV2
    Args:
        num_frames(int): The amount of frames used in a video
        frame_interval(int): Sampling rate
        valid_mode(bool): True or False.
    Returns:
        results: the input dict with the loaded frames under 'imgs'.
    """
    def __init__(self,
                 num_frames=16,
                 frame_interval=1,
                 valid_mode=False):
        self.num_frames = num_frames
        # NOTE(review): outside valid mode the argument is ignored and a
        # random interval is drawn once per instance, not per sample --
        # confirm this is intended.
        if valid_mode:
            self.frame_interval = frame_interval
        else:
            self.frame_interval = random.randint(1, 2)
        self.valid_mode = valid_mode

    def _get(self, frames_idxs, img_folder, results):
        """Open every indexed '<idx as 5 digits>.jpg' file as an RGB image."""
        results['imgs'] = [
            Image.open(os.path.join(img_folder,
                                    '{:05d}.jpg'.format(frame_no))).convert('RGB')
            for frame_no in frames_idxs
        ]
        return results

    def _make_clip(self, im_ind, max_num):
        """Indices of ``num_frames`` frames ending at ``im_ind``.

        Frames are ``frame_interval`` apart, listed oldest first; indices
        below 1 become 1 and indices above ``max_num`` become ``max_num``.
        """
        clip = []
        for step in range(self.num_frames - 1, -1, -1):
            candidate = im_ind - step * self.frame_interval
            clip.append(1 if candidate < 1 else min(candidate, max_num))
        return clip

    def __call__(self, results):
        """Build the clip for the key frame named by ``results['filename']``."""
        img_folder, key_frame = os.path.split(results['filename'])
        # Every directory entry is counted as a frame.
        total = len(os.listdir(img_folder))
        # The first five characters of the file name are the frame number.
        key_idx = int(key_frame[0:5])
        return self._get(self._make_clip(key_idx, total), img_folder, results)
|
@ -0,0 +1,130 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
import copy
|
||||
import cv2
|
||||
from ..registry import PIPELINES
|
||||
|
||||
|
||||
@PIPELINES.register()
class MultiRestrictSize(object):
    """Produce one resized (and optionally flipped) copy of a sample per scale.

    Args:
        min_size (int | None): if set, the short edge is shrunk to this
            size when it is larger.
        max_size (int | None): used when ``min_size`` is None; the long
            edge is shrunk to this size when it is larger.
        flip (bool): also append a horizontally flipped copy per scale.
        multi_scale (list[float] | None): extra scale factors; ``None``
            means ``[1.3]``.
    """
    def __init__(self,
                 min_size=None,
                 max_size=800,
                 flip=False,
                 multi_scale=None):
        self.min_size = min_size
        self.max_size = max_size
        # A fresh list per instance instead of a shared mutable default.
        self.multi_scale = [1.3] if multi_scale is None else multi_scale
        self.flip = flip
        assert ((min_size is None)) or ((max_size is None))

    def __call__(self, sample):
        """Return the list of rescaled/flipped variants of ``sample``.

        ``sample['current_img']`` supplies the reference height and width.
        Keys containing ``'meta'`` or ``'label'`` are passed through the
        resize untouched; every other value is resized with cv2.
        """
        samples = []
        image = sample['current_img']
        h, w = image.shape[:2]
        for scale in self.multi_scale:
            # Fixed range of scales
            sc = None
            if self.min_size is not None:
                # Align short edge
                short_edge = min(h, w)
                if short_edge > self.min_size:
                    sc = float(self.min_size) / short_edge
            elif self.max_size is not None:
                # Align long edge. The None guard avoids comparing an int
                # with None when neither limit is configured.
                long_edge = max(h, w)
                if long_edge > self.max_size:
                    sc = float(self.max_size) / long_edge

            if sc is None:
                new_h = h
                new_w = w
            else:
                new_h = sc * h
                new_w = sc * w
            new_h = int(new_h * scale)
            new_w = int(new_w * scale)

            # Snap both sides to the nearest value of the form 16 * k + 1.
            if (new_h - 1) % 16 != 0:
                new_h = int(np.around((new_h - 1) / 16.) * 16 + 1)
            if (new_w - 1) % 16 != 0:
                new_w = int(np.around((new_w - 1) / 16.) * 16 + 1)

            if new_h == h and new_w == w:
                samples.append(sample)
            else:
                new_sample = {}
                for elem in sample.keys():
                    if 'meta' in elem or 'label' in elem:
                        new_sample[elem] = sample[elem]
                        continue
                    new_sample[elem] = cv2.resize(
                        sample[elem],
                        dsize=(new_w, new_h),
                        interpolation=cv2.INTER_CUBIC)
                samples.append(new_sample)

            if self.flip:
                # Mirror the variant that was just appended.
                now_sample = samples[-1]
                new_sample = {}
                for elem in now_sample.keys():
                    if 'meta' in elem:
                        new_sample[elem] = now_sample[elem].copy()
                        new_sample[elem]['flip'] = True
                        continue
                    tmp = now_sample[elem]
                    tmp = tmp[:, ::-1].copy()
                    new_sample[elem] = tmp
                samples.append(new_sample)

        return samples
|
||||
|
||||
|
||||
@PIPELINES.register()
class MultiNorm(object):
    """Normalise every array of every sample and move channels first."""
    def __call__(self, samples):
        """Update the sample dicts in place and return ``samples``.

        Keys containing ``'meta'`` and ``None`` values are skipped.
        2-D arrays only gain a trailing channel axis; other arrays are
        divided by 255 and standardised with fixed per-channel constants.
        All processed arrays are transposed from HWC to CHW.
        """
        mean = (0.485, 0.456, 0.406)
        std = (0.229, 0.224, 0.225)
        for sample in samples:
            for key in sample.keys():
                if 'meta' in key:
                    continue
                arr = sample[key]
                if arr is None:
                    continue

                if arr.ndim != 2:
                    arr = arr / 255.
                    arr -= mean
                    arr /= std
                else:
                    arr = arr[:, :, np.newaxis]

                sample[key] = arr.transpose((2, 0, 1))

        return samples
|
@ -0,0 +1,40 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import copy
|
||||
|
||||
import os
|
||||
import numpy as np
|
||||
import random
|
||||
import paddle
|
||||
from ..registry import PIPELINES
|
||||
"""
|
||||
pipeline ops for Action Segmentation Dataset.
|
||||
"""
|
||||
|
||||
|
||||
@PIPELINES.register()
class SegmentationSampler(object):
    """Keep every ``sample_rate``-th element of each array in ``results``."""

    def __init__(self, sample_rate):
        self.sample_rate = sample_rate

    def __call__(self, results):
        """Subsample every value of ``results`` in place and return it.

        1-D arrays are strided along their only axis, all other arrays
        along their second axis; the strided data is deep-copied.
        """
        for key, data in results.items():
            is_vector = len(data.shape) == 1
            if is_vector:
                picked = data[::self.sample_rate]
            else:
                picked = data[:, ::self.sample_rate]
            results[key] = copy.deepcopy(picked)
        return results
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,18 @@
|
||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from ..utils import Registry
|
||||
|
||||
# Registry of data-pipeline ops; classes are added with the
# ``@PIPELINES.register()`` decorator.
PIPELINES = Registry("pipeline")
# Registry of dataset classes (presumably filled the same way -- confirm).
DATASETS = Registry("datasets")
|
@ -0,0 +1,3 @@
|
||||
from .anet_prop import ANETproposal
|
||||
|
||||
# Public API of this package.
__all__ = ['ANETproposal']
|
Binary file not shown.
Binary file not shown.
@ -0,0 +1,359 @@
|
||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
||||
import json
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import urllib.request as urllib2
|
||||
from paddlevideo.utils import get_logger
|
||||
|
||||
logger = get_logger("paddlevideo")
|
||||
|
||||
|
||||
class ANETproposal(object):
|
||||
"""
|
||||
This class is used for calculating AR@N and AUC;
|
||||
Code transfer from ActivityNet Gitub repository](https://github.com/activitynet/ActivityNet.git)
|
||||
"""
|
||||
GROUND_TRUTH_FIELDS = ['database', 'taxonomy', 'version']
|
||||
PROPOSAL_FIELDS = ['results', 'version', 'external_data']
|
||||
API = 'http://ec2-52-25-205-214.us-west-2.compute.amazonaws.com/challenge19/api.py'
|
||||
|
||||
def __init__(self,
|
||||
ground_truth_filename=None,
|
||||
proposal_filename=None,
|
||||
ground_truth_fields=GROUND_TRUTH_FIELDS,
|
||||
proposal_fields=PROPOSAL_FIELDS,
|
||||
tiou_thresholds=np.linspace(0.5, 0.95, 10),
|
||||
max_avg_nr_proposals=None,
|
||||
subset='validation',
|
||||
verbose=False,
|
||||
check_status=True):
|
||||
if not ground_truth_filename:
|
||||
raise IOError('Please input a valid ground truth file.')
|
||||
if not proposal_filename:
|
||||
raise IOError('Please input a valid proposal file.')
|
||||
self.subset = subset
|
||||
self.tiou_thresholds = tiou_thresholds
|
||||
self.max_avg_nr_proposals = max_avg_nr_proposals
|
||||
self.verbose = verbose
|
||||
self.gt_fields = ground_truth_fields
|
||||
self.pred_fields = proposal_fields
|
||||
self.recall = None
|
||||
self.avg_recall = None
|
||||
self.proposals_per_video = None
|
||||
self.check_status = check_status
|
||||
# Retrieve blocked videos from server.
|
||||
if self.check_status:
|
||||
self.blocked_videos = self.get_blocked_videos()
|
||||
else:
|
||||
self.blocked_videos = list()
|
||||
# Import ground truth and proposals.
|
||||
self.ground_truth, self.activity_index = self._import_ground_truth(
|
||||
ground_truth_filename)
|
||||
self.proposal = self._import_proposal(proposal_filename)
|
||||
|
||||
if self.verbose:
|
||||
print('[INIT] Loaded annotations from {} subset.'.format(subset))
|
||||
nr_gt = len(self.ground_truth)
|
||||
print('\tNumber of ground truth instances: {}'.format(nr_gt))
|
||||
nr_pred = len(self.proposal)
|
||||
print('\tNumber of proposals: {}'.format(nr_pred))
|
||||
print('\tFixed threshold for tiou score: {}'.format(
|
||||
self.tiou_thresholds))
|
||||
|
||||
def _import_ground_truth(self, ground_truth_filename):
    """
    Loads the ground truth json file, validates its top-level keys and
    flattens the annotations of the selected subset into a data frame.

    Videos whose subset differs from self.subset, or whose id is listed
    in self.blocked_videos, are skipped.

    Parameters:
    ground_truth_filename (str): full path to the ground truth json file.

    Returns:
    ground_truth (df): one row per annotation with the columns
        'video-id', 't-start', 't-end' and 'label' (integer class index).
    activity_index (dict): maps each label name to its class index,
        numbered in order of first appearance.

    Raises:
    IOError: if a key listed in self.gt_fields is missing from the file.
    """
    with open(ground_truth_filename, 'r') as handle:
        payload = json.load(handle)

    # Checking format: every required key has to be present.
    if any(name not in payload.keys() for name in self.gt_fields):
        raise IOError('Please input a valid ground truth file.')

    # Read ground truth data, one list per output column.
    activity_index = {}
    columns = {'video-id': [], 't-start': [], 't-end': [], 'label': []}
    for video_name, info in payload['database'].items():
        if self.subset != info['subset'] or video_name in self.blocked_videos:
            continue
        for annotation in info['annotations']:
            # A label seen for the first time gets the next free index.
            class_id = activity_index.setdefault(annotation['label'],
                                                 len(activity_index))
            begin, finish = annotation['segment'][0], annotation['segment'][1]
            columns['video-id'].append(video_name)
            columns['t-start'].append(float(begin))
            columns['t-end'].append(float(finish))
            columns['label'].append(class_id)

    return pd.DataFrame(columns), activity_index
|
||||
|
||||
def _import_proposal(self, proposal_filename):
    """
    Loads the proposal json file, validates its top-level keys and
    flattens all proposals into a data frame.

    Videos whose id is listed in self.blocked_videos are skipped.

    Parameters:
    proposal_filename (str): Full path to the proposal json file.

    Returns:
    proposal (df): one row per proposal with the columns
        'video-id', 't-start', 't-end' and 'score'.

    Raises:
    IOError: if a key listed in self.pred_fields is missing from the file.
    """
    with open(proposal_filename, 'r') as handle:
        payload = json.load(handle)

    # Checking format: every required key has to be present.
    if any(name not in payload.keys() for name in self.pred_fields):
        raise IOError('Please input a valid proposal file.')

    # Read predictions, one list per output column.
    columns = {'video-id': [], 't-start': [], 't-end': [], 'score': []}
    for video_name, entries in payload['results'].items():
        if video_name in self.blocked_videos:
            continue
        for entry in entries:
            begin, finish = entry['segment'][0], entry['segment'][1]
            columns['video-id'].append(video_name)
            columns['t-start'].append(float(begin))
            columns['t-end'].append(float(finish))
            columns['score'].append(entry['score'])

    return pd.DataFrame(columns)
|
||||
|
||||
def evaluate(self):
    """
    Evaluates a proposal file. To measure the performance of a
    method for the proposal task, we compute the area under the
    average recall vs average number of proposals per video curve.

    The recall matrix, the average recall and the average number of
    proposals per video are stored on self.recall, self.avg_recall and
    self.proposals_per_video. When self.verbose is set, the normalised
    area under the curve (in percent) is printed and appended to
    data/bmn/BMN_Test_results/auc_result.txt.

    Returns:
    None
    """
    # Local import: the file-level import block is outside this method.
    import os

    recall, avg_recall, proposals_per_video = (
        self.average_recall_vs_avg_nr_proposals(
            self.ground_truth,
            self.proposal,
            max_avg_nr_proposals=self.max_avg_nr_proposals,
            tiou_thresholds=self.tiou_thresholds))

    # np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid;
    # fall back to the old name on NumPy versions that lack the new one.
    integrate = getattr(np, 'trapezoid', None) or np.trapz
    area_under_curve = integrate(avg_recall, proposals_per_video)

    if self.verbose:
        # Normalise by the largest average number of proposals so that
        # the score is a percentage of the maximum achievable area.
        auc_percent = (100. * float(area_under_curve) /
                       proposals_per_video[-1])
        print('[RESULTS] Performance on ActivityNet proposal task.')
        result_path = "data/bmn/BMN_Test_results/auc_result.txt"
        # Create the output directory on demand so that a missing folder
        # does not abort the run after the evaluation has been computed.
        os.makedirs(os.path.dirname(result_path), exist_ok=True)
        with open(result_path, "a") as text_file:
            text_file.write(
                '\tArea Under the AR vs AN curve: {}% \n'.format(
                    auc_percent))
        print('\tArea Under the AR vs AN curve: {}%'.format(auc_percent))

    self.recall = recall
    self.avg_recall = avg_recall
    self.proposals_per_video = proposals_per_video
|
||||
|
||||
def average_recall_vs_avg_nr_proposals(self,
                                       ground_truth,
                                       proposals,
                                       max_avg_nr_proposals=None,
                                       tiou_thresholds=np.linspace(
                                           0.5, 0.95, 10)):
    """
    Computes the average recall given an average number of
    proposals per video.

    Parameters:
    ground_truth(df): Data frame containing the ground truth instances.
        Required fields: ['video-id', 't-start', 't-end']
    proposals(df): Data frame containing the proposal instances.
        Required fields: ['video-id', 't-start', 't-end', 'score']
    max_avg_nr_proposals(float | optional): maximum average number of
        proposals per video to consider. When falsy, it defaults to the
        total number of proposals divided by the number of videos.
    tiou_thresholds(1d-array | optional): array with tiou thresholds.

    Returns:
    recall(2d-array): recall[i,j] is recall at ith tiou threshold at the jth
        average number of proposals per video.
    average_recall(1d-array): recall averaged over a list of tiou threshold.
        This is equivalent to recall.mean(axis=0).
    proposals_per_video(1d-array): average number of proposals per video.
    """

    # Get list of videos.
    video_lst = ground_truth['video-id'].unique()

    if not max_avg_nr_proposals:
        max_avg_nr_proposals = float(
            proposals.shape[0]) / video_lst.shape[0]

    # Fraction of each video's proposals to keep so that, on average,
    # max_avg_nr_proposals proposals per video are retained.
    ratio = max_avg_nr_proposals * float(
        video_lst.shape[0]) / proposals.shape[0]

    # Adaptation to query faster
    ground_truth_gbvn = ground_truth.groupby('video-id')
    proposals_gbvn = proposals.groupby('video-id')

    # For each video, computes tiou scores among the retrieved proposals.
    score_lst = []
    total_nr_proposals = 0
    for videoid in video_lst:
        # Get ground-truth instances associated to this video.
        ground_truth_videoid = ground_truth_gbvn.get_group(videoid)
        this_video_ground_truth = ground_truth_videoid.loc[:, [
            't-start', 't-end'
        ]].values

        # Get proposals for this video. A video without any proposal is
        # reported by get_group as a KeyError; only that case is treated
        # as "no proposals" so that unrelated failures are not hidden.
        try:
            proposals_videoid = proposals_gbvn.get_group(videoid)
        except KeyError:
            n = this_video_ground_truth.shape[0]
            score_lst.append(np.zeros((n, 1)))
            continue

        this_video_proposals = proposals_videoid.loc[:,
                                                     ['t-start', 't-end'
                                                      ]].values

        if this_video_proposals.shape[0] == 0:
            n = this_video_ground_truth.shape[0]
            score_lst.append(np.zeros((n, 1)))
            continue

        # Sort proposals by score (highest first).
        sort_idx = proposals_videoid['score'].argsort()[::-1]
        this_video_proposals = this_video_proposals[sort_idx, :]

        # Defensive: make sure both arrays are 2-dim before the tiou call.
        if this_video_proposals.ndim != 2:
            this_video_proposals = np.expand_dims(this_video_proposals,
                                                  axis=0)
        if this_video_ground_truth.ndim != 2:
            this_video_ground_truth = np.expand_dims(
                this_video_ground_truth, axis=0)

        # Keep only the top-scoring share of the proposals of this video.
        nr_proposals = np.minimum(
            int(this_video_proposals.shape[0] * ratio),
            this_video_proposals.shape[0])
        total_nr_proposals += nr_proposals
        this_video_proposals = this_video_proposals[:nr_proposals, :]

        # Compute tiou scores; rows are ground truth instances and
        # columns are the retained proposals in descending score order.
        tiou = self.wrapper_segment_iou(this_video_proposals,
                                        this_video_ground_truth)
        score_lst.append(tiou)

    # Given that the length of the videos is really varied, we
    # compute the number of proposals in terms of a ratio of the total
    # proposals retrieved, i.e. average recall at a percentage of proposals
    # retrieved per video.

    # Computes average recall.
    pcn_lst = np.arange(1, 101) / 100.0 * (max_avg_nr_proposals * float(
        video_lst.shape[0]) / total_nr_proposals)
    matches = np.empty((video_lst.shape[0], pcn_lst.shape[0]))
    positives = np.empty(video_lst.shape[0])
    recall = np.empty((tiou_thresholds.shape[0], pcn_lst.shape[0]))
    # Iterates over each tiou threshold.
    for ridx, tiou in enumerate(tiou_thresholds):

        # Inspect positives retrieved per video at different
        # number of proposals (percentage of the total retrieved).
        for i, score in enumerate(score_lst):
            # Total positives per video.
            positives[i] = score.shape[0]
            # Find proposals that satisfies minimum tiou threshold.
            true_positives_tiou = score >= tiou
            # Get number of proposals as a percentage of total retrieved.
            pcn_proposals = np.minimum(
                (score.shape[1] * pcn_lst).astype(int), score.shape[1])

            for j, nr_proposals in enumerate(pcn_proposals):
                # Compute the number of matches for each percentage of the
                # proposals: a ground truth instance counts once if any of
                # the first nr_proposals proposals reaches the threshold.
                matches[i, j] = np.count_nonzero(
                    (true_positives_tiou[:, :nr_proposals]).sum(axis=1))

        # Computes recall given the set of matches per video.
        recall[ridx, :] = matches.sum(axis=0) / positives.sum()

    # Recall is averaged.
    avg_recall = recall.mean(axis=0)

    # Get the average number of proposals per video.
    proposals_per_video = pcn_lst * (float(total_nr_proposals) /
                                     video_lst.shape[0])

    return recall, avg_recall, proposals_per_video
|
||||
|
||||
def get_blocked_videos(self, api=API):
    """
    Queries the evaluation server for the list of blocked videos.

    Parameters:
    api (str): base url of the server api; '?action=get_blocked' is
        appended to it.

    Returns:
    The decoded json payload of the response.
    """
    api_url = '{}?action=get_blocked'.format(api)
    req = urllib2.Request(api_url)
    response = urllib2.urlopen(req)
    try:
        return json.loads(response.read())
    finally:
        # Always release the connection, even if reading or decoding fails.
        response.close()
|
||||
|
||||
def wrapper_segment_iou(self, target_segments, candidate_segments):
    """
    Computes the pairwise temporal intersection over union between two
    sets of segments, one target segment (one output column) at a time.

    Parameters:
    target_segments(nd-array): 2-dim array in format [m x 2:=[init, end]]
    candidate_segments(nd-array): 2-dim array in format [n x 2:=[init, end]]

    Returns:
    tiou(nd-array): 2-dim array [n x m] with IOU ratio.

    Raises:
    ValueError: if either argument is not 2-dimensional.

    Note: It assumes that candidate-segments are more scarce than
    target-segments
    """
    both_2d = candidate_segments.ndim == 2 and target_segments.ndim == 2
    if not both_2d:
        raise ValueError('Dimension of arguments is incorrect')

    nr_candidates = candidate_segments.shape[0]
    nr_targets = target_segments.shape[0]
    overlaps = np.empty((nr_candidates, nr_targets))
    # Column k holds the tiou of every candidate against target k.
    for column in range(nr_targets):
        single_target = target_segments[column, :]
        overlaps[:, column] = self.segment_iou(single_target,
                                               candidate_segments)

    return overlaps
|
||||
|
||||
def segment_iou(self, target_segment, candidate_segments):
    """
    Computes the temporal intersection over union (tIoU) of one target
    segment against every candidate segment.

    Parameters:
    target_segment(1d-array): Temporal target segment containing [starting, ending] times.
    candidate_segments(2d-array): Temporal candidate segments containing N x [starting, ending] times.

    Returns:
    tiou(1d-array): Temporal intersection over union score of the N's candidate segments.
    """
    target_start, target_end = target_segment[0], target_segment[1]
    cand_start = candidate_segments[:, 0]
    cand_end = candidate_segments[:, 1]

    # The overlap runs from the later start to the earlier end; a negative
    # length means the segments are disjoint, so it is clipped to zero.
    overlap_start = np.maximum(target_start, cand_start)
    overlap_end = np.minimum(target_end, cand_end)
    overlap = (overlap_end - overlap_start).clip(0)

    # Union = sum of both lengths minus the part counted twice.
    cand_length = cand_end - cand_start
    target_length = target_end - target_start
    union = cand_length + target_length - overlap

    # Overlap is the ratio of the intersection over the union.
    return overlap.astype(float) / union
|
@ -0,0 +1,36 @@
|
||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from .bmn_metric import BMNMetric
|
||||
from .build import build_metric
|
||||
from .center_crop_metric import CenterCropMetric
|
||||
from .depth_metric import DepthMetric
|
||||
from .msrvtt_metric import MSRVTTMetric
|
||||
from .multi_crop_metric import MultiCropMetric
|
||||
from .registry import METRIC
|
||||
from .skeleton_metric import SkeletonMetric
|
||||
from .transnetv2_metric import TransNetV2Metric
|
||||
from .youtube8m.eval_util import HitOneMetric
|
||||
from .segmentation_metric import SegmentationMetric
|
||||
from .ava_metric import AVAMetric
|
||||
from .vos_metric import VOSMetric
|
||||
from .center_crop_metric_MRI import CenterCropMetric_MRI
|
||||
from .yowo_metric import YOWOMetric
|
||||
|
||||
# Public names re-exported by this package, one per line.
__all__ = [
    'METRIC',
    'build_metric',
    'MultiCropMetric',
    'BMNMetric',
    'CenterCropMetric',
    'SkeletonMetric',
    'HitOneMetric',
    'TransNetV2Metric',
    'DepthMetric',
    'MSRVTTMetric',
    'VOSMetric',
    'CenterCropMetric_MRI',
    'AVAMetric',
    'SegmentationMetric',
    'YOWOMetric',
]
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue