# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import shutil
import sys
from typing import List
import pickle
import cv2
try:
import imageio
except ImportError as e:
    print(
        f"Warning! {e}, the [imageio] package and its dependencies are required for VideoSwin."
    )
try:
import matplotlib as mpl
import matplotlib.cm as cm
except ImportError as e:
    print(
        f"Warning! {e}, the [matplotlib] package and its dependencies are required for ADDS."
    )
import numpy as np
import paddle
import paddle.nn.functional as F
import pandas
from PIL import Image
__dir__ = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.abspath(os.path.join(__dir__, '../')))
from abc import abstractmethod
from paddlevideo.loader.builder import build_pipeline
from paddlevideo.loader.pipelines import (
AutoPadding, CenterCrop, DecodeSampler, FeatureDecoder, FrameDecoder,
GroupResize, Image2Array, ImageDecoder, JitterScale, MultiCrop,
Normalization, PackOutput, Sampler, SamplerPkl, Scale, SkeletonNorm,
TenCrop, ToArray, UniformCrop, VideoDecoder, SegmentationSampler,
    SketeonCropSample, MultiCenterCrop, UniformSampleFrames,
PoseDecode, PoseCompact, Resize, CenterCrop_V2, GeneratePoseTarget,
FormatShape, Collect)
from paddlevideo.metrics.ava_utils import read_labelmap
from paddlevideo.metrics.bmn_metric import boundary_choose, soft_nms
from paddlevideo.utils import Registry, build, get_config
from paddlevideo.modeling.framework.segmenters.utils import ASRFPostProcessing
from tools.ava_predict import (detection_inference, frame_extraction,
get_detection_result, get_timestep_result,
pack_result, visualize)
from paddlevideo.modeling.framework.localizers.yowo_utils import nms, get_region_boxes
INFERENCE = Registry('inference')
def build_inference_helper(cfg):
return build(cfg, INFERENCE)
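# Hedged usage sketch: helpers register themselves in the INFERENCE registry via
# @INFERENCE.register(), so a config only needs a 'name' key naming the helper
# plus its constructor kwargs. The dict below is hypothetical, and build() is
# assumed to pop the name and instantiate the registered class with the rest:
#
#   cfg = {"name": "ppTSM_Inference_helper", "num_seg": 8, "top_k": 1}
#   helper = build_inference_helper(cfg)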
class Base_Inference_helper():
def __init__(self,
num_seg=8,
seg_len=1,
short_size=256,
target_size=224,
top_k=1):
"""Base_Inference_helper
Args:
            num_seg (int, optional): number of segments of a sliced input video. Defaults to 8.
            seg_len (int, optional): length of each segment. Defaults to 1.
short_size (int, optional): short size of input video. Defaults to 256.
target_size (int, optional): size of cropped video. Defaults to 224.
top_k (int, optional): select topk result in outputs. Defaults to 1.
"""
self.num_seg = num_seg
self.seg_len = seg_len
self.short_size = short_size
self.target_size = target_size
self.top_k = top_k
@abstractmethod
def preprocess(self, input_file: str):
"""preprocess abstractmethod
Args:
input_file (str): input file path.
"""
pass
def preprocess_batch(self, file_list: List[str]) -> List[np.ndarray]:
"""preprocess for file list
Args:
            file_list (List[str]): file paths in a list, [path1, path2, ...].
Returns:
List[np.ndarray]: batched inputs data, [data_batch[0], data_batch[1], ...].
"""
batched_inputs = []
for file in file_list:
inputs = self.preprocess(file)
batched_inputs.append(inputs)
batched_inputs = [
np.concatenate([item[i] for item in batched_inputs])
for i in range(len(batched_inputs[0]))
]
self.input_file = file_list
return batched_inputs
def postprocess(self,
output: np.ndarray,
print_output: bool = True,
return_result: bool = False):
"""postprocess
Args:
output (np.ndarray): batched output scores, shape of (batch_size, class_num).
print_output (bool, optional): whether to print result. Defaults to True.
"""
if not isinstance(self.input_file, list):
self.input_file = [
self.input_file,
]
output = output[0] # [B, num_cls]
N = len(self.input_file)
if output.shape[0] != N:
output = output.reshape([N] + [output.shape[0] // N] +
list(output.shape[1:])) # [N, T, C]
output = output.mean(axis=1) # [N, C]
output = F.softmax(paddle.to_tensor(output), axis=-1).numpy()
results_list = []
for i in range(N):
classes = np.argpartition(output[i], -self.top_k)[-self.top_k:]
classes = classes[np.argsort(-output[i, classes])]
scores = output[i, classes]
topk_class = classes[:self.top_k]
topk_scores = scores[:self.top_k]
result = {
"video_id": self.input_file[i],
"topk_class": topk_class,
"topk_scores": topk_scores
}
results_list.append(result)
if print_output:
print("Current video file: {0}".format(self.input_file[i]))
print("\ttop-{0} class: {1}".format(self.top_k, topk_class))
print("\ttop-{0} score: {1}".format(self.top_k, topk_scores))
if return_result:
return results_list
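def _topk_selection_example():
    # Minimal, self-contained sketch of the top-k selection used in
    # postprocess() above: np.argpartition finds the k largest entries in O(N),
    # then np.argsort orders only those k by descending score. Values here are
    # hypothetical.
    scores = np.array([0.1, 0.7, 0.05, 0.15])
    k = 2
    classes = np.argpartition(scores, -k)[-k:]
    classes = classes[np.argsort(-scores[classes])]
    return classes, scores[classes]  # -> array([1, 3]), array([0.7, 0.15])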
@INFERENCE.register()
class ppTSM_Inference_helper(Base_Inference_helper):
def __init__(self,
num_seg=8,
seg_len=1,
short_size=256,
target_size=224,
top_k=1):
self.num_seg = num_seg
self.seg_len = seg_len
self.short_size = short_size
self.target_size = target_size
self.top_k = top_k
def preprocess(self, input_file):
"""
input_file: str, file path
return: list
"""
        assert os.path.isfile(input_file), "{0} does not exist".format(
            input_file)
results = {'filename': input_file}
img_mean = [0.485, 0.456, 0.406]
img_std = [0.229, 0.224, 0.225]
ops = [
VideoDecoder(backend="decord"),
Sampler(self.num_seg, self.seg_len, valid_mode=True),
Scale(self.short_size),
CenterCrop(self.target_size),
Image2Array(),
Normalization(img_mean, img_std)
]
for op in ops:
results = op(results)
res = np.expand_dims(results['imgs'], axis=0).copy()
return [res]
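# Hedged usage sketch (path is hypothetical): preprocess() returns a list with a
# single float32 array, roughly of shape [1, num_seg, 3, target_size,
# target_size], ready to feed a ppTSM predictor.
#
#   helper = ppTSM_Inference_helper(num_seg=8, target_size=224)
#   inputs = helper.preprocess("example.mp4")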
@INFERENCE.register()
class ppTSN_Inference_helper(Base_Inference_helper):
def __init__(self,
num_seg=25,
seg_len=1,
short_size=256,
target_size=224,
top_k=1):
self.num_seg = num_seg
self.seg_len = seg_len
self.short_size = short_size
self.target_size = target_size
self.top_k = top_k
def preprocess(self, input_file):
"""
input_file: str, file path
return: list
"""
        assert os.path.isfile(input_file), "{0} does not exist".format(
            input_file)
results = {'filename': input_file}
img_mean = [0.485, 0.456, 0.406]
img_std = [0.229, 0.224, 0.225]
ops = [
VideoDecoder(backend="decord"),
Sampler(self.num_seg,
self.seg_len,
valid_mode=True,
select_left=True),
Scale(self.short_size,
fixed_ratio=True,
do_round=True,
backend='cv2'),
TenCrop(self.target_size),
Image2Array(),
Normalization(img_mean, img_std)
]
for op in ops:
results = op(results)
res = np.expand_dims(results['imgs'], axis=0).copy()
return [res]
@INFERENCE.register()
class BMN_Inference_helper(Base_Inference_helper):
def __init__(self, feat_dim, dscale, tscale, result_path):
self.feat_dim = feat_dim
self.dscale = dscale
self.tscale = tscale
self.result_path = result_path
if not os.path.isdir(self.result_path):
os.makedirs(self.result_path)
def preprocess(self, input_file):
"""
input_file: str, file path
return: list
"""
        assert os.path.isfile(input_file), "{0} does not exist".format(
            input_file)
        with open(input_file) as f:
            file_info = json.load(f)
self.feat_path = file_info['feat_path']
self.video_duration = file_info['duration_second']
feat = np.load(self.feat_path).astype('float32').T
res = np.expand_dims(feat, axis=0).copy()
return [res]
def postprocess(self, outputs, print_output=True):
"""
output: list
"""
pred_bm, pred_start, pred_end = outputs
self._gen_props(pred_bm, pred_start[0], pred_end[0], print_output)
def _gen_props(self, pred_bm, pred_start, pred_end, print_output):
snippet_xmins = [1.0 / self.tscale * i for i in range(self.tscale)]
snippet_xmaxs = [
1.0 / self.tscale * i for i in range(1, self.tscale + 1)
]
pred_bm = pred_bm[0, 0, :, :] * pred_bm[0, 1, :, :]
start_mask = boundary_choose(pred_start)
start_mask[0] = 1.
end_mask = boundary_choose(pred_end)
end_mask[-1] = 1.
score_vector_list = []
for idx in range(self.dscale):
for jdx in range(self.tscale):
start_index = jdx
end_index = start_index + idx
if end_index < self.tscale and start_mask[
start_index] == 1 and end_mask[end_index] == 1:
xmin = snippet_xmins[start_index]
xmax = snippet_xmaxs[end_index]
xmin_score = pred_start[start_index]
xmax_score = pred_end[end_index]
bm_score = pred_bm[idx, jdx]
conf_score = xmin_score * xmax_score * bm_score
score_vector_list.append([xmin, xmax, conf_score])
cols = ["xmin", "xmax", "score"]
score_vector_list = np.stack(score_vector_list)
df = pandas.DataFrame(score_vector_list, columns=cols)
result_dict = {}
proposal_list = []
df = soft_nms(df, alpha=0.4, t1=0.55, t2=0.9)
for idx in range(min(100, len(df))):
            tmp_prop = {
                "score": df.score.values[idx],
                "segment": [
                    max(0, df.xmin.values[idx]) * self.video_duration,
                    min(1, df.xmax.values[idx]) * self.video_duration
                ]
            }
proposal_list.append(tmp_prop)
result_dict[self.feat_path] = proposal_list
# print top-5 predictions
if print_output:
print("Current video file: {0} :".format(self.feat_path))
for pred in proposal_list[:5]:
print(pred)
# save result
        with open(os.path.join(self.result_path, "bmn_results_inference.json"),
                  "w") as outfile:
            json.dump(result_dict, outfile)
@INFERENCE.register()
class TokenShift_Inference_helper(Base_Inference_helper):
def __init__(self,
num_seg=8,
seg_len=1,
short_size=256,
target_size=256,
top_k=1,
mean=[0.5, 0.5, 0.5],
std=[0.5, 0.5, 0.5]):
self.num_seg = num_seg
self.seg_len = seg_len
self.short_size = short_size
self.target_size = target_size
self.top_k = top_k
self.mean = mean
self.std = std
def preprocess(self, input_file):
"""
input_file: str, file path
return: list
"""
        assert os.path.isfile(input_file), "{0} does not exist".format(
            input_file)
results = {'filename': input_file}
ops = [
VideoDecoder(backend='pyav', mode='test', num_seg=self.num_seg),
Sampler(self.num_seg, self.seg_len, valid_mode=True),
Normalization(self.mean, self.std, tensor_shape=[1, 1, 1, 3]),
Image2Array(data_format='cthw'),
JitterScale(self.short_size, self.short_size),
MultiCenterCrop(self.target_size)
]
for op in ops:
results = op(results)
# [N,C,Tx3,H,W]
res = np.expand_dims(results['imgs'], axis=0).copy()
return [res]
@INFERENCE.register()
class TimeSformer_Inference_helper(Base_Inference_helper):
def __init__(self,
num_seg=8,
seg_len=1,
short_size=224,
target_size=224,
top_k=1,
mean=[0.45, 0.45, 0.45],
std=[0.225, 0.225, 0.225]):
self.num_seg = num_seg
self.seg_len = seg_len
self.short_size = short_size
self.target_size = target_size
self.top_k = top_k
self.mean = mean
self.std = std
def preprocess(self, input_file):
"""
input_file: str, file path
return: list
"""
        assert os.path.isfile(input_file), "{0} does not exist".format(
            input_file)
results = {'filename': input_file}
ops = [
VideoDecoder(backend='pyav', mode='test', num_seg=self.num_seg),
Sampler(self.num_seg,
self.seg_len,
valid_mode=True,
linspace_sample=True),
Normalization(self.mean, self.std, tensor_shape=[1, 1, 1, 3]),
Image2Array(data_format='cthw'),
JitterScale(self.short_size, self.short_size),
UniformCrop(self.target_size)
]
for op in ops:
results = op(results)
# [N,C,Tx3,H,W]
res = np.expand_dims(results['imgs'], axis=0).copy()
return [res]
@INFERENCE.register()
class VideoSwin_Inference_helper(Base_Inference_helper):
def __init__(self,
num_seg=4,
seg_len=32,
frame_interval=2,
short_size=224,
target_size=224,
top_k=1,
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375]):
self.num_seg = num_seg
self.seg_len = seg_len
self.frame_interval = frame_interval
self.short_size = short_size
self.target_size = target_size
self.top_k = top_k
self.mean = mean
self.std = std
def preprocess(self, input_file):
"""
input_file: str, file path
return: list
"""
self.input_file = input_file
        assert os.path.isfile(input_file), "{0} does not exist".format(
            input_file)
results = {'filename': input_file}
ops = [
VideoDecoder(backend='decord', mode='valid'),
Sampler(num_seg=self.num_seg,
frame_interval=self.frame_interval,
seg_len=self.seg_len,
valid_mode=True,
use_pil=False),
Scale(short_size=self.short_size,
fixed_ratio=False,
keep_ratio=True,
backend='cv2',
do_round=True),
            CenterCrop(target_size=self.target_size, backend='cv2'),
Normalization(mean=self.mean,
std=self.std,
tensor_shape=[3, 1, 1, 1],
inplace=True),
Image2Array(data_format='cthw')
]
for op in ops:
results = op(results)
res = np.expand_dims(results['imgs'], axis=0).copy()
return [res]
def postprocess(self, output, print_output=True):
"""
output: list
"""
if not isinstance(self.input_file, list):
self.input_file = [
self.input_file,
]
output = output[0] # [B, num_cls]
N = len(self.input_file)
if output.shape[0] != N:
output = output.reshape([N] + [output.shape[0] // N] +
list(output.shape[1:])) # [N, T, C]
output = output.mean(axis=1) # [N, C]
for i in range(N):
classes = np.argpartition(output[i], -self.top_k)[-self.top_k:]
classes = classes[np.argsort(-output[i, classes])]
scores = output[i, classes]
if print_output:
print("Current video file: {0}".format(self.input_file[i]))
for j in range(self.top_k):
print("\ttop-{0} class: {1}".format(j + 1, classes[j]))
print("\ttop-{0} score: {1}".format(j + 1, scores[j]))
@INFERENCE.register()
class VideoSwin_TableTennis_Inference_helper(Base_Inference_helper):
def __init__(self,
num_seg=1,
seg_len=32,
short_size=256,
target_size=224,
top_k=1):
self.num_seg = num_seg
self.seg_len = seg_len
self.short_size = short_size
self.target_size = target_size
self.top_k = top_k
def preprocess(self, input_file):
"""
input_file: str, file path
return: list
"""
        assert os.path.isfile(input_file), "{0} does not exist".format(
            input_file)
results = {'frame_dir': input_file, 'suffix': 'img_{:05}.jpg'}
img_mean = [123.675, 116.28, 103.53]
img_std = [58.395, 57.12, 57.375]
ops = [
FrameDecoder(),
SamplerPkl(num_seg=self.num_seg,
seg_len=self.seg_len,
backend='cv2',
valid_mode=True),
Scale(short_size=self.short_size,
fixed_ratio=False,
keep_ratio=True,
backend='cv2',
do_round=True),
UniformCrop(target_size=self.target_size, backend='cv2'),
Normalization(mean=img_mean,
std=img_std,
tensor_shape=[3, 1, 1, 1],
inplace=True),
Image2Array(data_format='cthw')
]
for op in ops:
results = op(results)
res = np.expand_dims(results['imgs'], axis=0).copy()
return [res]
def add_text_to_video(
self,
video_path,
output_dir="applications/TableTennis/ActionRecognition/results",
text=None):
os.makedirs(output_dir, exist_ok=True)
if video_path.endswith('.pkl'):
            # pickle and PIL.Image are imported at module level; only BytesIO
            # is needed here.
            from io import BytesIO
            with open(video_path, 'rb') as f:
                data_loaded = pickle.load(f, encoding='bytes')
_, _, frames = data_loaded
frames_len = len(frames)
else:
videoCapture = cv2.VideoCapture()
videoCapture.open(video_path)
fps = videoCapture.get(cv2.CAP_PROP_FPS)
frame_width = int(videoCapture.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(videoCapture.get(cv2.CAP_PROP_FRAME_HEIGHT))
frames_len = videoCapture.get(cv2.CAP_PROP_FRAME_COUNT)
print("fps=", int(fps), "frames=", int(frames_len), "scale=",
f"{frame_height}x{frame_width}")
frames_rgb_list = []
for i in range(int(frames_len)):
if video_path.endswith('.pkl'):
frame = np.array(
Image.open(BytesIO(frames[i])).convert("RGB").resize(
(240, 135)))[:, :, ::-1].astype('uint8')
else:
_, frame = videoCapture.read()
frame = cv2.putText(frame, text, (30, 30), cv2.FONT_HERSHEY_COMPLEX,
1.0, (0, 0, 255), 2)
frames_rgb_list.append(frame[:, :, ::-1]) # bgr to rgb
if not video_path.endswith('.pkl'):
videoCapture.release()
cv2.destroyAllWindows()
output_filename = os.path.basename(video_path)
output_filename = output_filename.split('.')[0] + '.gif'
imageio.mimsave(f'{output_dir}/{output_filename}',
frames_rgb_list,
'GIF',
duration=0.00085)
def postprocess(self, output, print_output=True, save_gif=True):
"""
output: list
"""
if not isinstance(self.input_file, list):
self.input_file = [
self.input_file,
]
output = output[0] # [B, num_cls]
N = len(self.input_file)
if output.shape[0] != N:
output = output.reshape([N] + [output.shape[0] // N] +
list(output.shape[1:])) # [N, T, C]
output = output.mean(axis=1) # [N, C]
for i in range(N):
classes = np.argpartition(output[i], -self.top_k)[-self.top_k:]
classes = classes[np.argsort(-output[i, classes])]
scores = output[i, classes]
if print_output:
print("Current video file: {0}".format(self.input_file[i]))
for j in range(self.top_k):
print("\ttop-{0} class: {1}".format(j + 1, classes[j]))
print("\ttop-{0} score: {1}".format(j + 1, scores[j]))
if save_gif:
self.add_text_to_video(
self.input_file[0],
text=f"{str(classes[0])} {float(scores[0]):.5f}")
@INFERENCE.register()
class SlowFast_Inference_helper(Base_Inference_helper):
def __init__(self,
num_frames=32,
sampling_rate=2,
target_size=256,
alpha=8,
top_k=1):
self.num_frames = num_frames
self.sampling_rate = sampling_rate
self.target_size = target_size
self.alpha = alpha
self.top_k = top_k
def preprocess(self, input_file):
"""
input_file: str, file path
return: list
"""
        assert os.path.isfile(input_file), "{0} does not exist".format(
            input_file)
results = {
'filename': input_file,
'temporal_sample_index': 0,
'spatial_sample_index': 0,
'temporal_num_clips': 1,
'spatial_num_clips': 1
}
img_mean = [0.45, 0.45, 0.45]
img_std = [0.225, 0.225, 0.225]
ops = [
DecodeSampler(self.num_frames, self.sampling_rate, test_mode=True),
JitterScale(self.target_size, self.target_size),
MultiCrop(self.target_size),
Image2Array(transpose=False),
Normalization(img_mean, img_std, tensor_shape=[1, 1, 1, 3]),
PackOutput(self.alpha),
]
for op in ops:
results = op(results)
res = []
for item in results['imgs']:
res.append(np.expand_dims(item, axis=0).copy())
return res
def postprocess(self, output, print_output=True):
"""
output: list
"""
if not isinstance(self.input_file, list):
self.input_file = [
self.input_file,
]
output = output[0] # [B, num_cls]
N = len(self.input_file)
if output.shape[0] != N:
output = output.reshape([N] + [output.shape[0] // N] +
list(output.shape[1:])) # [N, T, C]
output = output.mean(axis=1) # [N, C]
# output = F.softmax(paddle.to_tensor(output), axis=-1).numpy() # done in it's head
for i in range(N):
classes = np.argpartition(output[i], -self.top_k)[-self.top_k:]
classes = classes[np.argsort(-output[i, classes])]
scores = output[i, classes]
if print_output:
print("Current video file: {0}".format(self.input_file[i]))
for j in range(self.top_k):
print("\ttop-{0} class: {1}".format(j + 1, classes[j]))
print("\ttop-{0} score: {1}".format(j + 1, scores[j]))
@INFERENCE.register()
class STGCN_Inference_helper(Base_Inference_helper):
def __init__(self,
num_channels,
window_size,
vertex_nums,
person_nums,
top_k=1):
self.num_channels = num_channels
self.window_size = window_size
self.vertex_nums = vertex_nums
self.person_nums = person_nums
self.top_k = top_k
def preprocess(self, input_file):
"""
input_file: str, file path
return: list
"""
        assert os.path.isfile(input_file), "{0} does not exist".format(
            input_file)
data = np.load(input_file)
results = {'data': data}
ops = [AutoPadding(window_size=self.window_size), SkeletonNorm()]
for op in ops:
results = op(results)
res = np.expand_dims(results['data'], axis=0).copy()
return [res]
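# Note (assumption): the .npy skeleton input is expected to be laid out as
# [num_channels, window_size, vertex_nums, person_nums], e.g. [3, T, 25, 2] for
# NTU-style 25-joint, two-person data; AutoPadding pads or clips the time axis
# to window_size and SkeletonNorm centres the joint coordinates.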
@INFERENCE.register()
class CTRGCN_Inference_helper(Base_Inference_helper):
def __init__(self,
num_channels=3,
vertex_nums=25,
person_nums=2,
window_size=64,
p_interval=[0.95],
top_k=1):
self.window_size = window_size
self.p_interval = p_interval
self.num_channels = num_channels
self.vertex_nums = vertex_nums
self.person_nums = person_nums
self.top_k = top_k
def preprocess(self, input_file):
"""
input_file: str, file path
return: list
"""
        assert os.path.isfile(input_file), "{0} does not exist".format(
            input_file)
data = np.load(input_file)
results = {'data': data}
ops = [
SketeonCropSample(window_size=self.window_size,
p_interval=self.p_interval)
]
for op in ops:
results = op(results)
res = np.expand_dims(results['data'], axis=0).copy()
return [res]
@INFERENCE.register()
class AGCN2s_Inference_helper(Base_Inference_helper):
def __init__(self,
window_size=300,
num_channels=3,
vertex_nums=25,
person_nums=2,
top_k=1):
self.window_size = window_size
self.num_channels = num_channels
self.vertex_nums = vertex_nums
self.person_nums = person_nums
self.top_k = top_k
def preprocess(self, input_file):
"""
input_file: str, file path
return: list
"""
        assert os.path.isfile(input_file), "{0} does not exist".format(
            input_file)
data = np.load(input_file)
results = {'data': data}
res = np.expand_dims(results['data'], axis=0).copy()
return [res]
@INFERENCE.register()
class MSTCN_Inference_helper(Base_Inference_helper):
def __init__(self, num_channels, actions_map_file_path, feature_path=None):
self.num_channels = num_channels
        with open(actions_map_file_path, 'r') as file_ptr:
            actions = file_ptr.read().split('\n')[:-1]
self.actions_dict = dict()
for a in actions:
self.actions_dict[a.split()[1]] = int(a.split()[0])
self.feature_path = feature_path
self.file_name_list = []
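    # The actions map file is expected to hold one "<id> <action_name>" pair
    # per line, e.g. (hypothetical):
    #   0 take
    #   1 pour
    # so actions_dict maps action names to integer ids; postprocess() inverts
    # it to recover names from predicted ids.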
def get_process_file(self, input_file_txt):
with open(input_file_txt, 'r') as file_ptr:
info = file_ptr.read().split('\n')[:-1]
files = []
for video_name in info:
if self.feature_path is not None:
file_name = video_name.split('.')[0] + ".npy"
input_file = os.path.join(self.feature_path, file_name)
else:
input_file = video_name
            assert os.path.isfile(input_file), "{0} does not exist".format(
                input_file)
files.append(input_file)
self.file_name_list.append(input_file.split('/')[-1].split('.')[0])
return files
def preprocess(self, input_file):
"""
input_file: str, feature file list txt path
return: list
"""
output_list = []
data = np.load(input_file)
results = {'video_feat': data, 'video_gt': None}
ops = []
for op in ops:
results = op(results)
res = np.expand_dims(results['video_feat'], axis=0).copy()
output_list.append(res)
return output_list
def postprocess(self, output, print_output=True):
        result_path = os.path.join("./inference/infer_results/")
        if not os.path.isdir(result_path):
            os.makedirs(result_path)
output = [output]
for outputs in output:
output_np = outputs[0]
recognition = []
for i in range(output_np.shape[0]):
recognition = np.concatenate((recognition, [
list(self.actions_dict.keys())[list(
self.actions_dict.values()).index(output_np[i])]
]))
recog_content = list(recognition)
recog_content = [line + "\n" for line in recog_content]
filename = self.file_name_list.pop(0)
            write_path = os.path.join(result_path, filename + ".txt")
            with open(write_path, "w") as f:
                f.writelines(recog_content)
            print("Result written to: " + write_path)
@INFERENCE.register()
class ASRF_Inference_helper(Base_Inference_helper):
def __init__(self,
num_channels,
actions_map_file_path,
postprocessing_method,
boundary_threshold,
feature_path=None):
self.num_channels = num_channels
        with open(actions_map_file_path, 'r') as file_ptr:
            actions = file_ptr.read().split('\n')[:-1]
self.actions_dict = dict()
for a in actions:
self.actions_dict[a.split()[1]] = int(a.split()[0])
self.postprocessing_method = postprocessing_method
self.boundary_threshold = boundary_threshold
self.feature_path = feature_path
self.file_name_list = []
def get_process_file(self, input_file_txt):
with open(input_file_txt, 'r') as file_ptr:
info = file_ptr.read().split('\n')[:-1]
files = []
for video_name in info:
if self.feature_path is not None:
file_name = video_name.split('.')[0] + ".npy"
input_file = os.path.join(self.feature_path, file_name)
else:
input_file = video_name
            assert os.path.isfile(input_file), "{0} does not exist".format(
                input_file)
files.append(input_file)
self.file_name_list.append(input_file.split('/')[-1].split('.')[0])
return files
def preprocess(self, input_file):
"""
input_file: str, feature file list txt path
return: list
"""
output_list = []
data = np.load(input_file)
results = {'video_feat': data, 'video_gt': None}
ops = []
for op in ops:
results = op(results)
res = np.expand_dims(results['video_feat'], axis=0).copy()
output_list.append(res)
return output_list
def postprocess(self, output, print_output=True):
        result_path = os.path.join("./inference/infer_results/")
        if not os.path.isdir(result_path):
            os.makedirs(result_path)
output = [output]
for outputs in output:
outputs_cls_np = outputs[0]
outputs_boundary_np = outputs[1]
output_np = ASRFPostProcessing(
outputs_cls_np,
outputs_boundary_np,
self.postprocessing_method,
boundary_threshold=self.boundary_threshold).numpy()[0, :]
recognition = []
for i in range(output_np.shape[0]):
recognition = np.concatenate((recognition, [
list(self.actions_dict.keys())[list(
self.actions_dict.values()).index(output_np[i])]
]))
recog_content = list(recognition)
recog_content = [line + "\n" for line in recog_content]
filename = self.file_name_list.pop(0)
            write_path = os.path.join(result_path, filename + ".txt")
            with open(write_path, "w") as f:
                f.writelines(recog_content)
            print("Result written to: " + write_path)
@INFERENCE.register()
class AttentionLSTM_Inference_helper(Base_Inference_helper):
def __init__(
self,
            num_classes,  # the number of classes to be classified
feature_num,
feature_dims,
embedding_size,
lstm_size,
top_k=1):
self.num_classes = num_classes
self.feature_num = feature_num
self.feature_dims = feature_dims
self.embedding_size = embedding_size
self.lstm_size = lstm_size
self.top_k = top_k
def preprocess(self, input_file):
"""
input_file: str, file path
return: list
"""
        assert os.path.isfile(input_file), "{0} does not exist".format(
            input_file)
results = {'filename': input_file}
ops = [FeatureDecoder(num_classes=self.num_classes, has_label=False)]
for op in ops:
results = op(results)
res = []
for modality in ['rgb', 'audio']:
res.append(
np.expand_dims(results[f'{modality}_data'], axis=0).copy())
res.append(
np.expand_dims(results[f'{modality}_len'], axis=0).copy())
res.append(
np.expand_dims(results[f'{modality}_mask'], axis=0).copy())
return res
@INFERENCE.register()
class TransNetV2_Inference_helper():
def __init__(self,
num_frames,
height,
width,
num_channels,
threshold=0.5,
output_path=None,
visualize=True):
self._input_size = (height, width, num_channels)
self.output_path = output_path
self.len_frames = 0
self.threshold = threshold
self.visualize = visualize
def input_iterator(self, frames):
# return windows of size 100 where the first/last 25 frames are from the previous/next batch
# the first and last window must be padded by copies of the first and last frame of the video
no_padded_frames_start = 25
no_padded_frames_end = 25 + 50 - (
len(frames) % 50 if len(frames) % 50 != 0 else 50) # 25 - 74
start_frame = np.expand_dims(frames[0], 0)
end_frame = np.expand_dims(frames[-1], 0)
padded_inputs = np.concatenate([start_frame] * no_padded_frames_start +
[frames] +
[end_frame] * no_padded_frames_end, 0)
ptr = 0
while ptr + 100 <= len(padded_inputs):
out = padded_inputs[ptr:ptr + 100]
out = out.astype(np.float32)
ptr += 50
yield out[np.newaxis]
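    # Worked example of the padding arithmetic (hypothetical 130-frame video):
    # 130 % 50 == 30, so no_padded_frames_end = 25 + 50 - 30 = 45 and the
    # padded sequence has 25 + 130 + 45 = 200 frames, yielding
    # (200 - 100) // 50 + 1 = 3 windows; postprocess() later keeps only the
    # middle 50 frames (25:75) of each window.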
def preprocess(self, input_file):
"""
input_file: str, file path
return: iterator
"""
try:
import ffmpeg
except ImportError as e:
            print(
                f"Warning! {e}, the [ffmpeg-python] package and its dependencies are required for TransNetV2."
            )
        assert os.path.isfile(input_file), "{0} does not exist".format(
            input_file)
self.input_file = input_file
self.filename = os.path.splitext(os.path.split(self.input_file)[1])[0]
video_stream, err = ffmpeg.input(
self.input_file).output("pipe:",
format="rawvideo",
pix_fmt="rgb24",
s="48x27").run(capture_stdout=True,
capture_stderr=True)
self.frames = np.frombuffer(video_stream,
np.uint8).reshape([-1, 27, 48, 3])
self.len_frames = len(self.frames)
return self.input_iterator(self.frames)
def predictions_to_scenes(self, predictions):
predictions = (predictions > self.threshold).astype(np.uint8)
scenes = []
t, t_prev, start = -1, 0, 0
for i, t in enumerate(predictions):
if t_prev == 1 and t == 0:
start = i
if t_prev == 0 and t == 1 and i != 0:
scenes.append([start, i])
t_prev = t
if t == 0:
scenes.append([start, i])
# just fix if all predictions are 1
if len(scenes) == 0:
return np.array([[0, len(predictions) - 1]], dtype=np.int32)
return np.array(scenes, dtype=np.int32)
def visualize_predictions(self, frames, predictions):
from PIL import Image, ImageDraw
if isinstance(predictions, np.ndarray):
predictions = [predictions]
ih, iw, ic = frames.shape[1:]
width = 25
# pad frames so that length of the video is divisible by width
# pad frames also by len(predictions) pixels in width in order to show predictions
pad_with = width - len(frames) % width if len(
frames) % width != 0 else 0
frames = np.pad(frames, [(0, pad_with), (0, 1), (0, len(predictions)),
(0, 0)])
predictions = [np.pad(x, (0, pad_with)) for x in predictions]
height = len(frames) // width
img = frames.reshape([height, width, ih + 1, iw + len(predictions), ic])
img = np.concatenate(np.split(
np.concatenate(np.split(img, height), axis=2)[0], width),
axis=2)[0, :-1]
img = Image.fromarray(img)
draw = ImageDraw.Draw(img)
# iterate over all frames
for i, pred in enumerate(zip(*predictions)):
x, y = i % width, i // width
x, y = x * (iw + len(predictions)) + iw, y * (ih + 1) + ih - 1
# we can visualize multiple predictions per single frame
for j, p in enumerate(pred):
color = [0, 0, 0]
color[(j + 1) % 3] = 255
value = round(p * (ih - 1))
if value != 0:
draw.line((x + j, y, x + j, y - value),
fill=tuple(color),
width=1)
return img
def postprocess(self, outputs, print_output=True):
"""
output: list
"""
predictions = []
for output in outputs:
single_frame_logits, all_frames_logits = output
single_frame_pred = F.sigmoid(paddle.to_tensor(single_frame_logits))
all_frames_pred = F.sigmoid(paddle.to_tensor(all_frames_logits))
predictions.append((single_frame_pred.numpy()[0, 25:75, 0],
all_frames_pred.numpy()[0, 25:75, 0]))
single_frame_pred = np.concatenate(
[single_ for single_, all_ in predictions])
all_frames_pred = np.concatenate(
[all_ for single_, all_ in predictions])
        single_frame_predictions = single_frame_pred[:self.len_frames]
        all_frame_predictions = all_frames_pred[:self.len_frames]
scenes = self.predictions_to_scenes(single_frame_predictions)
if print_output:
print("Current video file: {0}".format(self.input_file))
print("\tShot Boundarys: {0}".format(scenes))
if self.output_path:
if not os.path.exists(self.output_path):
os.makedirs(self.output_path)
predictions = np.stack(
[single_frame_predictions, all_frame_predictions], 1)
predictions_file = os.path.join(self.output_path,
self.filename + "_predictions.txt")
np.savetxt(predictions_file, predictions, fmt="%.6f")
scenes_file = os.path.join(self.output_path,
self.filename + "_scenes.txt")
np.savetxt(scenes_file, scenes, fmt="%d")
if self.visualize:
pil_image = self.visualize_predictions(
self.frames,
predictions=(single_frame_predictions,
all_frame_predictions))
image_file = os.path.join(self.output_path,
self.filename + "_vis.png")
pil_image.save(image_file)
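def _scenes_example():
    # Hedged sketch of predictions_to_scenes() on synthetic probabilities (all
    # values hypothetical): a single frame above the 0.5 threshold splits the
    # sequence into two scenes.
    helper = TransNetV2_Inference_helper(num_frames=100,
                                         height=27,
                                         width=48,
                                         num_channels=3,
                                         threshold=0.5)
    preds = np.array([0.1, 0.1, 0.9, 0.1, 0.1])
    return helper.predictions_to_scenes(preds)  # -> [[0, 2], [3, 4]]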
@INFERENCE.register()
class ADDS_Inference_helper(Base_Inference_helper):
def __init__(self,
frame_idxs=[0],
num_scales=4,
side_map={
"2": 2,
"3": 3,
"l": 2,
"r": 3
},
height=256,
width=512,
full_res_shape=None,
num_channels=None,
img_ext=".png",
K=None):
self.frame_idxs = frame_idxs
self.num_scales = num_scales
self.side_map = side_map
self.full_res_shape = full_res_shape
self.img_ext = img_ext
self.height = height
self.width = width
self.K = K
def preprocess(self, input_file):
"""
input_file: str, file path
return: list
"""
        assert os.path.isfile(input_file), "{0} does not exist".format(
            input_file)
results = {
'filename': input_file,
'mode': 'infer',
'day_or_night': 'day',
}
ops = [
ImageDecoder(
backend='pil',
dataset='kitti',
frame_idxs=self.frame_idxs,
num_scales=self.num_scales,
side_map=self.side_map,
full_res_shape=self.full_res_shape,
img_ext=self.img_ext,
),
GroupResize(
height=self.height,
width=self.width,
K=self.K,
scale=1,
mode='infer',
),
ToArray(),
]
for op in ops:
results = op(results)
res = results['imgs'][('color', 0, 0)]
res = np.expand_dims(res, axis=0).copy()
return [res]
def postprocess(self, output, print_output, save_dir='data/'):
"""
output: list
"""
if not isinstance(self.input_file, list):
self.input_file = [
self.input_file,
]
N = len(self.input_file)
for i in range(N):
pred_depth = output[i] # [H, W]
if print_output:
print("Current input image: {0}".format(self.input_file[i]))
file_name = os.path.basename(self.input_file[i]).split('.')[0]
save_path = os.path.join(save_dir,
file_name + "_depth" + ".png")
pred_depth_color = self._convertPNG(pred_depth)
pred_depth_color.save(save_path)
print(f"pred depth image saved to: {save_path}")
    def _convertPNG(self, image_numpy):
        disp_resized = cv2.resize(image_numpy, (1280, 640))
        vmax = np.percentile(disp_resized, 95)
        normalizer = mpl.colors.Normalize(vmin=disp_resized.min(), vmax=vmax)
        mapper = cm.ScalarMappable(norm=normalizer, cmap='magma')
        colormapped_im = (mapper.to_rgba(disp_resized)[:, :, :3] *
                          255).astype(np.uint8)
        im = Image.fromarray(colormapped_im)
        return im
@INFERENCE.register()
class AVA_SlowFast_FastRCNN_Inference_helper(Base_Inference_helper):
def __init__(self,
detection_model_name,
detection_model_weights,
config_file_path,
predict_stepsize=8,
output_stepsize=4,
output_fps=6,
out_filename='ava_det_demo.mp4',
num_frames=32,
alpha=4,
target_size=256):
self.detection_model_name = detection_model_name
self.detection_model_weights = detection_model_weights
self.config = get_config(config_file_path,
show=False) #parse config file
self.predict_stepsize = predict_stepsize
self.output_stepsize = output_stepsize
self.output_fps = output_fps
self.out_filename = out_filename
self.num_frames = num_frames
self.alpha = alpha
self.target_size = target_size
def preprocess(self, input_file):
"""
input_file: str, file path
"""
frame_dir = 'tmp_frames'
self.frame_paths, frames, FPS = frame_extraction(input_file, frame_dir)
        num_frame = len(self.frame_paths)  # video duration in seconds * FPS
assert num_frame != 0
        # height and width of the frame images
h, w, _ = frames[0].shape
# Get clip_len, frame_interval and calculate center index of each clip
        data_process_pipeline = build_pipeline(
            self.config.PIPELINE.test)  # test-time data processing pipeline config
clip_len = self.config.PIPELINE.test.sample['clip_len']
assert clip_len % 2 == 0, 'We would like to have an even clip_len'
frame_interval = self.config.PIPELINE.test.sample['frame_interval']
        # one key frame is sampled per second here
window_size = clip_len * frame_interval
timestamps = np.arange(window_size // 2,
(num_frame + 1 - window_size // 2),
self.predict_stepsize)
selected_frame_list = []
for timestamp in timestamps:
selected_frame_list.append(self.frame_paths[timestamp - 1])
# Load label_map
label_map_path = self.config.DATASET.test['label_file']
self.categories, self.class_whitelist = read_labelmap(
open(label_map_path))
label_map = {}
for item in self.categories:
id = item['id']
name = item['name']
label_map[id] = name
self.label_map = label_map
detection_result_dir = 'tmp_detection'
detection_model_name = self.detection_model_name
detection_model_weights = self.detection_model_weights
detection_txt_list = detection_inference(selected_frame_list,
detection_result_dir,
detection_model_name,
detection_model_weights)
assert len(detection_txt_list) == len(timestamps)
human_detections = []
data_list = []
person_num_list = []
for timestamp, detection_txt_path in zip(timestamps,
detection_txt_list):
proposals, scores = get_detection_result(
detection_txt_path, h, w,
(float)(self.config.DATASET.test['person_det_score_thr']))
if proposals.shape[0] == 0:
#person_num_list.append(0)
human_detections.append(None)
continue
human_detections.append(proposals)
result = get_timestep_result(frame_dir,
timestamp,
clip_len,
frame_interval,
FPS=FPS)
result["proposals"] = proposals
result["scores"] = scores
new_result = data_process_pipeline(result)
proposals = new_result['proposals']
img_slow = new_result['imgs'][0]
img_slow = img_slow[np.newaxis, :]
img_fast = new_result['imgs'][1]
img_fast = img_fast[np.newaxis, :]
proposals = proposals[np.newaxis, :]
scores = scores[np.newaxis, :]
img_shape = np.asarray(new_result['img_shape'])
img_shape = img_shape[np.newaxis, :]
data = [
paddle.to_tensor(img_slow, dtype='float32'),
paddle.to_tensor(img_fast, dtype='float32'),
paddle.to_tensor(proposals, dtype='float32'),
paddle.to_tensor(img_shape, dtype='int32')
]
person_num = proposals.shape[1]
person_num_list.append(person_num)
data_list.append(data)
self.human_detections = human_detections
self.person_num_list = person_num_list
self.timestamps = timestamps
self.frame_dir = frame_dir
self.detection_result_dir = detection_result_dir
return data_list
def postprocess(self, outputs, print_output=True):
"""
output: list
"""
predictions = []
assert len(self.person_num_list) == len(outputs)
#print("*** self.human_detections",len( self.human_detections))
#print("*** outputs",len( outputs))
index = 0
for t_index in range(len(self.timestamps)):
if self.human_detections[t_index] is None:
predictions.append(None)
continue
human_detection = self.human_detections[t_index]
output = outputs[index]
            result = output  # length equals the number of classes, background excluded
person_num = self.person_num_list[index]
index = index + 1
prediction = []
if human_detection is None:
predictions.append(None)
continue
# N proposals
for i in range(person_num):
prediction.append([])
# Perform action score thr
for i in range(len(result)): # for class
if i + 1 not in self.class_whitelist:
continue
for j in range(person_num):
if result[i][j, 4] > self.config.MODEL.head['action_thr']:
                        prediction[j].append(
                            (self.label_map[i + 1],
                             result[i][j, 4]))  # label_map is a dict; label indices start from 1
predictions.append(prediction)
results = []
for human_detection, prediction in zip(self.human_detections,
predictions):
results.append(pack_result(human_detection, prediction))
def dense_timestamps(timestamps, n):
"""Make it nx frames."""
old_frame_interval = (timestamps[1] - timestamps[0])
start = timestamps[0] - old_frame_interval / n * (n - 1) / 2
new_frame_inds = np.arange(
len(timestamps) * n) * old_frame_interval / n + start
            return new_frame_inds.astype(np.int64)
        dense_n = int(self.predict_stepsize / self.output_stepsize)
frames = [
cv2.imread(self.frame_paths[i - 1])
for i in dense_timestamps(self.timestamps, dense_n)
]
vis_frames = visualize(frames, results)
try:
import moviepy.editor as mpy
except ImportError:
raise ImportError('Please install moviepy to enable output file')
vid = mpy.ImageSequenceClip([x[:, :, ::-1] for x in vis_frames],
fps=self.output_fps)
vid.write_videofile(self.out_filename)
print("finish write !")
# delete tmp files and dirs
shutil.rmtree(self.frame_dir)
shutil.rmtree(self.detection_result_dir)
@INFERENCE.register()
class PoseC3D_Inference_helper(Base_Inference_helper):
def __init__(self, top_k=1):
self.top_k = top_k
def preprocess(self, input_file):
"""
input_file: str, file path
return: list
"""
        assert os.path.isfile(input_file), "{0} does not exist".format(
            input_file)
with open(input_file, 'rb') as f:
data = pickle.load(f)
self.input_file = input_file
left_kp = [1, 3, 5, 7, 9, 11, 13, 15]
right_kp = [2, 4, 6, 8, 10, 12, 14, 16]
ops = [
UniformSampleFrames(clip_len=48, num_clips=10, test_mode=True),
PoseDecode(),
PoseCompact(hw_ratio=1., allow_imgpad=True),
Resize(scale=(-1, 56)),
CenterCrop_V2(crop_size=56),
GeneratePoseTarget(sigma=0.6,
use_score=True,
with_kp=True,
with_limb=False,
double=True,
left_kp=left_kp,
right_kp=right_kp),
FormatShape(input_format='NCTHW'),
Collect(keys=['imgs', 'label'], meta_keys=[])
]
        results = data
        for op in ops:
            results = op(results)
results = [results[0][np.newaxis, :, :, :, :, :]]
self.num_segs = results[0].shape[1]
return results
def postprocess(self, outputs, print_output=True):
batch_size = outputs[0].shape[0]
cls_score = outputs[0].reshape(
[batch_size // self.num_segs, self.num_segs, outputs[0].shape[-1]])
output = F.softmax(paddle.to_tensor(cls_score),
axis=2).mean(axis=1).numpy()
        if not isinstance(self.input_file, list):
            self.input_file = [self.input_file]
        N = len(self.input_file)
for i in range(N):
classes = np.argpartition(output[i], -self.top_k)[-self.top_k:]
classes = classes[np.argsort(-output[i, classes])]
scores = output[i, classes]
if print_output:
print("Current video file: {0}".format(self.input_file[i]))
for j in range(self.top_k):
print("\ttop-{0} class: {1}".format(j + 1, classes[j]))
print("\ttop-{0} score: {1}".format(j + 1, scores[j]))
@INFERENCE.register()
class YOWO_Inference_helper(Base_Inference_helper):
def __init__(self,
num_seg=16,
target_size=224,
nms_thresh=0.5,
conf_thresh_valid=0.5,
mean=[0.4345, 0.4051, 0.3775],
std=[0.2768, 0.2713, 0.2737]):
self.num_seg = num_seg
self.target_size = target_size
self.nms_thresh = nms_thresh
self.conf_thresh_valid = conf_thresh_valid
self.mean = mean
self.std = std
def preprocess(self, input_file):
"""
input_file: str, file path
return: list
"""
        assert os.path.isfile(input_file), "{0} does not exist".format(
            input_file)
cap = cv2.VideoCapture(input_file)
queue = []
inputs = []
frames = []
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            if len(queue) <= 0:  # at initialization, fill the queue with copies of the first frame
                for i in range(self.num_seg):
                    queue.append(frame)
# Add the read frame to last and pop out the oldest one
queue.append(frame)
queue.pop(0)
            # Resize images
            imgs = [
                cv2.resize(img, (self.target_size, self.target_size),
                           interpolation=cv2.INTER_LINEAR) for img in queue
            ]
# Convert image to CHW keeping BGR order.
imgs = [img.transpose([2, 0, 1]) for img in imgs]
# Image [0, 255] -> [0, 1].
imgs = [img / 255.0 for img in imgs]
imgs = [
np.ascontiguousarray(
img.reshape((3, imgs[0].shape[1], imgs[0].shape[2]))
).astype(np.float32)
for img in imgs
]
# Concat list of images to single ndarray.
imgs = np.concatenate(
[np.expand_dims(img, axis=1) for img in imgs], axis=1
)
imgs = np.ascontiguousarray(imgs)
imgs = np.expand_dims(imgs, axis=0)
imgs = np.expand_dims(imgs, axis=0)
inputs.append(imgs)
frames.append(queue[-1])
return inputs, frames
    def postprocess(self, outputs, frame, filename, save_img=True):
        """
        outputs: list
        frame: ndarray
        """
labels = [
"Basketball", "BasketballDunk", "Biking", "CliffDiving", "CricketBowling",
"Diving", "Fencing", "FloorGymnastics", "GolfSwing", "HorseRiding",
"IceDancing", "LongJump", "PoleVault", "RopeClimbing", "SalsaSpin",
"SkateBoarding", "Skiing", "Skijet", "SoccerJuggling", "Surfing",
"TennisSwing", "TrampolineJumping", "VolleyballSpiking", "WalkingWithDog"]
        nms_thresh = self.nms_thresh
font = cv2.FONT_HERSHEY_SIMPLEX
for out in outputs:
out = paddle.to_tensor(out)
preds = []
all_boxes = get_region_boxes(out)
for i in range(out.shape[0]):
boxes = all_boxes[i]
boxes = nms(boxes, nms_thresh)
for box in boxes:
x1 = round(float(box[0] - box[2] / 2.0) * 320.0)
y1 = round(float(box[1] - box[3] / 2.0) * 240.0)
x2 = round(float(box[0] + box[2] / 2.0) * 320.0)
y2 = round(float(box[1] + box[3] / 2.0) * 240.0)
det_conf = float(box[4])
for j in range((len(box) - 5) // 2):
cls_conf = float(box[5 + 2 * j].item())
prob = det_conf * cls_conf
preds.append([[x1, y1, x2, y2], prob, labels[int(box[6])]])
            for box_idx, dets in enumerate(preds):
                if dets[1] < 0.4:
                    break
                text = dets[2] + ' ' + '{:.2f}'.format(dets[1])
                cv2.rectangle(frame, (dets[0][0], dets[0][1]),
                              (dets[0][2], dets[0][3]), (0, 255, 0), 2)
                cv2.putText(frame, text,
                            (dets[0][0] + 3, dets[0][1] - 5 - 10 * box_idx),
                            font, 0.5, (0, 255, 0), 2)
cv2.imwrite('{}.jpg'.format(filename), frame)
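# Hedged end-to-end sketch for the YOWO helper; `predictor` stands in for any
# callable returning the raw network output for one clip (the name and the
# video path argument below are hypothetical).
def _yowo_example(video_path, predictor):
    helper = YOWO_Inference_helper(num_seg=16, target_size=224)
    inputs, frames = helper.preprocess(video_path)
    for clip_idx, (clip, frame) in enumerate(zip(inputs, frames)):
        outputs = predictor(clip)
        helper.postprocess([outputs],
                           frame,
                           filename="frame_{:05d}".format(clip_idx))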