# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import paddle
import os
import sys
import copy as cp
import cv2
import math
try:
    import ppdet
except ImportError as e:
    print(
        f"Warning! {e}. The [paddledet] package and its dependencies are required for AVA."
    )
__dir__ = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.abspath(os.path.join(__dir__, '../')))
from paddlevideo.modeling.builder import build_model
from paddlevideo.utils import get_config
from paddlevideo.loader.builder import build_dataloader, build_dataset, build_pipeline
from paddlevideo.metrics.ava_utils import read_labelmap
import time
from os import path as osp
import numpy as np
import pickle
from paddlevideo.utils import (get_logger, load, mkdir, save)
import shutil
FONTFACE = cv2.FONT_HERSHEY_DUPLEX
FONTSCALE = 0.5
FONTCOLOR = (255, 255, 255) # BGR, white
MSGCOLOR = (128, 128, 128) # BGR, gray
THICKNESS = 1
LINETYPE = 1
def hex2color(h):
    """Convert a 6-digit hex string to a tuple of three ints (RGB)."""
    return (int(h[:2], 16), int(h[2:4], 16), int(h[4:], 16))
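# e.g. hex2color('03045e') -> (3, 4, 94)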
plate_blue = '03045e-023e8a-0077b6-0096c7-00b4d8-48cae4'
plate_blue = plate_blue.split('-')
plate_blue = [hex2color(h) for h in plate_blue]
plate_green = '004b23-006400-007200-008000-38b000-70e000'
plate_green = plate_green.split('-')
plate_green = [hex2color(h) for h in plate_green]
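# NOTE: the plates above are RGB tuples; visualize() reverses each color to
# BGR before drawing, since OpenCV expects BGR channel order.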
def abbrev(name):
"""Get the abbreviation of label name:
'take (an object) from (a person)' -> 'take ... from ...'
"""
while name.find('(') != -1:
st, ed = name.find('('), name.find(')')
name = name[:st] + '...' + name[ed + 1:]
return name
# `annotations` holds the prediction results
def visualize(frames, annotations, plate=plate_blue, max_num=5):
"""Visualize frames with predicted annotations.
Args:
frames (list[np.ndarray]): Frames for visualization, note that
len(frames) % len(annotations) should be 0.
annotations (list[list[tuple]]): The predicted results.
        plate (list[tuple]): The color plate used for visualization.
            Default: plate_blue.
        max_num (int): Max number of labels to visualize for a person box.
            Default: 5 (currently must not exceed 5).
Returns:
list[np.ndarray]: Visualized frames.
"""
assert max_num + 1 <= len(plate)
plate = [x[::-1] for x in plate]
frames_ = cp.deepcopy(frames)
nf, na = len(frames), len(annotations)
assert nf % na == 0
nfpa = len(frames) // len(annotations)
anno = None
h, w, _ = frames[0].shape
    # proposals are normalized; scale them back to absolute pixel coordinates
scale_ratio = np.array([w, h, w, h])
for i in range(na):
anno = annotations[i]
if anno is None:
continue
for j in range(nfpa):
ind = i * nfpa + j
frame = frames_[ind]
for ann in anno:
box = ann[0]
label = ann[1]
if not len(label):
continue
score = ann[2]
box = (box * scale_ratio).astype(np.int64)
st, ed = tuple(box[:2]), tuple(box[2:])
cv2.rectangle(frame, st, ed, plate[0], 2)
for k, lb in enumerate(label):
if k >= max_num:
break
text = abbrev(lb)
text = ': '.join([text, str(score[k])])
location = (0 + st[0], 18 + k * 18 + st[1])
textsize = cv2.getTextSize(text, FONTFACE, FONTSCALE,
THICKNESS)[0]
textwidth = textsize[0]
diag0 = (location[0] + textwidth, location[1] - 14)
diag1 = (location[0], location[1] + 2)
cv2.rectangle(frame, diag0, diag1, plate[k + 1], -1)
cv2.putText(frame, text, location, FONTFACE, FONTSCALE,
FONTCOLOR, THICKNESS, LINETYPE)
return frames_
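# Usage sketch: vis_frames = visualize(frames, results); len(frames) must be
# an integer multiple of len(results), one annotation per group of frames.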
def frame_extraction(video_path, target_dir):
    """Extract frames from a video into target_dir.
    Args:
        video_path (str): Path of the input video.
        target_dir (str): Directory to save the extracted frames.
    Returns:
        tuple: (frame_paths, frames, FPS) of the extracted frames.
    """
if not os.path.exists(target_dir):
os.makedirs(target_dir, exist_ok=True)
# Should be able to handle videos up to several hours
frame_tmpl = osp.join(target_dir, '{:05d}.jpg')
vid = cv2.VideoCapture(video_path)
    FPS = int(vid.get(cv2.CAP_PROP_FPS))
frames = []
frame_paths = []
flag, frame = vid.read()
index = 1
while flag:
frames.append(frame)
frame_path = frame_tmpl.format(index)
frame_paths.append(frame_path)
cv2.imwrite(frame_path, frame)
index += 1
flag, frame = vid.read()
return frame_paths, frames, FPS
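# Usage sketch (illustrative paths):
#     frame_paths, frames, fps = frame_extraction('demo.mp4', 'tmp_frames')
# Frames are written as tmp_frames/00001.jpg, tmp_frames/00002.jpg, ...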
def parse_args():
def str2bool(v):
return v.lower() in ("true", "t", "1")
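    # str2bool is a small helper for parsing boolean flags; none of the
    # arguments defined below currently use it.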
# general params
parser = argparse.ArgumentParser("PaddleVideo Inference model script")
parser.add_argument('-c',
'--config',
type=str,
default='configs/example.yaml',
help='config file path')
parser.add_argument('--video_path', help='video file/url')
parser.add_argument('-o',
'--override',
action='append',
default=[],
help='config options to be overridden')
parser.add_argument('-w',
'--weights',
type=str,
help='weights for finetuning or testing')
#detection_model_name
parser.add_argument('--detection_model_name',
help='the name of detection model ')
# detection_model_weights
parser.add_argument('--detection_model_weights',
help='the weights path of detection model ')
# params for predict
parser.add_argument('--out-filename',
default='ava_det_demo.mp4',
help='output filename')
parser.add_argument('--predict-stepsize',
default=8,
type=int,
help='give out a prediction per n frames')
parser.add_argument(
'--output-stepsize',
default=4,
type=int,
help=('show one frame per n frames in the demo, we should have: '
'predict_stepsize % output_stepsize == 0'))
parser.add_argument('--output-fps',
default=6,
type=int,
help='the fps of demo video output')
return parser.parse_args()
# Results for one frame, sorted by score in descending order
def pack_result(human_detection, result):
"""Short summary.
Args:
human_detection (np.ndarray): Human detection result.
result (type): The predicted label of each human proposal.
Returns:
tuple: Tuple of human proposal, label name and label score.
"""
results = []
if result is None:
return None
for prop, res in zip(human_detection, result):
res.sort(key=lambda x: -x[1])
results.append((prop, [x[0] for x in res], [x[1] for x in res]))
return results
# Build the `results` dict required by the data-processing pipeline
def get_timestep_result(frame_dir, timestamp, clip_len, frame_interval, FPS):
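    """Build the input dict consumed by the AVA test pipeline.
    Args:
        frame_dir (str): Directory holding the extracted frames.
        timestamp (int): Frame index of the key frame to predict on.
        clip_len (int): Clip length (accepted for interface symmetry;
            not used inside this function).
        frame_interval (int): Frame sampling interval (likewise unused here).
        FPS (int): Frames per second of the source video.
    Returns:
        dict: Fields such as 'frame_dir', 'img_key', 'shot_info' and 'fps'
            that the data-processing pipeline expects.
    """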
result = {}
result["frame_dir"] = frame_dir
frame_num = len(os.listdir(frame_dir))
dir_name = frame_dir.split("/")[-1]
result["video_id"] = dir_name
result['timestamp'] = timestamp
timestamp_str = '{:04d}'.format(timestamp)
img_key = dir_name + "," + timestamp_str
result['img_key'] = img_key
result['shot_info'] = (1, frame_num)
result['fps'] = FPS
result['suffix'] = '{:05}.jpg'
result['timestamp_start'] = 1
result['timestamp_end'] = int(frame_num / result['fps'])
return result
def detection_inference(frame_paths, output_dir, model_name, weights_path):
    """Detect human boxes given frame paths.
    Args:
        frame_paths (list[str]): The paths of frames to do detection inference.
        output_dir (str): Directory to save the detection txt results.
        model_name (str): Name of the PaddleDetection model config.
        weights_path (str): Path of the detection model weights.
    Returns:
        list[str]: Paths of the per-frame detection result txt files.
    """
detection_cfg = ppdet.model_zoo.get_config_file(model_name)
detection_cfg = ppdet.core.workspace.load_config(detection_cfg)
detection_trainer = ppdet.engine.Trainer(detection_cfg, mode='test')
detection_trainer.load_weights(weights_path)
print('Performing Human Detection for each frame')
detection_trainer.predict(frame_paths, output_dir=output_dir, save_txt=True)
print("finish object detection")
results = []
for frame_path in frame_paths:
        file_name = os.path.basename(frame_path)
        txt_file_name = os.path.splitext(file_name)[0] + ".txt"
        txt_path = os.path.join(output_dir, txt_file_name)
        results.append(txt_path)
return results
def get_detection_result(txt_file_path, img_h, img_w, person_det_score_thr):
    """Parse a detection result file into human proposals and scores.
    Args:
        txt_file_path (str): Path of the detection result file.
        img_h (int): Image height.
        img_w (int): Image width.
        person_det_score_thr (float): Score threshold for keeping a person box.
    """
proposals = []
scores = []
with open(txt_file_path, 'r') as detection_file:
lines = detection_file.readlines()
        for line in lines:  # e.g. person 0.9842637181282043 0.0 469.1407470703125 944.7770385742188 831.806396484375
            items = line.split(" ")
            if items[0] != 'person':  # keep person detections only
                continue
            score = float(items[1])
            if score < person_det_score_thr:
                continue
            x1 = float(items[2]) / img_w
            y1 = float(items[3]) / img_h
            box_w = float(items[4])
            box_h = float(items[5])
            x2 = (float(items[2]) + box_w) / img_w
            y2 = (float(items[3]) + box_h) / img_h
            scores.append(score)
            proposals.append([x1, y1, x2, y2])
return np.array(proposals), np.array(scores)
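# Each txt line is "label score x1 y1 w h" in absolute pixels; the proposals
# returned above are corner boxes (x1, y1, x2, y2) normalized to [0, 1].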
@paddle.no_grad()
def main(args):
config = get_config(args.config, show=False) #parse config file
# extract frames from video
video_path = args.video_path
frame_dir = 'tmp_frames'
frame_paths, frames, FPS = frame_extraction(video_path, frame_dir)
    num_frame = len(frame_paths)  # video duration in seconds * FPS
assert num_frame != 0
print("Frame Number", num_frame)
    # frame height and width
h, w, _ = frames[0].shape
# Get clip_len, frame_interval and calculate center index of each clip
    data_process_pipeline = build_pipeline(config.PIPELINE.test)  # test-time data pipeline
    clip_len = config.PIPELINE.test.sample['clip_len']
    assert clip_len % 2 == 0, 'We would like to have an even clip_len'
    frame_interval = config.PIPELINE.test.sample['frame_interval']
    # one key frame is sampled per second here
    window_size = clip_len * frame_interval
timestamps = np.arange(window_size // 2, (num_frame + 1 - window_size // 2),
args.predict_stepsize)
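    # Worked example with illustrative values: clip_len=32 and frame_interval=2
    # give window_size=64, so timestamps run from frame 32 up to
    # num_frame - 32, stepping by predict_stepsize (8 by default).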
print("timetamps number:", len(timestamps))
# get selected frame list according to timestamps
selected_frame_list = []
for timestamp in timestamps:
selected_frame_list.append(frame_paths[timestamp - 1])
# Load label_map
label_map_path = config.DATASET.test['label_file']
categories, class_whitelist = read_labelmap(open(label_map_path))
    label_map = {item['id']: item['name'] for item in categories}
# Construct model.
    if config.MODEL.backbone.get('pretrained'):
        config.MODEL.backbone.pretrained = ''  # disable pretrained model init
model = build_model(config.MODEL)
model.eval()
state_dicts = load(args.weights)
model.set_state_dict(state_dicts)
detection_result_dir = 'tmp_detection'
detection_model_name = args.detection_model_name
detection_model_weights = args.detection_model_weights
detection_txt_list = detection_inference(selected_frame_list,
detection_result_dir,
detection_model_name,
detection_model_weights)
assert len(detection_txt_list) == len(timestamps)
print('Performing SpatioTemporal Action Detection for each clip')
human_detections = []
predictions = []
index = 0
for timestamp, detection_txt_path in zip(timestamps, detection_txt_list):
        proposals, scores = get_detection_result(
            detection_txt_path, h, w,
            float(config.DATASET.test['person_det_score_thr']))
if proposals.shape[0] == 0:
predictions.append(None)
human_detections.append(None)
continue
human_detections.append(proposals)
result = get_timestep_result(frame_dir,
timestamp,
clip_len,
frame_interval,
FPS=FPS)
result["proposals"] = proposals
result["scores"] = scores
new_result = data_process_pipeline(result)
proposals = new_result['proposals']
img_slow = new_result['imgs'][0]
img_slow = img_slow[np.newaxis, :]
img_fast = new_result['imgs'][1]
img_fast = img_fast[np.newaxis, :]
proposals = proposals[np.newaxis, :]
scores = scores[np.newaxis, :]
img_shape = np.asarray(new_result['img_shape'])
img_shape = img_shape[np.newaxis, :]
data = [
paddle.to_tensor(img_slow, dtype='float32'),
paddle.to_tensor(img_fast, dtype='float32'),
paddle.to_tensor(proposals, dtype='float32'), scores,
paddle.to_tensor(img_shape, dtype='int32')
]
with paddle.no_grad():
result = model(data, mode='infer')
result = result[0]
prediction = []
person_num = proposals.shape[1]
# N proposals
for i in range(person_num):
prediction.append([])
# Perform action score thr
        for i in range(len(result)):
            if i + 1 not in class_whitelist:
                continue
            for j in range(person_num):
                if result[i][j, 4] > config.MODEL.head['action_thr']:
                    prediction[j].append(
                        (label_map[i + 1], result[i][j, 4]))
predictions.append(prediction)
index = index + 1
if index % 10 == 0:
print(index, "/", len(timestamps))
results = []
for human_detection, prediction in zip(human_detections, predictions):
results.append(pack_result(human_detection, prediction))
    def dense_timestamps(timestamps, n):
        """Make it nx frames."""
        old_frame_interval = (timestamps[1] - timestamps[0])
        start = timestamps[0] - old_frame_interval / n * (n - 1) / 2
        new_frame_inds = np.arange(
            len(timestamps) * n) * old_frame_interval / n + start
        return new_frame_inds.astype(np.int64)  # np.int was removed in NumPy >= 1.24
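    # e.g. dense_timestamps(np.array([32, 40]), 2) -> array([30, 34, 38, 42]):
    # n evenly spaced frame indices are centered on each original timestamp.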
    dense_n = int(args.predict_stepsize / args.output_stepsize)  # e.g. 8 // 4 = 2 with the default args
frames = [
cv2.imread(frame_paths[i - 1])
for i in dense_timestamps(timestamps, dense_n)
]
vis_frames = visualize(frames, results)
try:
import moviepy.editor as mpy
except ImportError:
raise ImportError('Please install moviepy to enable output file')
vid = mpy.ImageSequenceClip([x[:, :, ::-1] for x in vis_frames],
fps=args.output_fps)
vid.write_videofile(args.out_filename)
print("finish write !")
# delete tmp files and dirs
shutil.rmtree(frame_dir)
shutil.rmtree(detection_result_dir)
if __name__ == '__main__':
    args = parse_args()  # parse command-line arguments
main(args)
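# Usage sketch (config, weights and model names below are illustrative):
# python tools/ava_predict.py -c configs/detection/ava/ava.yaml \
#     --video_path ./demo.mp4 \
#     -w ./AVA_SlowFast.pdparams \
#     --detection_model_name faster_rcnn/faster_rcnn_r50_fpn_1x_coco \
#     --detection_model_weights ./faster_rcnn_r50_fpn_1x_coco.pdparams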