# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import argparse import paddle import os, sys import copy as cp import cv2 import math try: import ppdet except ImportError as e: print( f"Warning! {e}, [paddledet] package and it's dependencies is required for AVA." ) __dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.append(os.path.abspath(os.path.join(__dir__, '../'))) from paddlevideo.modeling.builder import build_model from paddlevideo.utils import get_config from paddlevideo.loader.builder import build_dataloader, build_dataset, build_pipeline from paddlevideo.metrics.ava_utils import read_labelmap import time from os import path as osp import numpy as np from paddlevideo.utils import get_config import pickle from paddlevideo.utils import (get_logger, load, mkdir, save) import shutil FONTFACE = cv2.FONT_HERSHEY_DUPLEX FONTSCALE = 0.5 FONTCOLOR = (255, 255, 255) # BGR, white MSGCOLOR = (128, 128, 128) # BGR, gray THICKNESS = 1 LINETYPE = 1 def hex2color(h): """Convert the 6-digit hex string to tuple of 3 int value (RGB)""" return (int(h[:2], 16), int(h[2:4], 16), int(h[4:], 16)) plate_blue = '03045e-023e8a-0077b6-0096c7-00b4d8-48cae4' plate_blue = plate_blue.split('-') plate_blue = [hex2color(h) for h in plate_blue] plate_green = '004b23-006400-007200-008000-38b000-70e000' plate_green = plate_green.split('-') plate_green = [hex2color(h) for h in plate_green] def abbrev(name): """Get the abbreviation of label name: 'take (an object) from (a person)' -> 'take ... from ...' """ while name.find('(') != -1: st, ed = name.find('('), name.find(')') name = name[:st] + '...' + name[ed + 1:] return name # annotations is pred results def visualize(frames, annotations, plate=plate_blue, max_num=5): """Visualize frames with predicted annotations. Args: frames (list[np.ndarray]): Frames for visualization, note that len(frames) % len(annotations) should be 0. annotations (list[list[tuple]]): The predicted results. plate (str): The plate used for visualization. Default: plate_blue. max_num (int): Max number of labels to visualize for a person box. Default: 5,目前不能大于5. Returns: list[np.ndarray]: Visualized frames. """ assert max_num + 1 <= len(plate) plate = [x[::-1] for x in plate] frames_ = cp.deepcopy(frames) nf, na = len(frames), len(annotations) assert nf % na == 0 nfpa = len(frames) // len(annotations) anno = None h, w, _ = frames[0].shape # proposals被归一化需要还原真实坐标值 scale_ratio = np.array([w, h, w, h]) for i in range(na): anno = annotations[i] if anno is None: continue for j in range(nfpa): ind = i * nfpa + j frame = frames_[ind] for ann in anno: box = ann[0] label = ann[1] if not len(label): continue score = ann[2] box = (box * scale_ratio).astype(np.int64) st, ed = tuple(box[:2]), tuple(box[2:]) cv2.rectangle(frame, st, ed, plate[0], 2) for k, lb in enumerate(label): if k >= max_num: break text = abbrev(lb) text = ': '.join([text, str(score[k])]) location = (0 + st[0], 18 + k * 18 + st[1]) textsize = cv2.getTextSize(text, FONTFACE, FONTSCALE, THICKNESS)[0] textwidth = textsize[0] diag0 = (location[0] + textwidth, location[1] - 14) diag1 = (location[0], location[1] + 2) cv2.rectangle(frame, diag0, diag1, plate[k + 1], -1) cv2.putText(frame, text, location, FONTFACE, FONTSCALE, FONTCOLOR, THICKNESS, LINETYPE) return frames_ def frame_extraction(video_path, target_dir): """Extract frames given video_path. Args: video_path (str): The video_path. """ if not os.path.exists(target_dir): os.makedirs(target_dir, exist_ok=True) # Should be able to handle videos up to several hours frame_tmpl = osp.join(target_dir, '{:05d}.jpg') vid = cv2.VideoCapture(video_path) FPS = int(vid.get(5)) frames = [] frame_paths = [] flag, frame = vid.read() index = 1 while flag: frames.append(frame) frame_path = frame_tmpl.format(index) frame_paths.append(frame_path) cv2.imwrite(frame_path, frame) index += 1 flag, frame = vid.read() return frame_paths, frames, FPS def parse_args(): def str2bool(v): return v.lower() in ("true", "t", "1") # general params parser = argparse.ArgumentParser("PaddleVideo Inference model script") parser.add_argument('-c', '--config', type=str, default='configs/example.yaml', help='config file path') parser.add_argument('--video_path', help='video file/url') parser.add_argument('-o', '--override', action='append', default=[], help='config options to be overridden') parser.add_argument('-w', '--weights', type=str, help='weights for finetuning or testing') #detection_model_name parser.add_argument('--detection_model_name', help='the name of detection model ') # detection_model_weights parser.add_argument('--detection_model_weights', help='the weights path of detection model ') # params for predict parser.add_argument('--out-filename', default='ava_det_demo.mp4', help='output filename') parser.add_argument('--predict-stepsize', default=8, type=int, help='give out a prediction per n frames') parser.add_argument( '--output-stepsize', default=4, type=int, help=('show one frame per n frames in the demo, we should have: ' 'predict_stepsize % output_stepsize == 0')) parser.add_argument('--output-fps', default=6, type=int, help='the fps of demo video output') return parser.parse_args() # 一帧的结果。根据概率大小进行排序 def pack_result(human_detection, result): """Short summary. Args: human_detection (np.ndarray): Human detection result. result (type): The predicted label of each human proposal. Returns: tuple: Tuple of human proposal, label name and label score. """ results = [] if result is None: return None for prop, res in zip(human_detection, result): res.sort(key=lambda x: -x[1]) results.append((prop, [x[0] for x in res], [x[1] for x in res])) return results # 构造数据处理需要的results def get_timestep_result(frame_dir, timestamp, clip_len, frame_interval, FPS): result = {} result["frame_dir"] = frame_dir frame_num = len(os.listdir(frame_dir)) dir_name = frame_dir.split("/")[-1] result["video_id"] = dir_name result['timestamp'] = timestamp timestamp_str = '{:04d}'.format(timestamp) img_key = dir_name + "," + timestamp_str result['img_key'] = img_key result['shot_info'] = (1, frame_num) result['fps'] = FPS result['suffix'] = '{:05}.jpg' result['timestamp_start'] = 1 result['timestamp_end'] = int(frame_num / result['fps']) return result def detection_inference(frame_paths, output_dir, model_name, weights_path): """Detect human boxes given frame paths. Args: frame_paths (list[str]): The paths of frames to do detection inference. Returns: list[np.ndarray]: The human detection results. """ detection_cfg = ppdet.model_zoo.get_config_file(model_name) detection_cfg = ppdet.core.workspace.load_config(detection_cfg) detection_trainer = ppdet.engine.Trainer(detection_cfg, mode='test') detection_trainer.load_weights(weights_path) print('Performing Human Detection for each frame') detection_trainer.predict(frame_paths, output_dir=output_dir, save_txt=True) print("finish object detection") results = [] for frame_path in frame_paths: (file_dir, file_name) = os.path.split(frame_path) (file_path, ext) = os.path.splitext(frame_path) txt_file_name = file_name.replace(ext, ".txt") txt_path = os.path.join(output_dir, txt_file_name) results.append(txt_path) return results def get_detection_result(txt_file_path, img_h, img_w, person_det_score_thr): """ 根据检测结果文件得到图像中人的检测框(proposals)和置信度(scores) txt_file_path:检测结果存放路径 img_h:图像高度 img_w:图像宽度 """ proposals = [] scores = [] with open(txt_file_path, 'r') as detection_file: lines = detection_file.readlines() for line in lines: # person 0.9842637181282043 0.0 469.1407470703125 944.7770385742188 831.806396484375 items = line.split(" ") if items[0] != 'person': #只要人 continue score = items[1] if (float)(score) < person_det_score_thr: continue x1 = (float(items[2])) / img_w y1 = ((float)(items[3])) / img_h box_w = ((float)(items[4])) box_h = ((float)(items[5])) x2 = (float(items[2]) + box_w) / img_w y2 = (float(items[3]) + box_h) / img_h scores.append(score) proposals.append([x1, y1, x2, y2]) return np.array(proposals), np.array(scores) @paddle.no_grad() def main(args): config = get_config(args.config, show=False) #parse config file # extract frames from video video_path = args.video_path frame_dir = 'tmp_frames' frame_paths, frames, FPS = frame_extraction(video_path, frame_dir) num_frame = len(frame_paths) #视频秒数*FPS assert num_frame != 0 print("Frame Number:", num_frame) # 帧图像高度和宽度 h, w, _ = frames[0].shape # Get clip_len, frame_interval and calculate center index of each clip data_process_pipeline = build_pipeline(config.PIPELINE.test) #测试时输出处理流水配置 clip_len = config.PIPELINE.test.sample['clip_len'] assert clip_len % 2 == 0, 'We would like to have an even clip_len' frame_interval = config.PIPELINE.test.sample['frame_interval'] # 此处关键帧每秒取一个 clip_len = config.PIPELINE.test.sample['clip_len'] assert clip_len % 2 == 0, 'We would like to have an even clip_len' frame_interval = config.PIPELINE.test.sample['frame_interval'] window_size = clip_len * frame_interval timestamps = np.arange(window_size // 2, (num_frame + 1 - window_size // 2), args.predict_stepsize) print("timetamps number:", len(timestamps)) # get selected frame list according to timestamps selected_frame_list = [] for timestamp in timestamps: selected_frame_list.append(frame_paths[timestamp - 1]) # Load label_map label_map_path = config.DATASET.test['label_file'] categories, class_whitelist = read_labelmap(open(label_map_path)) label_map = {} for item in categories: id = item['id'] name = item['name'] label_map[id] = name # Construct model. if config.MODEL.backbone.get('pretrained'): config.MODEL.backbone.pretrained = '' # disable pretrain model init model = build_model(config.MODEL) model.eval() state_dicts = load(args.weights) model.set_state_dict(state_dicts) detection_result_dir = 'tmp_detection' detection_model_name = args.detection_model_name detection_model_weights = args.detection_model_weights detection_txt_list = detection_inference(selected_frame_list, detection_result_dir, detection_model_name, detection_model_weights) assert len(detection_txt_list) == len(timestamps) print('Performing SpatioTemporal Action Detection for each clip') human_detections = [] predictions = [] index = 0 for timestamp, detection_txt_path in zip(timestamps, detection_txt_list): proposals, scores = get_detection_result( detection_txt_path, h, w, (float)(config.DATASET.test['person_det_score_thr'])) if proposals.shape[0] == 0: predictions.append(None) human_detections.append(None) continue human_detections.append(proposals) result = get_timestep_result(frame_dir, timestamp, clip_len, frame_interval, FPS=FPS) result["proposals"] = proposals result["scores"] = scores new_result = data_process_pipeline(result) proposals = new_result['proposals'] img_slow = new_result['imgs'][0] img_slow = img_slow[np.newaxis, :] img_fast = new_result['imgs'][1] img_fast = img_fast[np.newaxis, :] proposals = proposals[np.newaxis, :] scores = scores[np.newaxis, :] img_shape = np.asarray(new_result['img_shape']) img_shape = img_shape[np.newaxis, :] data = [ paddle.to_tensor(img_slow, dtype='float32'), paddle.to_tensor(img_fast, dtype='float32'), paddle.to_tensor(proposals, dtype='float32'), scores, paddle.to_tensor(img_shape, dtype='int32') ] with paddle.no_grad(): result = model(data, mode='infer') result = result[0] prediction = [] person_num = proposals.shape[1] # N proposals for i in range(person_num): prediction.append([]) # Perform action score thr for i in range(len(result)): if i + 1 not in class_whitelist: continue for j in range(person_num): if result[i][j, 4] > config.MODEL.head['action_thr']: prediction[j].append((label_map[i + 1], result[i][j, 4])) predictions.append(prediction) index = index + 1 if index % 10 == 0: print(index, "/", len(timestamps)) results = [] for human_detection, prediction in zip(human_detections, predictions): results.append(pack_result(human_detection, prediction)) def dense_timestamps(timestamps, n): """Make it nx frames.""" old_frame_interval = (timestamps[1] - timestamps[0]) start = timestamps[0] - old_frame_interval / n * (n - 1) / 2 new_frame_inds = np.arange( len(timestamps) * n) * old_frame_interval / n + start return new_frame_inds.astype(np.int) dense_n = int(args.predict_stepsize / args.output_stepsize) #30 frames = [ cv2.imread(frame_paths[i - 1]) for i in dense_timestamps(timestamps, dense_n) ] vis_frames = visualize(frames, results) try: import moviepy.editor as mpy except ImportError: raise ImportError('Please install moviepy to enable output file') vid = mpy.ImageSequenceClip([x[:, :, ::-1] for x in vis_frames], fps=args.output_fps) vid.write_videofile(args.out_filename) print("finish write !") # delete tmp files and dirs shutil.rmtree(frame_dir) shutil.rmtree(detection_result_dir) if __name__ == '__main__': args = parse_args() #解析参数 main(args)