diff --git a/Bank_second_part/detect_process/PP_TSMv2_infer.py b/Bank_second_part/detect_process/PP_TSMv2_infer.py
new file mode 100644
index 0000000..0a2b9de
--- /dev/null
+++ b/Bank_second_part/detect_process/PP_TSMv2_infer.py
@@ -0,0 +1,164 @@
+import os
+import os.path as osp
+from paddlevideo.utils.config import get_config
+from paddle.inference import Config, create_predictor
+from tools.utils import build_inference_helper
+
+class PP_TSMv2_predict(object):
+
+    """Initialize the parameters commonly used by the PP-TSMv2 inference model."""
+
+    def __init__(self,use_gpu=True,ir_optim=True,
+                 disable_glog=False,save_name=None,enable_mkldnn=False,
+                 precision="fp32",gpu_mem=8000,cpu_threads=None):
+
+        self.use_gpu = use_gpu  # whether to run on the GPU
+        self.cpu_threads = cpu_threads  # number of CPU threads
+        self.ir_optim = ir_optim  # whether to enable IR optimization
+        self.disable_glog = disable_glog
+        self.gpu_mem = gpu_mem  # GPU memory size
+        self.enable_mkldnn = enable_mkldnn  # whether to enable MKL-DNN
+        self.precision = precision  # MKL-DNN precision
+        self.save_name = save_name  # name used when saving the exported inference model
+
+
+    def parse_file_paths(self,input_path: str) -> list:
+
+        """
+        Collect the input files for the model.
+        input_path: a video file or a directory containing videos
+        """
+        if osp.isfile(input_path):
+            files = [
+                input_path,
+            ]
+        else:
+            files = os.listdir(input_path)
+            files = [
+                file for file in files
+                if (file.endswith(".avi") or file.endswith(".mp4"))
+            ]
+            files = [osp.join(input_path, file) for file in files]
+        return files
+
+
+    def create_paddle_predictor(self,model_f,pretr_p,cfg):
+        """
+        Create the inference engine.
+        model_f: path to the exported inference model file
+        pretr_p: path to the trained parameter file
+        cfg: parsed model configuration
+
+        """
+        config = Config(model_f,pretr_p)
+        if self.use_gpu:
+            config.enable_use_gpu(self.gpu_mem,0)
+        else:
+            config.disable_gpu()
+            if self.cpu_threads:
+                config.set_cpu_math_library_num_threads(self.cpu_threads)
+            if self.enable_mkldnn:
+                config.set_mkldnn_cache_capacity(10)
+                config.enable_mkldnn()
+                if self.precision == "fp16":
+                    config.enable_mkldnn_bfloat16()
+
+        config.switch_ir_optim(self.ir_optim)
+
+        config.enable_memory_optim()
+        config.switch_use_feed_fetch_ops(False)
+
+        if self.disable_glog:
+            config.disable_glog_info()
+
+        predictor = create_predictor(config)
+
+        return config,predictor
+
+    def create_inference_model(self,config,model_f,params_f):
+        """
+        Create the inference helper and the inference engine.
+        config: path to the model config (YAML) file
+        model_f: path to the exported inference model
+        params_f: path to the inference model parameters
+        """
+        cfg = get_config(config, overrides=None, show=False)
+        InferenceHelper = build_inference_helper(cfg.INFERENCE)
+        _, predictor = self.create_paddle_predictor(model_f, params_f, cfg)
+
+        return InferenceHelper,predictor
+
+
+    def predict(self,input_f,batch_size,predictor,InferenceHelper):
+
+        """
+        Run inference on the input videos and return the predictions.
+        input_f: path to the videos to run inference on
+        batch_size: number of samples consumed per inference step, default = 1
+        predictor: the inference engine
+        InferenceHelper: the pre/post-processing helper
+        """
+        result = {}
+
+        # cfg = get_config(config, overrides=None, show=False)
+        # model_name = cfg.model_name
+        # print(f"Inference model({model_name})...")
+
+        # get input_tensor and output_tensor
+        input_names = predictor.get_input_names()
+        output_names = predictor.get_output_names()
+        input_tensor_list = []
+        output_tensor_list = []
+        for item in input_names:
+            input_tensor_list.append(predictor.get_input_handle(item))
+        for item in output_names:
+            output_tensor_list.append(predictor.get_output_handle(item))
+
+        files = self.parse_file_paths(input_f)  # input_path=input_f
+
+        batch_num = batch_size
+        for st_idx in range(0, len(files), batch_num):
+            ed_idx = min(st_idx + batch_num, len(files))
+
+            # preprocess the batch of input videos
+            batched_inputs = InferenceHelper.preprocess_batch(files[st_idx:ed_idx])
+            for i in range(len(input_tensor_list)):
+                input_tensor_list[i].copy_from_cpu(batched_inputs[i])
+
+            # run the inference engine
+            predictor.run()
+
+            batched_outputs = []
+            for j in range(len(output_tensor_list)):
+                batched_outputs.append(output_tensor_list[j].copy_to_cpu())
+
+            # post-process and collect the inference results
+            res = InferenceHelper.postprocess(batched_outputs,False,True)
+            result["video_id"] = res[0]["video_id"]
+            result["topk_class"] = res[0]["topk_class"].tolist()[0]
+            result["topk_scores"] = res[0]["topk_scores"].tolist()[0]
+            # print(result)
+
+        return result
+
+
+
+# def main():
+#     config = 'D:/download/PaddleVideo1/output/output/pptsm_lcnet_k400_16frames_uniform.yaml'  # path to the config file
+#     input_file = 'C:/Users/Administrator/Pictures/video_seg_re_hand/test01_3.avi'  # path to the videos to run inference on
+#     model_file = 'D:/download/PaddleVideo1/output/output/ppTSMv2.pdmodel'  # path to the inference model
+#     params_file = 'D:/download/PaddleVideo1/output/output/ppTSMv2.pdiparams'
+#     batch_size = 1  # inference batch size
+#     infer, predictor = PP_TSMv2_predict().create_inference_model(config, model_file, params_file)
+#     PP_TSMv2_predict().predict(input_file, batch_size, predictor, infer)  # run inference and get predictions
+
+
+
+# if __name__ == "__main__":
+#     main()
+
+
diff --git a/Bank_second_part/detect_process/analysisPoint.py b/Bank_second_part/detect_process/analysisPoint.py
new file mode 100644
index 0000000..2297fdc
--- /dev/null
+++ b/Bank_second_part/detect_process/analysisPoint.py
@@ -0,0 +1,152 @@
+# Copyright 2020 The MediaPipe Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""MediaPipe solution drawing utils."""
+
+import math
+from typing import List, Mapping, Optional, Tuple, Union
+
+import cv2
+import dataclasses
+import matplotlib.pyplot as plt
+import numpy as np
+
+from mediapipe.framework.formats import detection_pb2
+from mediapipe.framework.formats import location_data_pb2
+from mediapipe.framework.formats import landmark_pb2
+
+_PRESENCE_THRESHOLD = 0.5
+_VISIBILITY_THRESHOLD = 0.5
+_BGR_CHANNELS = 3
+
+WHITE_COLOR = (224, 224, 224)
+BLACK_COLOR = (0, 0, 0)
+RED_COLOR = (0, 0, 255)
+GREEN_COLOR = (0, 128, 0)
+BLUE_COLOR = (255, 0, 0)
+
+
+@dataclasses.dataclass
+class DrawingSpec:
+  # Color for drawing the annotation. Default to the white color.
+  color: Tuple[int, int, int] = WHITE_COLOR
+  # Thickness for drawing the annotation. Default to 2 pixels.
+  thickness: int = 2
+  # Circle radius. Default to 2 pixels.
+  circle_radius: int = 2
+
+
+def _normalized_to_pixel_coordinates(
+    normalized_x: float, normalized_y: float, image_width: int,
+    image_height: int) -> Union[None, Tuple[int, int]]:
+  """Converts normalized value pair to pixel coordinates."""
+
+  # Checks if the float value is between 0 and 1.
+  def is_valid_normalized_value(value: float) -> bool:
+    return (value > 0 or math.isclose(0, value)) and (value < 1 or
+                                                      math.isclose(1, value))
+
+  if not (is_valid_normalized_value(normalized_x) and
+          is_valid_normalized_value(normalized_y)):
+    # TODO: Draw coordinates even if it's outside of the image bounds.
+    return None
+  x_px = min(math.floor(normalized_x * image_width), image_width - 1)
+  y_px = min(math.floor(normalized_y * image_height), image_height - 1)
+  return x_px, y_px
+
+
+def draw_landmarks(
+    image: np.ndarray,
+    landmark_list: landmark_pb2.NormalizedLandmarkList,
+    connections: Optional[List[Tuple[int, int]]] = None):
+  """Collects the pixel coordinates of the connected landmarks on the image.
+
+  This is a modified MediaPipe drawing util: instead of drawing the
+  landmarks and connections, it returns the pixel coordinates of the
+  landmarks referenced by the given connections.
+
+  Args:
+    image: A three channel BGR image represented as numpy ndarray.
+    landmark_list: A normalized landmark list proto message to be annotated on
+      the image.
+    connections: A list of landmark index tuples that specifies how landmarks
+      should be connected.
+
+  Returns:
+    A list of (x, y) pixel coordinates of the landmarks that appear as
+    connection end points, or None if no landmark list is given.
+
+  Raises:
+    ValueError: If the input image is not three channel BGR.
+  """
+  if not landmark_list:
+    return
+  if image.shape[2] != _BGR_CHANNELS:
+    raise ValueError('Input image must contain three channel bgr data.')
+  image_rows, image_cols, _ = image.shape
+
+  # map every sufficiently visible landmark index to its pixel coordinates
+  idx_to_coordinates = {}
+  for idx, landmark in enumerate(landmark_list.landmark):
+    # print('landmark:',landmark)
+    if ((landmark.HasField('visibility') and
+         landmark.visibility < _VISIBILITY_THRESHOLD) or
+        (landmark.HasField('presence') and
+         landmark.presence < _PRESENCE_THRESHOLD)):
+      continue
+    landmark_px = _normalized_to_pixel_coordinates(landmark.x, landmark.y,
+                                                   image_cols, image_rows)
+    # print('landmark_px:',landmark_px)
+    if landmark_px:
+      idx_to_coordinates[idx] = landmark_px
+
+
+  if connections:
+    num_landmarks = len(landmark_list.landmark)
+    # print('connections:',connections)
+
+    # Draws the connections if the start and end landmarks are both visible.
+ + start_list = [] + end_list = [] + for connection in connections: + # print(connection) + + start_idx = connection[0] + end_idx = connection[1] + + start_list.append(start_idx) + end_list.append(end_idx) + + + point_list = [] + for point_idx in end_list: + + # if point_idx not in start_list: + + # print(point_idx) + point_list.append(point_idx) + + + point_axis_list = [] + for point in point_list: + + if point in list(idx_to_coordinates.keys()): + point_axis_list.append(idx_to_coordinates[point]) + + + return point_axis_list + \ No newline at end of file diff --git a/Bank_second_part/detect_process/holisticDet.py b/Bank_second_part/detect_process/holisticDet.py new file mode 100644 index 0000000..f282bdf --- /dev/null +++ b/Bank_second_part/detect_process/holisticDet.py @@ -0,0 +1,104 @@ +import cv2 +import mediapipe as mp + +import analysisPoint as mp_drawing +mp_holistic = mp.solutions.holistic +import numpy as np + +class MediapipeProcess: + + def mediapipe_det(image,holistic): + + ''' + 调用模型推理获得检测结果 + ''' + + image.flags.writeable = False + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + results = holistic.process(image) + + return results + + def get_analysis_result(image,results): + + ''' + images: 检测的图片 + results: 图片的检测结果 + 对上述结果进行分析 + ''' + + image.flags.writeable = True + image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) + + face_result = mp_drawing.draw_landmarks( + image, + results.face_landmarks, + mp_holistic.FACEMESH_CONTOURS) + + right_hand_result = mp_drawing.draw_landmarks( + image, + results.right_hand_landmarks, + mp_holistic.HAND_CONNECTIONS) + + left_hand_result = mp_drawing.draw_landmarks( + image, + results.left_hand_landmarks, + mp_holistic.HAND_CONNECTIONS) + + face_bbox = MediapipeProcess.point_to_bbox(face_result) + right_hand_bbox = MediapipeProcess.point_to_bbox(right_hand_result) + left_hand_bbox = MediapipeProcess.point_to_bbox(left_hand_result) + + result_dict = {'face_bbox':[face_bbox],'hand_bbox':[right_hand_bbox,left_hand_bbox]} + + + return result_dict + + + + def point_to_bbox(result_list): + + ''' + 根据关键点坐标,获取坐标点的最小外接矩形 + ''' + + result_array = np.array(result_list) + + if result_array.all(): + + rect = cv2.minAreaRect(result_array) # 得到最小外接矩形的(中心(x,y), (宽,高), 旋转角度) + bbox = cv2.boxPoints(rect) # 获取最小外接矩形的4个顶点坐标(ps: cv2.boxPoints(rect) for OpenCV 3.x) + bbox = np.int0(bbox) + bbox=bbox.tolist() + + left_top = [min(bbox, key=lambda p: p[0])[0], min(bbox, key=lambda p: p[1])[1]] + right_bottom = [max(bbox, key=lambda p: p[0])[0], max(bbox, key=lambda p: p[1])[1]] + + bbox_list = left_top + right_bottom + + # print('bbox:',bbox) + # print('bbox_list:',bbox_list) + + + # bbox_list = [] + + # bbox_list.append(bbox[0][0]) + # bbox_list.append(bbox[0][1]) + # bbox_list.append(bbox[2][0]) + # bbox_list.append(bbox[2][1]) + + return bbox_list + + else: + pass + + + + + + + + +# if __name__ == '__main__': +# # media_holistic(video_file='E:/Bank_files/Bank_02/dataset/video_person/after_1/0711-1_199_0.avi', + # video_save_path='E:/Bank_files/Bank_02/videos_mediapipe/test_data/0725_test') \ No newline at end of file diff --git a/Bank_second_part/detect_process/paddlevideo/__init__.py b/Bank_second_part/detect_process/paddlevideo/__init__.py new file mode 100644 index 0000000..8b03acf --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .version import paddlevideo_version diff --git a/Bank_second_part/detect_process/paddlevideo/__pycache__/__init__.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..0c62e18 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/__pycache__/__init__.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/__pycache__/version.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/__pycache__/version.cpython-310.pyc new file mode 100644 index 0000000..4a30493 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/__pycache__/version.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/__init__.py b/Bank_second_part/detect_process/paddlevideo/loader/__init__.py new file mode 100644 index 0000000..4ed9b11 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from .builder import build_dataset, build_dataloader, build_batch_pipeline +from .dataset import VideoDataset +from .dali_loader import TSN_Dali_loader, get_input_data + +__all__ = [ + 'build_dataset', 'build_dataloader', 'build_batch_pipeline', 'VideoDataset', + 'TSN_Dali_loader', 'get_input_data' +] diff --git a/Bank_second_part/detect_process/paddlevideo/loader/__pycache__/__init__.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..55be032 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/__pycache__/__init__.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/__pycache__/builder.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/__pycache__/builder.cpython-310.pyc new file mode 100644 index 0000000..44939ca Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/__pycache__/builder.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/__pycache__/dali_loader.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/__pycache__/dali_loader.cpython-310.pyc new file mode 100644 index 0000000..b3d04dc Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/__pycache__/dali_loader.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/__pycache__/registry.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/__pycache__/registry.cpython-310.pyc new file mode 100644 index 0000000..ffc2596 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/__pycache__/registry.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/builder.py b/Bank_second_part/detect_process/paddlevideo/loader/builder.py new file mode 100644 index 0000000..23a65c3 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/builder.py @@ -0,0 +1,132 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import signal +import os +import paddle +from paddle.io import DataLoader, DistributedBatchSampler +from .registry import DATASETS, PIPELINES +from ..utils.build_utils import build +from .pipelines.compose import Compose +from paddlevideo.utils import get_logger +from paddlevideo.utils.multigrid import DistributedShortSampler +import numpy as np + +logger = get_logger("paddlevideo") + + +def build_pipeline(cfg): + """Build pipeline. + Args: + cfg (dict): root config dict. + """ + if cfg == None: + return + return Compose(cfg) + + +def build_dataset(cfg): + """Build dataset. + Args: + cfg (dict): root config dict. + + Returns: + dataset: dataset. + """ + #XXX: ugly code here! 
+ cfg_dataset, cfg_pipeline = cfg + cfg_dataset.pipeline = build_pipeline(cfg_pipeline) + dataset = build(cfg_dataset, DATASETS, key="format") + return dataset + + +def build_batch_pipeline(cfg): + + batch_pipeline = build(cfg, PIPELINES) + return batch_pipeline + + +def build_dataloader(dataset, + batch_size, + num_workers, + places, + shuffle=True, + drop_last=True, + multigrid=False, + collate_fn_cfg=None, + **kwargs): + """Build Paddle Dataloader. + + XXX explain how the dataloader work! + + Args: + dataset (paddle.dataset): A PaddlePaddle dataset object. + batch_size (int): batch size on single card. + num_worker (int): num_worker + shuffle(bool): whether to shuffle the data at every epoch. + """ + if multigrid: + sampler = DistributedShortSampler(dataset, + batch_sizes=batch_size, + shuffle=True, + drop_last=True) + else: + sampler = DistributedBatchSampler(dataset, + batch_size=batch_size, + shuffle=shuffle, + drop_last=drop_last) + + #NOTE(shipping): when switch the mix operator on, such as: mixup, cutmix. + # batch like: [[img, label, attibute, ...], [imgs, label, attribute, ...], ...] will recollate to: + # [[img, img, ...], [label, label, ...], [attribute, attribute, ...], ...] as using numpy.transpose. + + def mix_collate_fn(batch): + pipeline = build_batch_pipeline(collate_fn_cfg) + batch = pipeline(batch) + slots = [] + for items in batch: + for i, item in enumerate(items): + if len(slots) < len(items): + slots.append([item]) + else: + slots[i].append(item) + return [np.stack(slot, axis=0) for slot in slots] + + #if collate_fn_cfg is not None: + #ugly code here. collate_fn is mix op config + # collate_fn = mix_collate_fn(collate_fn_cfg) + + data_loader = DataLoader( + dataset, + batch_sampler=sampler, + places=places, + num_workers=num_workers, + collate_fn=mix_collate_fn if collate_fn_cfg is not None else None, + return_list=True, + **kwargs) + + return data_loader + + +def term_mp(sig_num, frame): + """ kill all child processes + """ + pid = os.getpid() + pgid = os.getpgid(os.getpid()) + logger.info("main proc {} exit, kill process group " "{}".format(pid, pgid)) + os.killpg(pgid, signal.SIGKILL) + return + + +signal.signal(signal.SIGINT, term_mp) +signal.signal(signal.SIGTERM, term_mp) diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dali_loader.py b/Bank_second_part/detect_process/paddlevideo/loader/dali_loader.py new file mode 100644 index 0000000..4fb0e28 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/dali_loader.py @@ -0,0 +1,206 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import random +import math + +import paddle +from paddle.distributed import ParallelEnv +import paddle.distributed as dist +from paddlevideo.utils import get_logger + +logger = get_logger("paddlevideo") + +try: + from nvidia.dali.pipeline import Pipeline + import nvidia.dali.ops as ops + import nvidia.dali.types as types + import tempfile + from nvidia.dali.plugin.paddle import DALIGenericIterator +except: + Pipeline = object + + +def get_input_data(data): + return paddle.to_tensor(data[0]['image']), paddle.to_tensor( + data[0]['label']) + + +class TSN_Dali_loader(object): + def __init__(self, cfg): + self.batch_size = cfg.batch_size + self.file_path = cfg.file_path + + self.num_seg = cfg.num_seg + self.seglen = cfg.seglen + self.short_size = cfg.short_size + self.target_size = cfg.target_size + + # set num_shards and shard_id when distributed training is implemented + self.num_shards = dist.get_world_size() + self.shard_id = ParallelEnv().local_rank + self.dali_mean = cfg.mean * (self.num_seg * self.seglen) + self.dali_std = cfg.std * (self.num_seg * self.seglen) + + def build_dali_reader(self): + """ + build dali training reader + """ + def reader_(): + with open(self.file_path) as flist: + full_lines = [line for line in flist] + if (not hasattr(reader_, 'seed')): + reader_.seed = 0 + random.Random(reader_.seed).shuffle(full_lines) + logger.info(f"reader shuffle seed: {reader_.seed}.") + if reader_.seed is not None: + reader_.seed += 1 + + per_node_lines = int( + math.ceil(len(full_lines) * 1.0 / self.num_shards)) + total_lines = per_node_lines * self.num_shards + + # aligned full_lines so that it can evenly divisible + full_lines += full_lines[:(total_lines - len(full_lines))] + assert len(full_lines) == total_lines + + # trainer get own sample + lines = full_lines[self.shard_id:total_lines:self.num_shards] + assert len(lines) == per_node_lines + + logger.info( + f"shard_id: {self.shard_id}, trainer_count: {self.num_shards}" + ) + logger.info( + f"read videos from {self.shard_id * per_node_lines}, " + f"length: {per_node_lines}, " + f"lines length: {len(lines)}, " + f"total: {len(full_lines)}") + + video_files = ''.join([item for item in lines]) + tf = tempfile.NamedTemporaryFile() + tf.write(str.encode(video_files)) + tf.flush() + video_files = tf.name + + device_id = ParallelEnv().local_rank + logger.info(f'---------- device_id: {device_id} -----------') + + pipe = VideoPipe(batch_size=self.batch_size, + num_threads=1, + device_id=device_id, + file_list=video_files, + sequence_length=self.num_seg * self.seglen, + num_seg=self.num_seg, + seg_length=self.seglen, + resize_shorter_scale=self.short_size, + crop_target_size=self.target_size, + is_training=True, + num_shards=self.num_shards, + shard_id=self.shard_id, + dali_mean=self.dali_mean, + dali_std=self.dali_std) + + logger.info( + 'initializing dataset, it will take several minutes if it is too large .... 
' + ) + video_loader = DALIGenericIterator([pipe], ['image', 'label'], + len(lines), + dynamic_shape=True, + auto_reset=True) + + return video_loader + + dali_reader = reader_() + return dali_reader + + +class VideoPipe(Pipeline): + def __init__(self, + batch_size, + num_threads, + device_id, + file_list, + sequence_length, + num_seg, + seg_length, + resize_shorter_scale, + crop_target_size, + is_training=False, + initial_prefetch_size=20, + num_shards=1, + shard_id=0, + dali_mean=0., + dali_std=1.0): + super(VideoPipe, self).__init__(batch_size, num_threads, device_id) + self.input = ops.VideoReader(device="gpu", + file_list=file_list, + sequence_length=sequence_length, + num_seg=num_seg, + seg_length=seg_length, + is_training=is_training, + num_shards=num_shards, + shard_id=shard_id, + random_shuffle=is_training, + initial_fill=initial_prefetch_size) + # the sequece data read by ops.VideoReader is of shape [F, H, W, C] + # Because the ops.Resize does not support sequence data, + # it will be transposed into [H, W, F, C], + # then reshaped to [H, W, FC], and then resized like a 2-D image. + self.transpose = ops.Transpose(device="gpu", perm=[1, 2, 0, 3]) + self.reshape = ops.Reshape(device="gpu", + rel_shape=[1.0, 1.0, -1], + layout='HWC') + self.resize = ops.Resize(device="gpu", + resize_shorter=resize_shorter_scale) + # crops and mirror are applied by ops.CropMirrorNormalize. + # Normalization will be implemented in paddle due to the difficulty of dimension broadcast, + # It is not sure whether dimension broadcast can be implemented correctly by dali, just take the Paddle Op instead. + self.pos_rng_x = ops.Uniform(range=(0.0, 1.0)) + self.pos_rng_y = ops.Uniform(range=(0.0, 1.0)) + self.mirror_generator = ops.Uniform(range=(0.0, 1.0)) + self.cast_mirror = ops.Cast(dtype=types.DALIDataType.INT32) + self.crop_mirror_norm = ops.CropMirrorNormalize( + device="gpu", + crop=[crop_target_size, crop_target_size], + mean=dali_mean, + std=dali_std) + self.reshape_back = ops.Reshape( + device="gpu", + shape=[num_seg, seg_length * 3, crop_target_size, crop_target_size], + layout='FCHW') + self.cast_label = ops.Cast(device="gpu", dtype=types.DALIDataType.INT64) + + def define_graph(self): + output, label = self.input(name="Reader") + output = self.transpose(output) + output = self.reshape(output) + + output = self.resize(output) + output = output / 255. + pos_x = self.pos_rng_x() + pos_y = self.pos_rng_y() + mirror_flag = self.mirror_generator() + mirror_flag = (mirror_flag > 0.5) + mirror_flag = self.cast_mirror(mirror_flag) + output = self.crop_mirror_norm(output, + crop_pos_x=pos_x, + crop_pos_y=pos_y, + mirror=mirror_flag) + output = self.reshape_back(output) + label = self.cast_label(label) + return output, label + + def __len__(self): + return self.epoch_size() diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/MRI.py b/Bank_second_part/detect_process/paddlevideo/loader/dataset/MRI.py new file mode 100644 index 0000000..990cb87 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/dataset/MRI.py @@ -0,0 +1,109 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os.path as osp +import copy +import random +import numpy as np + +from ..registry import DATASETS +from .base import BaseDataset +from ...utils import get_logger + +logger = get_logger("paddlevideo") + + +@DATASETS.register() +class MRIDataset(BaseDataset): + """Rawframe dataset for action recognition. + The dataset loads raw frames from frame files, and apply specified transform operatation them. + The indecx file is a text file with multiple lines, and each line indicates the directory of frames of a video, toatl frames of the video, and its label, which split with a whitespace. + Example of an index file: + + .. code-block:: txt + + file_path-1 150 1 + file_path-2 160 1 + file_path-3 170 2 + file_path-4 180 2 + + Args: + file_path (str): Path to the index file. + pipeline(XXX): + data_prefix (str): directory path of the data. Default: None. + test_mode (bool): Whether to bulid the test dataset. Default: False. + suffix (str): suffix of file. Default: 'img_{:05}.jpg'. + + """ + def __init__(self, + file_path, + pipeline, + num_retries=5, + data_prefix=None, + test_mode=False, + suffix='img_{:05}.jpg'): + self.num_retries = num_retries + self.suffix = suffix + super().__init__(file_path, pipeline, data_prefix, test_mode) + + def load_file(self): + """Load index file to get video information.""" + info = [] + with open(self.file_path, 'r') as fin: + for line in fin: + line_split = line.strip().split() + frame_dir, frames_len, labels = line_split + if self.data_prefix is not None: + frame_dir = osp.join(self.data_prefix, frame_dir) + info.append( + dict( + frame_dir=frame_dir, + #suffix=self.suffix, + frames_len=frames_len, + labels=int(labels))) + return info + + def prepare_train(self, idx): + """Prepare the frames for training/valid gisven index. """ + #Try to catch Exception caused by reading missing frames files + for ir in range(self.num_retries): + try: + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + except Exception as e: + #logger.info(e) + if ir < self.num_retries - 1: + logger.info( + "Error when loading {}, have {} trys, will try again". + format(results['frame_dir'], ir)) + idx = random.randint(0, len(self.info) - 1) + continue + return np.array(results['imgs']), np.array([results['labels']]) + + def prepare_test(self, idx): + """Prepare the frames for test given index. """ + #Try to catch Exception caused by reading missing frames files + for ir in range(self.num_retries): + try: + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + except Exception as e: + #logger.info(e) + if ir < self.num_retries - 1: + logger.info( + "Error when loading {}, have {} trys, will try again". 
+ format(results['frame_dir'], ir)) + idx = random.randint(0, len(self.info) - 1) + continue + return np.array(results['imgs']), np.array([results['labels']]) diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/MRI_SlowFast.py b/Bank_second_part/detect_process/paddlevideo/loader/dataset/MRI_SlowFast.py new file mode 100644 index 0000000..db905e4 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/dataset/MRI_SlowFast.py @@ -0,0 +1,111 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os.path as osp +import copy +import random +import numpy as np + +from ..registry import DATASETS +from .base import BaseDataset +from ...utils import get_logger + +logger = get_logger("paddlevideo") + + +@DATASETS.register() +class SFMRIDataset(BaseDataset): + """Rawframe dataset for action recognition. + The dataset loads raw frames from frame files, and apply specified transform operatation them. + The indecx file is a text file with multiple lines, and each line indicates the directory of frames of a video, toatl frames of the video, and its label, which split with a whitespace. + Example of an index file: + + .. code-block:: txt + + file_path-1 150 1 + file_path-2 160 1 + file_path-3 170 2 + file_path-4 180 2 + + Args: + file_path (str): Path to the index file. + pipeline(XXX): + data_prefix (str): directory path of the data. Default: None. + test_mode (bool): Whether to bulid the test dataset. Default: False. + suffix (str): suffix of file. Default: 'img_{:05}.jpg'. + + """ + def __init__(self, + file_path, + pipeline, + num_retries=5, + data_prefix=None, + test_mode=False, + suffix='img_{:05}.jpg'): + self.num_retries = num_retries + self.suffix = suffix + super().__init__(file_path, pipeline, data_prefix, test_mode) + + def load_file(self): + """Load index file to get video information.""" + info = [] + with open(self.file_path, 'r') as fin: + for line in fin: + line_split = line.strip().split() + frame_dir, frames_len, labels = line_split + if self.data_prefix is not None: + frame_dir = osp.join(self.data_prefix, frame_dir) + info.append( + dict( + frame_dir=frame_dir, + #suffix=self.suffix, + frames_len=frames_len, + labels=int(labels))) + return info + + def prepare_train(self, idx): + """Prepare the frames for training/valid gisven index. """ + #Try to catch Exception caused by reading missing frames files + for ir in range(self.num_retries): + try: + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + except Exception as e: + #logger.info(e) + if ir < self.num_retries - 1: + logger.info( + "Error when loading {}, have {} trys, will try again". + format(results['frame_dir'], ir)) + idx = random.randint(0, len(self.info) - 1) + continue + return np.array(results['imgs'][0]), np.array( + results['imgs'][1]), np.array([results['labels']]) + + def prepare_test(self, idx): + """Prepare the frames for test given index. 
""" + #Try to catch Exception caused by reading missing frames files + for ir in range(self.num_retries): + try: + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + except Exception as e: + #logger.info(e) + if ir < self.num_retries - 1: + logger.info( + "Error when loading {}, have {} trys, will try again". + format(results['frame_dir'], ir)) + idx = random.randint(0, len(self.info) - 1) + continue + return np.array(results['imgs'][0]), np.array( + results['imgs'][1]), np.array([results['labels']]) diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/__init__.py b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__init__.py new file mode 100644 index 0000000..e974191 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__init__.py @@ -0,0 +1,41 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .actbert_dataset import ActBertDataset +from .ava_dataset import AVADataset +from .bmn_dataset import BMNDataset +from .davis_dataset import DavisDataset +from .feature import FeatureDataset +from .frame import FrameDataset, FrameDataset_Sport +from .MRI import MRIDataset +from .MRI_SlowFast import SFMRIDataset +from .msrvtt import MSRVTTDataset +from .actbert_dataset import ActBertDataset +from .asrf_dataset import ASRFDataset +from .ms_tcn_dataset import MSTCNDataset +from .oxford import MonoDataset +from .skeleton import SkeletonDataset +from .slowfast_video import SFVideoDataset +from .video import VideoDataset +from .ucf101_skeleton import UCF101SkeletonDataset +from .ucf24_dataset import UCF24Dataset + + +__all__ = [ + 'VideoDataset', 'FrameDataset', 'SFVideoDataset', 'BMNDataset', + 'FeatureDataset', 'SkeletonDataset', 'AVADataset', 'MonoDataset', + 'MSRVTTDataset', 'ActBertDataset', 'DavisDataset', 'MRIDataset', + 'SFMRIDataset', 'FrameDataset_Sport', 'MSTCNDataset', 'ASRFDataset', + 'UCF101SkeletonDataset', 'UCF24Dataset' +] diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/MRI.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/MRI.cpython-310.pyc new file mode 100644 index 0000000..58b431f Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/MRI.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/MRI_SlowFast.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/MRI_SlowFast.cpython-310.pyc new file mode 100644 index 0000000..08be8bc Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/MRI_SlowFast.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/__init__.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..8da8e88 Binary files /dev/null 
and b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/__init__.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/actbert_dataset.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/actbert_dataset.cpython-310.pyc new file mode 100644 index 0000000..444ce36 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/actbert_dataset.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/asrf_dataset.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/asrf_dataset.cpython-310.pyc new file mode 100644 index 0000000..6dd42b0 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/asrf_dataset.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/ava_dataset.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/ava_dataset.cpython-310.pyc new file mode 100644 index 0000000..03146ed Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/ava_dataset.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/base.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/base.cpython-310.pyc new file mode 100644 index 0000000..5264812 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/base.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/bmn_dataset.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/bmn_dataset.cpython-310.pyc new file mode 100644 index 0000000..725e9cb Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/bmn_dataset.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/davis_dataset.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/davis_dataset.cpython-310.pyc new file mode 100644 index 0000000..2069db9 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/davis_dataset.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/feature.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/feature.cpython-310.pyc new file mode 100644 index 0000000..5e41573 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/feature.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/frame.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/frame.cpython-310.pyc new file mode 100644 index 0000000..a3b9379 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/frame.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/ms_tcn_dataset.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/ms_tcn_dataset.cpython-310.pyc new file mode 100644 index 0000000..470c45f Binary files /dev/null and 
b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/ms_tcn_dataset.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/msrvtt.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/msrvtt.cpython-310.pyc new file mode 100644 index 0000000..6bbe257 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/msrvtt.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/oxford.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/oxford.cpython-310.pyc new file mode 100644 index 0000000..16d0a42 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/oxford.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/skeleton.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/skeleton.cpython-310.pyc new file mode 100644 index 0000000..9a297c2 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/skeleton.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/slowfast_video.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/slowfast_video.cpython-310.pyc new file mode 100644 index 0000000..32e6237 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/slowfast_video.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/ucf101_skeleton.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/ucf101_skeleton.cpython-310.pyc new file mode 100644 index 0000000..2ff935a Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/ucf101_skeleton.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/ucf24_dataset.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/ucf24_dataset.cpython-310.pyc new file mode 100644 index 0000000..9840716 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/ucf24_dataset.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/video.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/video.cpython-310.pyc new file mode 100644 index 0000000..918e27f Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/dataset/__pycache__/video.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/actbert_dataset.py b/Bank_second_part/detect_process/paddlevideo/loader/dataset/actbert_dataset.py new file mode 100644 index 0000000..8cccf5c --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/dataset/actbert_dataset.py @@ -0,0 +1,74 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os.path as osp +import copy +import random +import numpy as np +try: + import lmdb +except ImportError as e: + print( + f"Warning! {e}, [lmdb] package and it's dependencies is required for ActBERT." + ) +import pickle +import json +try: + from paddlenlp.transformers import BertTokenizer +except ImportError as e: + print( + f"Warning! {e}, [paddlenlp] package and it's dependencies is required for ActBERT." + ) +from ..registry import DATASETS +from .base import BaseDataset +from ...utils import get_logger + +logger = get_logger("paddlevideo") + + +@DATASETS.register() +class ActBertDataset(BaseDataset): + """ActBert dataset. + """ + def __init__( + self, + file_path, + pipeline, + bert_model="bert-base-uncased", + data_prefix=None, + test_mode=False, + ): + self.bert_model = bert_model + super().__init__(file_path, pipeline, data_prefix, test_mode) + + def load_file(self): + """Load index file to get video information.""" + feature_data = np.load(self.file_path, allow_pickle=True) + self.tokenizer = BertTokenizer.from_pretrained(self.bert_model, + do_lower_case=True) + self.info = [] + for item in feature_data: + self.info.append(dict(feature=item, tokenizer=self.tokenizer)) + return self.info + + def prepare_train(self, idx): + """Prepare the frames for training/valid given index. """ + results = copy.deepcopy(self.info[idx]) + #print('==results==', results) + results = self.pipeline(results) + return results['features'] + + def prepare_test(self, idx): + """Prepare the frames for test given index. """ + pass diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/asrf_dataset.py b/Bank_second_part/detect_process/paddlevideo/loader/dataset/asrf_dataset.py new file mode 100644 index 0000000..15bd35a --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/dataset/asrf_dataset.py @@ -0,0 +1,104 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import copy + +import os +import numpy as np + +from ..registry import DATASETS +from .base import BaseDataset +from ...utils import get_logger + +logger = get_logger("paddlevideo") + + +@DATASETS.register() +class ASRFDataset(BaseDataset): + """Video dataset for action segmentation. 
+ """ + + def __init__( + self, + file_path, + pipeline, + feature_path, + label_path, + boundary_path, + **kwargs, + ): + super().__init__(file_path, pipeline, **kwargs) + self.label_path = label_path + self.boundary_path = boundary_path + self.feature_path = feature_path + + def load_file(self): + """Load index file to get video information.""" + file_ptr = open(self.file_path, 'r') + info = file_ptr.read().split('\n')[:-1] + file_ptr.close() + return info + + def prepare_train(self, idx): + """TRAIN & VALID: Prepare data for training/valid given the index.""" + results = {} + video_name = self.info[idx] + # load video feature + file_name = video_name.split('.')[0] + ".npy" + feat_file_path = os.path.join(self.feature_path, file_name) + #TODO: check path + video_feat = np.load(feat_file_path) + + # load label + file_name = video_name.split('.')[0] + ".npy" + label_file_path = os.path.join(self.label_path, file_name) + label = np.load(label_file_path).astype(np.int64) + + # load boundary + file_name = video_name.split('.')[0] + ".npy" + boundary_file_path = os.path.join(self.boundary_path, file_name) + boundary = np.expand_dims(np.load(boundary_file_path),axis=0).astype(np.float32) + + results['video_feat'] = copy.deepcopy(video_feat) + results['video_label'] = copy.deepcopy(label) + results['video_boundary'] = copy.deepcopy(boundary) + + results = self.pipeline(results) + return results['video_feat'], results['video_label'], results['video_boundary'] + + def prepare_test(self, idx): + """TEST: Prepare the data for test given the index.""" + results = {} + video_name = self.info[idx] + # load video feature + file_name = video_name.split('.')[0] + ".npy" + feat_file_path = os.path.join(self.feature_path, file_name) + #TODO: check path + video_feat = np.load(feat_file_path) + + # load label + file_name = video_name.split('.')[0] + ".npy" + label_file_path = os.path.join(self.label_path, file_name) + label = np.load(label_file_path).astype(np.int64) + + # load boundary + file_name = video_name.split('.')[0] + ".npy" + boundary_file_path = os.path.join(self.boundary_path, file_name) + boundary = np.expand_dims(np.load(boundary_file_path),axis=0).astype(np.float32) + + results['video_feat'] = copy.deepcopy(video_feat) + results['video_label'] = copy.deepcopy(label) + results['video_boundary'] = copy.deepcopy(boundary) + + results = self.pipeline(results) + return results['video_feat'], results['video_label'], results['video_boundary'] diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/ava_dataset.py b/Bank_second_part/detect_process/paddlevideo/loader/dataset/ava_dataset.py new file mode 100644 index 0000000..744e15b --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/dataset/ava_dataset.py @@ -0,0 +1,249 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os.path as osp +import copy +import random +import numpy as np +import sys +import os +import pickle +from datetime import datetime +from ...metrics.ava_utils import ava_evaluate_results +from ..registry import DATASETS +from .base import BaseDataset +from collections import defaultdict + + +@DATASETS.register() +class AVADataset(BaseDataset): + """AVA dataset for spatial temporal detection. + the dataset loads raw frames, bounding boxes, proposals and applies + transformations to return the frame tensors and other information. + """ + + _FPS = 30 + + def __init__(self, + pipeline, + file_path=None, + exclude_file=None, + label_file=None, + suffix='{:05}.jpg', + proposal_file=None, + person_det_score_thr=0.9, + num_classes=81, + data_prefix=None, + test_mode=False, + num_max_proposals=1000, + timestamp_start=900, + timestamp_end=1800): + self.custom_classes = None + self.exclude_file = exclude_file + self.label_file = label_file + self.proposal_file = proposal_file + assert 0 <= person_det_score_thr <= 1, ( + 'The value of ' + 'person_det_score_thr should in [0, 1]. ') + self.person_det_score_thr = person_det_score_thr + self.num_classes = num_classes + self.suffix = suffix + self.num_max_proposals = num_max_proposals + self.timestamp_start = timestamp_start + self.timestamp_end = timestamp_end + super().__init__( + file_path, + pipeline, + data_prefix, + test_mode, + ) + if self.proposal_file is not None: + self.proposals = self._load(self.proposal_file) + else: + self.proposals = None + if not test_mode: + valid_indexes = self.filter_exclude_file() + self.info = self.info = [self.info[i] for i in valid_indexes] + + def _load(self, path): + f = open(path, 'rb') + res = pickle.load(f) + f.close() + return res + + def parse_img_record(self, img_records): + bboxes, labels, entity_ids = [], [], [] + while len(img_records) > 0: + img_record = img_records[0] + num_img_records = len(img_records) + selected_records = list( + filter( + lambda x: np.array_equal(x['entity_box'], img_record[ + 'entity_box']), img_records)) + num_selected_records = len(selected_records) + img_records = list( + filter( + lambda x: not np.array_equal(x['entity_box'], img_record[ + 'entity_box']), img_records)) + assert len(img_records) + num_selected_records == num_img_records + + bboxes.append(img_record['entity_box']) + valid_labels = np.array([ + selected_record['label'] for selected_record in selected_records + ]) + + label = np.zeros(self.num_classes, dtype=np.float32) + label[valid_labels] = 1. 
+ + labels.append(label) + entity_ids.append(img_record['entity_id']) + + bboxes = np.stack(bboxes) + labels = np.stack(labels) + entity_ids = np.stack(entity_ids) + return bboxes, labels, entity_ids + + def filter_exclude_file(self): + valid_indexes = [] + if self.exclude_file is None: + valid_indexes = list(range(len(self.info))) + else: + exclude_video_infos = [ + x.strip().split(',') for x in open(self.exclude_file) + ] + for i, video_info in enumerate(self.info): + valid_indexes.append(i) + for video_id, timestamp in exclude_video_infos: + if (video_info['video_id'] == video_id + and video_info['timestamp'] == int(timestamp)): + valid_indexes.pop() + break + return valid_indexes + + def load_file(self): + """Load index file to get video information.""" + info = [] + records_dict_by_img = defaultdict(list) + with open(self.file_path, 'r') as fin: + for line in fin: + line_split = line.strip().split(',') + + video_id = line_split[0] + timestamp = int(line_split[1]) + img_key = f'{video_id},{timestamp:04d}' + + entity_box = np.array(list(map(float, line_split[2:6]))) + label = int(line_split[6]) + entity_id = int(line_split[7]) + shot_info = (0, (self.timestamp_end - self.timestamp_start) * + self._FPS) + + video_info = dict(video_id=video_id, + timestamp=timestamp, + entity_box=entity_box, + label=label, + entity_id=entity_id, + shot_info=shot_info) + records_dict_by_img[img_key].append(video_info) + + for img_key in records_dict_by_img: + video_id, timestamp = img_key.split(',') + bboxes, labels, entity_ids = self.parse_img_record( + records_dict_by_img[img_key]) + ann = dict(gt_bboxes=bboxes, + gt_labels=labels, + entity_ids=entity_ids) + frame_dir = video_id + if self.data_prefix is not None: + frame_dir = osp.join(self.data_prefix, frame_dir) + video_info = dict(frame_dir=frame_dir, + video_id=video_id, + timestamp=int(timestamp), + img_key=img_key, + shot_info=shot_info, + fps=self._FPS, + ann=ann) + info.append(video_info) + + return info + + def prepare_train(self, idx): + results = copy.deepcopy(self.info[idx]) + img_key = results['img_key'] + + results['suffix'] = self.suffix + results['timestamp_start'] = self.timestamp_start + results['timestamp_end'] = self.timestamp_end + + if self.proposals is not None: + if img_key not in self.proposals: + results['proposals'] = np.array([[0, 0, 1, 1]]) + results['scores'] = np.array([1]) + else: + proposals = self.proposals[img_key] + assert proposals.shape[-1] in [4, 5] + if proposals.shape[-1] == 5: + thr = min(self.person_det_score_thr, max(proposals[:, 4])) + positive_inds = (proposals[:, 4] >= thr) + proposals = proposals[positive_inds] + proposals = proposals[:self.num_max_proposals] + results['proposals'] = proposals[:, :4] + results['scores'] = proposals[:, 4] + else: + proposals = proposals[:self.num_max_proposals] + results['proposals'] = proposals + + ann = results.pop('ann') + results['gt_bboxes'] = ann['gt_bboxes'] + results['gt_labels'] = ann['gt_labels'] + results['entity_ids'] = ann['entity_ids'] + + #ret = self.pipeline(results, "") + ret = self.pipeline(results) + #padding for dataloader + len_proposals = ret['proposals'].shape[0] + len_gt_bboxes = ret['gt_bboxes'].shape[0] + len_gt_labels = ret['gt_labels'].shape[0] + len_scores = ret['scores'].shape[0] + len_entity_ids = ret['entity_ids'].shape[0] + padding_len = 128 + ret['proposals'] = self.my_padding_2d(ret['proposals'], padding_len) + ret['gt_bboxes'] = self.my_padding_2d(ret['gt_bboxes'], padding_len) + ret['gt_labels'] = self.my_padding_2d(ret['gt_labels'], 
padding_len) + ret['scores'] = self.my_padding_1d(ret['scores'], padding_len) + ret['entity_ids'] = self.my_padding_1d(ret['entity_ids'], padding_len) + return ret['imgs'][0], ret['imgs'][1], ret['proposals'], ret[ + 'gt_bboxes'], ret['gt_labels'], ret['scores'], ret[ + 'entity_ids'], np.array( + ret['img_shape'], dtype=int + ), idx, len_proposals, len_gt_bboxes, len_gt_labels, len_scores, len_entity_ids + + def my_padding_2d(self, feat, max_len): + feat_add = np.zeros((max_len - feat.shape[0], feat.shape[1]), + dtype=np.float32) + feat_pad = np.concatenate((feat, feat_add), axis=0) + return feat_pad + + def my_padding_1d(self, feat, max_len): + feat_add = np.zeros((max_len - feat.shape[0]), dtype=np.float32) + feat_pad = np.concatenate((feat, feat_add), axis=0) + return feat_pad + + def prepare_test(self, idx): + return self.prepare_train(idx) + + def evaluate(self, results): + return ava_evaluate_results(self.info, len(self), results, + self.custom_classes, self.label_file, + self.file_path, self.exclude_file) diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/base.py b/Bank_second_part/detect_process/paddlevideo/loader/dataset/base.py new file mode 100644 index 0000000..2549dc4 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/dataset/base.py @@ -0,0 +1,80 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os.path as osp +import copy +import numpy as np +from abc import ABC, abstractmethod + +import paddle +from paddle.io import Dataset + + +class BaseDataset(Dataset, ABC): + """Base class for datasets + + All datasets should subclass it. + All subclass should overwrite: + + - Method: `load_file`, load info from index file. + - Method: `prepare_train`, providing train data. + - Method: `prepare_test`, providing test data. + + Args: + file_path (str): index file path. + pipeline (Sequence XXX) + data_prefix (str): directory path of the data. Default: None. + test_mode (bool): whether to build test dataset. Default: False. + + """ + def __init__(self, file_path, pipeline, data_prefix=None, test_mode=False): + super().__init__() + self.file_path = file_path + self.data_prefix = osp.realpath(data_prefix) if \ + data_prefix is not None and osp.isdir(data_prefix) else data_prefix + self.test_mode = test_mode + self.pipeline = pipeline + self.info = self.load_file() + + @abstractmethod + def load_file(self): + """load the video information from the index file path.""" + pass + + def prepare_train(self, idx): + """TRAIN & VALID. 
Prepare the data for training/valid given the index.""" + #Note: For now, paddle.io.DataLoader cannot support dict type retval, so convert to list here + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + #unsqueeze label to list + return results['imgs'], np.array([results['labels']]) + + def prepare_test(self, idx): + """TEST: Prepare the data for test given the index.""" + #Note: For now, paddle.io.DataLoader cannot support dict type retval, so convert to list here + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + #unsqueeze label to list + return results['imgs'], np.array([results['labels']]) + + def __len__(self): + """get the size of the dataset.""" + return len(self.info) + + def __getitem__(self, idx): + """ Get the sample for either training or testing given index""" + if self.test_mode: + return self.prepare_test(idx) + else: + return self.prepare_train(idx) diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/bmn_dataset.py b/Bank_second_part/detect_process/paddlevideo/loader/dataset/bmn_dataset.py new file mode 100644 index 0000000..44c7651 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/dataset/bmn_dataset.py @@ -0,0 +1,72 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import json + +from ..registry import DATASETS +from .base import BaseDataset +from ...utils import get_logger +logger = get_logger("paddlevideo") + + +@DATASETS.register() +class BMNDataset(BaseDataset): + """Video dataset for action localization. 
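+
+    The index file is an annotation json (typically ActivityNet-style); each
+    video entry carries a "subset" field that is matched against `subset`.
+
+    Args:
+        file_path (str): path to the annotation json.
+        pipeline (obj): data pipeline applied to each sample.
+        subset (str): subset to keep, e.g. "train" or "validation".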
+ """ + def __init__( + self, + file_path, + pipeline, + subset, + **kwargs, + ): + self.subset = subset + super().__init__(file_path, pipeline, **kwargs) + + def load_file(self): + """Load index file to get video information.""" + info = [] + annos = json.load(open(self.file_path)) + for video_name in annos.keys(): + video_subset = annos[video_name]["subset"] + if self.subset in video_subset: + info.append( + dict( + video_name=video_name, + video_info=annos[video_name], + )) + #sort by video_name + sort_f = lambda elem: elem['video_name'] + info.sort(key=sort_f) + #add video_idx to info + for idx, elem in enumerate(info): + info[idx]['video_idx'] = idx + logger.info("{} subset video numbers: {}".format( + self.subset, len(info))) + return info + + def prepare_train(self, idx): + """TRAIN & VALID: Prepare data for training/valid given the index.""" + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + return results['video_feat'], results['gt_iou_map'], results['gt_start'],\ + results['gt_end'] + + def prepare_test(self, idx): + """TEST: Prepare the data for test given the index.""" + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + return results['video_feat'], results['gt_iou_map'], results['gt_start'], \ + results['gt_end'], results['video_idx'] diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/davis_dataset.py b/Bank_second_part/detect_process/paddlevideo/loader/dataset/davis_dataset.py new file mode 100644 index 0000000..20a2759 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/dataset/davis_dataset.py @@ -0,0 +1,189 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import os.path as osp +import copy +import random +import numpy as np +import shutil +from PIL import Image +import cv2 +from paddle.io import Dataset + +from ..registry import DATASETS +from .base import BaseDataset +from ...utils import get_logger + +logger = get_logger("paddlevideo") + + +class VOS_Test(Dataset): + """process frames in each video + """ + def __init__(self, + image_root, + label_root, + seq_name, + images, + labels, + pipeline=None, + rgb=False, + resolution=None): + self.image_root = image_root + self.label_root = label_root + self.seq_name = seq_name + self.images = images # image file list + self.labels = labels + self.obj_num = 1 + self.num_frame = len(self.images) + self.pipeline = pipeline + self.rgb = rgb + self.resolution = resolution + + self.obj_nums = [] + temp_obj_num = 0 + for img_name in self.images: + self.obj_nums.append(temp_obj_num) + current_label_name = img_name.split('.')[0] + '.png' + if current_label_name in self.labels: + current_label = self.read_label(current_label_name) + if temp_obj_num < np.unique( + current_label)[-1]: #get object number from label_id + temp_obj_num = np.unique(current_label)[-1] + + def __len__(self): + return len(self.images) + + def read_image(self, idx): + img_name = self.images[idx] + img_path = os.path.join(self.image_root, self.seq_name, img_name) + img = cv2.imread(img_path) + img = np.array(img, dtype=np.float32) + if self.rgb: + img = img[:, :, [2, 1, 0]] + return img + + def read_label(self, label_name): + label_path = os.path.join(self.label_root, self.seq_name, label_name) + label = Image.open(label_path) + label = np.array(label, dtype=np.uint8) + return label + + def __getitem__(self, idx): + img_name = self.images[idx] + current_img = self.read_image(idx) + current_img = np.array(current_img) + height, width, channels = current_img.shape + if self.resolution is not None: + width = int(np.ceil(float(width) * self.resolution / float(height))) + height = int(self.resolution) + + current_label_name = img_name.split('.')[0] + '.png' + obj_num = self.obj_nums[idx] + + if current_label_name in self.labels: + current_label = self.read_label(current_label_name) + current_label = np.array(current_label) + sample = { + 'current_img': current_img, + 'current_label': current_label + } + else: + sample = { + 'current_img': current_img + } #only the first frame contains label + + sample['meta'] = { + 'seq_name': self.seq_name, + 'frame_num': self.num_frame, + 'obj_num': obj_num, + 'current_name': img_name, + 'height': height, + 'width': width, + 'flip': False + } + if self.pipeline is not None: + sample = self.pipeline(sample) + for s in sample: + s['current_img'] = np.array(s['current_img']) + if 'current_label' in s.keys(): + s['current_label'] = s['current_label'] + return sample + + +@DATASETS.register() +class DavisDataset(BaseDataset): + """Davis 2017 dataset. 
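+
+    Args:
+        file_path (str): DAVIS root directory (see the layout note above).
+        result_root (str): directory the first-frame annotation of each test
+            sequence is copied into before inference.
+        pipeline (obj): data pipeline applied to each frame sample.
+        year (int): DAVIS release year, used to pick ImageSets/<year>. Default: 2017.
+        resolution (str): resolution folder name, e.g. '480p'. Default: '480p'.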
+ """ + def __init__( + self, + file_path, + result_root, + pipeline, + data_prefix=None, + test_mode=False, + year=2017, + rgb=False, + resolution='480p', + ): + self.rgb = rgb + self.result_root = result_root + self.resolution = resolution + self.year = year + self.spt = 'val' if test_mode else 'train' + super().__init__(file_path, pipeline, data_prefix, test_mode) + + def load_file(self): + self.image_root = os.path.join(self.file_path, 'JPEGImages', + self.resolution) + self.label_root = os.path.join(self.file_path, 'Annotations', + self.resolution) + seq_names = [] + with open( + os.path.join(self.file_path, 'ImageSets', str(self.year), + self.spt + '.txt')) as f: + seqs_tmp = f.readlines() + seqs_tmp = list(map(lambda elem: elem.strip(), seqs_tmp)) + seq_names.extend(seqs_tmp) + self.info = list(np.unique(seq_names)) + return self.info + + def prepare_test(self, idx): + seq_name = self.info[idx] #video name + images = list( + np.sort(os.listdir(os.path.join(self.image_root, seq_name)))) + labels = [images[0].replace('jpg', 'png')] #we have first frame target + + # copy first frame target + if not os.path.isfile( + os.path.join(self.result_root, seq_name, labels[0])): + if not os.path.exists(os.path.join(self.result_root, seq_name)): + os.makedirs(os.path.join(self.result_root, seq_name)) + source_label_path = os.path.join(self.label_root, seq_name, + labels[0]) + result_label_path = os.path.join(self.result_root, seq_name, + labels[0]) + + shutil.copy(source_label_path, result_label_path) + + seq_dataset = VOS_Test(self.image_root, + self.label_root, + seq_name, + images, + labels, + self.pipeline, + rgb=self.rgb, + resolution=480) + return seq_dataset diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/feature.py b/Bank_second_part/detect_process/paddlevideo/loader/dataset/feature.py new file mode 100644 index 0000000..df5e33e --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/dataset/feature.py @@ -0,0 +1,80 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import copy +import os.path as osp + +from ..registry import DATASETS +from .base import BaseDataset + + +@DATASETS.register() +class FeatureDataset(BaseDataset): + """Feature dataset for action recognition + Example:(TODO) + Args:(TODO) + """ + def __init__( + self, + file_path, + pipeline, + data_prefix=None, + test_mode=False, + suffix=None, + ): + self.suffix = suffix + super().__init__(file_path, pipeline, data_prefix, test_mode) + + def load_file(self): + """Load index file to get video information.""" + info = [] + with open(self.file_path, 'r') as fin: + for line in fin: + filename = line.strip().split()[0] + if self.data_prefix is not None: + filename = osp.join(self.data_prefix, filename) + if self.suffix is not None: + filename = filename + self.suffix + + info.append(dict(filename=filename)) + return info + + def prepare_train(self, idx): + """TRAIN & VALID. 
Prepare the data for training/valid given the index.""" + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + + if 'iou_norm' in results: + return results['rgb_data'], results['rgb_len'], results[ + 'rgb_mask'], results['audio_data'], results[ + 'audio_len'], results['audio_mask'], results[ + 'labels'], results['iou_norm'] + else: + return results['rgb_data'], results['rgb_len'], results[ + 'rgb_mask'], results['audio_data'], results[ + 'audio_len'], results['audio_mask'], results['labels'] + + def prepare_test(self, idx): + """TEST. Prepare the data for testing given the index.""" + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + + if 'iou_norm' in results: + return results['rgb_data'], results['rgb_len'], results[ + 'rgb_mask'], results['audio_data'], results[ + 'audio_len'], results['audio_mask'], results[ + 'labels'], results['iou_norm'] + else: + return results['rgb_data'], results['rgb_len'], results[ + 'rgb_mask'], results['audio_data'], results[ + 'audio_len'], results['audio_mask'], results['labels'] diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/frame.py b/Bank_second_part/detect_process/paddlevideo/loader/dataset/frame.py new file mode 100644 index 0000000..b02f526 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/dataset/frame.py @@ -0,0 +1,177 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os.path as osp +import copy +import random +import numpy as np + +from ..registry import DATASETS +from .base import BaseDataset +from ...utils import get_logger + +logger = get_logger("paddlevideo") + + +@DATASETS.register() +class FrameDataset(BaseDataset): + """Rawframe dataset for action recognition. + The dataset loads raw frames from frame files, and apply specified transform operatation them. + The indecx file is a text file with multiple lines, and each line indicates the directory of frames of a video, toatl frames of the video, and its label, which split with a whitespace. + Example of an index file: + + .. code-block:: txt + + file_path-1 150 1 + file_path-2 160 1 + file_path-3 170 2 + file_path-4 180 2 + + Args: + file_path (str): Path to the index file. + pipeline(XXX): + data_prefix (str): directory path of the data. Default: None. + test_mode (bool): Whether to bulid the test dataset. Default: False. + suffix (str): suffix of file. Default: 'img_{:05}.jpg'. 
+ + """ + def __init__(self, + file_path, + pipeline, + num_retries=5, + data_prefix=None, + test_mode=False, + suffix='img_{:05}.jpg'): + self.num_retries = num_retries + self.suffix = suffix + super().__init__(file_path, pipeline, data_prefix, test_mode) + + def load_file(self): + """Load index file to get video information.""" + info = [] + with open(self.file_path, 'r') as fin: + for line in fin: + line_split = line.strip().split() + frame_dir, frames_len, labels = line_split + if self.data_prefix is not None: + frame_dir = osp.join(self.data_prefix, frame_dir) + info.append( + dict(frame_dir=frame_dir, + suffix=self.suffix, + frames_len=frames_len, + labels=int(labels))) + return info + + def prepare_train(self, idx): + """Prepare the frames for training/valid given index. """ + #Try to catch Exception caused by reading missing frames files + for ir in range(self.num_retries): + try: + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + except Exception as e: + #logger.info(e) + if ir < self.num_retries - 1: + logger.info( + "Error when loading {}, have {} trys, will try again". + format(results['frame_dir'], ir)) + idx = random.randint(0, len(self.info) - 1) + continue + return results['imgs'], np.array([results['labels']]) + + def prepare_test(self, idx): + """Prepare the frames for test given index. """ + #Try to catch Exception caused by reading missing frames files + for ir in range(self.num_retries): + try: + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + except Exception as e: + #logger.info(e) + if ir < self.num_retries - 1: + logger.info( + "Error when loading {}, have {} trys, will try again". + format(results['frame_dir'], ir)) + idx = random.randint(0, len(self.info) - 1) + continue + return results['imgs'], np.array([results['labels']]) + + +@DATASETS.register() +class FrameDataset_Sport(BaseDataset): + """Video dataset for action recognition + The dataset loads raw videos and apply specified transforms on them. + The index file is a file with multiple lines, and each line indicates + a sample video with the filepath and label, which are split with a whitesapce. + Example of a inde file: + .. code-block:: txt + path/000.mp4 1 + path/001.mp4 1 + path/002.mp4 2 + path/003.mp4 2 + Args: + file_path(str): Path to the index file. + pipeline(XXX): A sequence of data transforms. + **kwargs: Keyword arguments for ```BaseDataset```. + """ + def __init__(self, file_path, pipeline, num_retries=5, suffix='', **kwargs): + self.num_retries = num_retries + self.suffix = suffix + super().__init__(file_path, pipeline, **kwargs) + + def load_file(self): + """Load index file to get video information.""" + info = [] + with open(self.file_path, 'r') as fin: + for line in fin: + line_split = line.strip().split() + frame_dir = line_split[0] + if self.data_prefix is not None: + frame_dir = osp.join(self.data_prefix, frame_dir) + info.append(dict(frame_dir=frame_dir, suffix=self.suffix)) + return info + + def prepare_train(self, idx): + """TRAIN & VALID. Prepare the data for training/valid given the index.""" + #Try to catch Exception caused by reading corrupted video file + for ir in range(self.num_retries): + try: + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + except Exception as e: + #logger.info(e) + if ir < self.num_retries - 1: + logger.info( + "Error when loading {}, have {} trys, will try again". 
+ format(results['filename'], ir)) + idx = random.randint(0, len(self.info) - 1) + continue + return results['imgs'], np.array([results['labels']]) + + def prepare_test(self, idx): + """TEST. Prepare the data for test given the index.""" + #Try to catch Exception caused by reading corrupted video file + for ir in range(self.num_retries): + try: + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + except Exception as e: + #logger.info(e) + if ir < self.num_retries - 1: + logger.info( + "Error when loading {}, have {} trys, will try again". + format(results['filename'], ir)) + idx = random.randint(0, len(self.info) - 1) + continue + return results['imgs'], np.array([results['labels']]) diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/ms_tcn_dataset.py b/Bank_second_part/detect_process/paddlevideo/loader/dataset/ms_tcn_dataset.py new file mode 100644 index 0000000..56e3b7b --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/dataset/ms_tcn_dataset.py @@ -0,0 +1,110 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import copy + +import os +import numpy as np + +from ..registry import DATASETS +from .base import BaseDataset +from ...utils import get_logger + +logger = get_logger("paddlevideo") + + +@DATASETS.register() +class MSTCNDataset(BaseDataset): + """Video dataset for action segmentation. 
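+
+    Args:
+        file_path (str): bundle file listing video names, one per line.
+        pipeline (obj): data pipeline applied to each sample.
+        feature_path (str): directory holding one pre-extracted <video>.npy
+            feature array per video.
+        gt_path (str): directory holding the per-frame ground-truth file of
+            each video, one action name per line.
+        actions_map_file_path (str): mapping file with "<id> <action_name>"
+            lines, used to build the action dictionary.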
+ """ + + def __init__( + self, + file_path, + pipeline, + feature_path, + gt_path, + actions_map_file_path, + **kwargs, + ): + super().__init__(file_path, pipeline, **kwargs) + self.gt_path = gt_path + self.actions_map_file_path = actions_map_file_path + self.feature_path = feature_path + + # actions dict generate + file_ptr = open(self.actions_map_file_path, 'r') + actions = file_ptr.read().split('\n')[:-1] + file_ptr.close() + self.actions_dict = dict() + for a in actions: + self.actions_dict[a.split()[1]] = int(a.split()[0]) + + self.num_classes = len(self.actions_dict.keys()) + + def load_file(self): + """Load index file to get video information.""" + file_ptr = open(self.file_path, 'r') + info = file_ptr.read().split('\n')[:-1] + file_ptr.close() + return info + + def prepare_train(self, idx): + """TRAIN & VALID: Prepare data for training/valid given the index.""" + results = {} + video_name = self.info[idx] + # load video feature + file_name = video_name.split('.')[0] + ".npy" + feat_file_path = os.path.join(self.feature_path, file_name) + #TODO: check path + video_feat = np.load(feat_file_path) + + # load label + target_file_path = os.path.join(self.gt_path, video_name) + file_ptr = open(target_file_path, 'r') + content = file_ptr.read().split('\n')[:-1] + classes = np.zeros(min(np.shape(video_feat)[1], len(content)), dtype='int64') + for i in range(len(classes)): + classes[i] = self.actions_dict[content[i]] + # classes = classes * (-100) + + results['video_feat'] = copy.deepcopy(video_feat) + results['video_gt'] = copy.deepcopy(classes) + + results = self.pipeline(results) + return results['video_feat'], results['video_gt'] + + def prepare_test(self, idx): + """TEST: Prepare the data for test given the index.""" + results = {} + video_name = self.info[idx] + # load video feature + file_name = video_name.split('.')[0] + ".npy" + feat_file_path = os.path.join(self.feature_path, file_name) + #TODO: check path + video_feat = np.load(feat_file_path) + + # load label + target_file_path = os.path.join(self.gt_path, video_name) + file_ptr = open(target_file_path, 'r') + content = file_ptr.read().split('\n')[:-1] + classes = np.zeros(min(np.shape(video_feat)[1], len(content))) + for i in range(len(classes)): + classes[i] = self.actions_dict[content[i]] + # classes = classes * (-100) + + results['video_feat'] = copy.deepcopy(video_feat) + results['video_gt'] = copy.deepcopy(classes) + + results = self.pipeline(results) + return results['video_feat'], results['video_gt'] diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/msrvtt.py b/Bank_second_part/detect_process/paddlevideo/loader/dataset/msrvtt.py new file mode 100644 index 0000000..0e5294f --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/dataset/msrvtt.py @@ -0,0 +1,220 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os.path as osp +import copy +import random +import numpy as np +try: + import lmdb +except ImportError as e: + print( + f"Warning! {e}, [lmdb] package and it's dependencies is required for ActBERT." + ) +import pickle +try: + from paddlenlp.transformers import BertTokenizer +except ImportError as e: + print( + f"Warning! {e}, [paddlenlp] package and it's dependencies is required for ActBERT." + ) +from ..registry import DATASETS +from .base import BaseDataset +from ...utils import get_logger + +logger = get_logger("paddlevideo") + + +@DATASETS.register() +class MSRVTTDataset(BaseDataset): + """MSR-VTT dataset for text-video clip retrieval. + """ + def __init__( + self, + file_path, + pipeline, + features_path, + bert_model="bert-base-uncased", + padding_index=0, + max_seq_length=36, + max_region_num=36, + max_action_num=5, + vision_feature_dim=2048, + action_feature_dim=2048, + spatials_dim=5, + data_prefix=None, + test_mode=False, + ): + self.features_path = features_path + self.bert_model = bert_model + self.padding_index = padding_index + self.max_seq_length = max_seq_length + self.max_region_num = max_region_num + self._max_action_num = max_action_num + self.vision_feature_dim = vision_feature_dim + self.action_feature_dim = action_feature_dim + self.spatials_dim = spatials_dim + self._tokenizer = BertTokenizer.from_pretrained(bert_model, + do_lower_case=True) + super().__init__(file_path, pipeline, data_prefix, test_mode) + self.tokenize() + self.gen_feature() + + def load_file(self): + """Load index file to get video information.""" + with open(self.file_path) as fin: + self.image_entries = [] + self.caption_entries = [] + for line in fin.readlines(): + line = line.strip() + vid_id = line.split(',')[0] + self.image_entries.append(vid_id) + self.caption_entries.append({ + "caption": line.split(',')[1], + "vid_id": vid_id + }) + self.env = lmdb.open(self.features_path) + + def tokenize(self): + for entry in self.caption_entries: + tokens = [] + tokens.append("[CLS]") + for token in self._tokenizer.tokenize(entry["caption"]): + tokens.append(token) + tokens.append("[SEP]") + tokens = self._tokenizer.convert_tokens_to_ids(tokens) + + segment_ids = [0] * len(tokens) + input_mask = [1] * len(tokens) + + if len(tokens) < self.max_seq_length: + padding = [self.padding_index + ] * (self.max_seq_length - len(tokens)) + tokens = tokens + padding + input_mask += padding + segment_ids += padding + + entry["token"] = np.array(tokens).astype('int64') + entry["input_mask"] = np.array(input_mask) + entry["segment_ids"] = np.array(segment_ids).astype('int64') + + def get_image_feature(self, video_id): + video_id = str(video_id).encode() + with self.env.begin(write=False) as txn: + item = pickle.loads(txn.get(video_id)) + video_id = item["video_id"] + image_h = int(item["image_h"]) + image_w = int(item["image_w"]) + + features = item["features"].reshape(-1, self.vision_feature_dim) + boxes = item["boxes"].reshape(-1, 4) + + num_boxes = features.shape[0] + g_feat = np.sum(features, axis=0) / num_boxes + num_boxes = num_boxes + 1 + features = np.concatenate( + [np.expand_dims(g_feat, axis=0), features], axis=0) + + action_features = item["action_features"].reshape( + -1, self.action_feature_dim) + + image_location = np.zeros((boxes.shape[0], self.spatials_dim), + dtype=np.float32) + image_location[:, :4] = boxes + image_location[:, + 4] = ((image_location[:, 3] - image_location[:, 1]) * + (image_location[:, 2] - image_location[:, 0]) / + (float(image_w) * float(image_h))) + + image_location[:, 
0] = image_location[:, 0] / float(image_w) + image_location[:, 1] = image_location[:, 1] / float(image_h) + image_location[:, 2] = image_location[:, 2] / float(image_w) + image_location[:, 3] = image_location[:, 3] / float(image_h) + + g_location = np.array([0, 0, 1, 1, 1]) + image_location = np.concatenate( + [np.expand_dims(g_location, axis=0), image_location], axis=0) + return features, num_boxes, image_location, action_features + + def gen_feature(self): + num_inst = len(self.image_entries) #1000 + self.features_all = np.zeros( + (num_inst, self.max_region_num, self.vision_feature_dim)) + self.action_features_all = np.zeros( + (num_inst, self._max_action_num, self.action_feature_dim)) + self.spatials_all = np.zeros( + (num_inst, self.max_region_num, self.spatials_dim)) + self.image_mask_all = np.zeros((num_inst, self.max_region_num)) + self.action_mask_all = np.zeros((num_inst, self._max_action_num)) + + for i, image_id in enumerate(self.image_entries): + features, num_boxes, boxes, action_features = self.get_image_feature( + image_id) + + mix_num_boxes = min(int(num_boxes), self.max_region_num) + mix_boxes_pad = np.zeros((self.max_region_num, self.spatials_dim)) + mix_features_pad = np.zeros( + (self.max_region_num, self.vision_feature_dim)) + + image_mask = [1] * (int(mix_num_boxes)) + while len(image_mask) < self.max_region_num: + image_mask.append(0) + action_mask = [1] * (self._max_action_num) + while len(action_mask) < self._max_action_num: + action_mask.append(0) + + mix_boxes_pad[:mix_num_boxes] = boxes[:mix_num_boxes] + mix_features_pad[:mix_num_boxes] = features[:mix_num_boxes] + + self.features_all[i] = mix_features_pad + x = action_features.shape[0] + self.action_features_all[i][:x] = action_features[:] + self.image_mask_all[i] = np.array(image_mask) + self.action_mask_all[i] = np.array(action_mask) + self.spatials_all[i] = mix_boxes_pad + + self.features_all = self.features_all.astype("float32") + self.action_features_all = self.action_features_all.astype("float32") + self.image_mask_all = self.image_mask_all.astype("int64") + self.action_mask_all = self.action_mask_all.astype("int64") + self.spatials_all = self.spatials_all.astype("float32") + + def prepare_train(self, idx): + pass + + def prepare_test(self, idx): + entry = self.caption_entries[idx] + caption = entry["token"] + input_mask = entry["input_mask"] + segment_ids = entry["segment_ids"] + + target_all = np.zeros(1000) + for i, image_id in enumerate(self.image_entries): + if image_id == entry["vid_id"]: + target_all[i] = 1 + + return ( + caption, + self.action_features_all, + self.features_all, + self.spatials_all, + segment_ids, + input_mask, + self.image_mask_all, + self.action_mask_all, + target_all, + ) + + def __len__(self): + return len(self.caption_entries) diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/oxford.py b/Bank_second_part/detect_process/paddlevideo/loader/dataset/oxford.py new file mode 100644 index 0000000..a9e65c6 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/dataset/oxford.py @@ -0,0 +1,62 @@ +# Copyright Niantic 2019. Patent Pending. All rights reserved. +# +# This software is licensed under the terms of the Monodepth2 licence +# which allows for non-commercial use only, the full terms of which are made +# available in the LICENSE file. 
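+
+# Index file sketch (an assumption based on how MonoDataset.load_file parses
+# each line as "<folder>/<frame_index>"; the names below are placeholders):
+#   some_sequence/000001
+#   some_sequence/000002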
+ +from __future__ import absolute_import, division, print_function + +import copy +from os import path as osp + +from PIL import Image + +from ..registry import DATASETS +from .base import BaseDataset + + +def pil_loader(path): + # open path as file to avoid ResourceWarning + # (https://github.com/python-pillow/Pillow/issues/835) + with open(path, 'rb') as f: + with Image.open(f) as img: + return img.convert('RGB') + + +@DATASETS.register() +class MonoDataset(BaseDataset): + def __init__(self, + file_path, + data_prefix, + pipeline, + num_retries=0, + suffix='.png', + **kwargs): + self.num_retries = num_retries + self.suffix = suffix + super().__init__(file_path, pipeline, data_prefix, **kwargs) + + def load_file(self): + info = [] + with open(self.file_path, 'r') as f: + for line in f: + filename = line.strip() + self.suffix + folder = osp.dirname(filename) + frame_index = line.strip().split('/')[1] + info.append( + dict(data_path=self.data_prefix, + filename=filename, + folder=folder, + frame_index=int(frame_index))) + return info + + def prepare_train(self, idx): + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + results['imgs']['idx'] = idx + return results['imgs'], results['day_or_night'] + + def prepare_test(self, idx): + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + return results['imgs'], results['day_or_night'] diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/skeleton.py b/Bank_second_part/detect_process/paddlevideo/loader/dataset/skeleton.py new file mode 100644 index 0000000..30a3f3e --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/dataset/skeleton.py @@ -0,0 +1,78 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os.path as osp +import copy +import random +import numpy as np +import pickle + +from ..registry import DATASETS +from .base import BaseDataset +from ...utils import get_logger + +logger = get_logger("paddlevideo") + + +@DATASETS.register() +class SkeletonDataset(BaseDataset): + """ + Skeleton dataset for action recognition. + The dataset loads skeleton feature, and apply norm operatations. + Args: + file_path (str): Path to the index file. + pipeline(obj): Define the pipeline of data preprocessing. + data_prefix (str): directory path of the data. Default: None. + test_mode (bool): Whether to bulid the test dataset. Default: False. 
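+        label_path (str): path to the label file (.npy or .pkl); may be omitted
+            at test time, in which case only predictions are returned. Default: None.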
+ """ + def __init__(self, file_path, pipeline, label_path=None, test_mode=False): + self.label_path = label_path + super().__init__(file_path, pipeline, test_mode=test_mode) + + def load_file(self): + """Load feature file to get skeleton information.""" + logger.info("Loading data, it will take some moment...") + self.data = np.load(self.file_path) + if self.label_path: + if self.label_path.endswith('npy'): + self.label = np.load(self.label_path) + elif self.label_path.endswith('pkl'): + with open(self.label_path, 'rb') as f: + sample_name, self.label = pickle.load(f) + else: + logger.info( + "Label path not provided when test_mode={}, here just output predictions." + .format(self.test_mode)) + logger.info("Data Loaded!") + return self.data # used for __len__ + + def prepare_train(self, idx): + """Prepare the feature for training/valid given index. """ + results = dict() + results['data'] = copy.deepcopy(self.data[idx]) + results['label'] = copy.deepcopy(self.label[idx]) + results = self.pipeline(results) + return results['data'], results['label'] + + def prepare_test(self, idx): + """Prepare the feature for test given index. """ + results = dict() + results['data'] = copy.deepcopy(self.data[idx]) + if self.label_path: + results['label'] = copy.deepcopy(self.label[idx]) + results = self.pipeline(results) + return results['data'], results['label'] + else: + results = self.pipeline(results) + return [results['data']] diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/slowfast_video.py b/Bank_second_part/detect_process/paddlevideo/loader/dataset/slowfast_video.py new file mode 100644 index 0000000..1adf89c --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/dataset/slowfast_video.py @@ -0,0 +1,143 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os.path as osp +import copy +import random +import numpy as np + +from ..registry import DATASETS +from .base import BaseDataset +from ...utils import get_logger + +logger = get_logger("paddlevideo") + +@DATASETS.register() +class SFVideoDataset(BaseDataset): + """Video dataset for action recognition + The dataset loads raw videos and apply specified transforms on them. + + The index file is a file with multiple lines, and each line indicates + a sample video with the filepath and label, which are split with a whitesapce. + Example of a inde file: + + .. code-block:: txt + + path/000.mp4 1 + path/001.mp4 1 + path/002.mp4 2 + path/003.mp4 2 + + Args: + file_path(str): Path to the index file. + pipeline(XXX): A sequence of data transforms. + num_ensemble_views(int): temporal segment when multi-crop test + num_spatial_crops(int): spatial crop number when multi-crop test + **kwargs: Keyword arguments for ```BaseDataset```. 
+ + """ + def __init__( + self, + file_path, + pipeline, + num_ensemble_views=1, + num_spatial_crops=1, + num_retries=5, + num_samples_precise_bn=None, + **kwargs, + ): + self.num_ensemble_views = num_ensemble_views + self.num_spatial_crops = num_spatial_crops + self.num_retries = num_retries + self.num_samples_precise_bn = num_samples_precise_bn + super().__init__(file_path, pipeline, **kwargs) + #set random seed + random.seed(0) + np.random.seed(0) + + def load_file(self): + """Load index file to get video information.""" + info = [] + with open(self.file_path, 'r') as fin: + for line in fin: + line_split = line.strip().split() + filename, labels = line_split + if self.data_prefix is not None: + filename = osp.join(self.data_prefix, filename) + for tidx in range(self.num_ensemble_views): + for sidx in range(self.num_spatial_crops): + info.append( + dict( + filename=filename, + labels=int(labels), + temporal_sample_index=tidx, + spatial_sample_index=sidx, + temporal_num_clips=self.num_ensemble_views, + spatial_num_clips=self.num_spatial_crops, + )) + return info + + def prepare_train(self, idx): + """TRAIN & VALID. Prepare the data for training given the index.""" + #Try to catch Exception caused by reading corrupted video file + short_cycle = False + if isinstance(idx, tuple): + idx, short_cycle_idx = idx + short_cycle = True + for ir in range(self.num_retries): + try: + #Multi-grid short cycle + if short_cycle: + results = copy.deepcopy(self.info[idx]) + results['short_cycle_idx'] = short_cycle_idx + else: + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + except Exception as e: + #logger.info(e) + if ir < self.num_retries - 1: + logger.info( + "Error when loading {}, have {} trys, will try again". + format(results['filename'], ir)) + idx = random.randint(0, len(self.info) - 1) + continue + + return results['imgs'][0], results['imgs'][1], np.array( + [results['labels']]) + + def prepare_test(self, idx): + """TEST. Prepare the data for test given the index.""" + #Try to catch Exception caused by reading corrupted video file + for ir in range(self.num_retries): + try: + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + except Exception as e: + logger.info(e) + if ir < self.num_retries - 1: + logger.info( + "Error when loading {}, have {} trys, will try again". + format(results['filename'], ir)) + idx = random.randint(0, len(self.info) - 1) + continue + return results['imgs'][0], results['imgs'][1], np.array( + [results['labels']]), np.array([idx]) + + def __len__(self): + """get the size of the dataset.""" + if self.num_samples_precise_bn is None: + return len(self.info) + else: + random.shuffle(self.info) + return min(self.num_samples_precise_bn, len(self.info)) diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/ucf101_skeleton.py b/Bank_second_part/detect_process/paddlevideo/loader/dataset/ucf101_skeleton.py new file mode 100644 index 0000000..8177933 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/dataset/ucf101_skeleton.py @@ -0,0 +1,89 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os.path as osp +import copy +import random +import numpy as np +import pickle + +import paddle +from paddle.io import Dataset + +from ..registry import DATASETS +from .base import BaseDataset +from ...utils import get_logger + +logger = get_logger("paddlevideo") + + +@DATASETS.register() +class UCF101SkeletonDataset(BaseDataset): + """ + Skeleton dataset for action recognition. + The dataset loads skeleton feature, and apply norm operatations. + Args: + file_path (str): Path to the index file. + pipeline(obj): Define the pipeline of data preprocessing. + test_mode (bool): Whether to bulid the test dataset. Default: False. + """ + + def __init__(self, + file_path, + pipeline, + split, + repeat_times, + test_mode=False): + self.split = split + self.repeat_times = repeat_times + super().__init__(file_path, pipeline, test_mode=test_mode) + self._ori_len = len(self.info) + self.start_index = 0 + self.modality = "Pose" + + def load_file(self): + """Load annotation file to get video information.""" + assert self.file_path.endswith('.pkl') + return self.load_pkl_annotations() + + def load_pkl_annotations(self): + with open(self.file_path, "rb") as f: + data = pickle.load(f) + + if self.split: + split, data = data['split'], data['annotations'] + identifier = 'filename' if 'filename' in data[0] else 'frame_dir' + data = [x for x in data if x[identifier] in split[self.split]] + + return data + + def prepare_train(self, idx): + """Prepare the frames for training given the index.""" + results = copy.deepcopy(self.info[idx % self._ori_len]) + results['modality'] = self.modality + results['start_index'] = self.start_index + + return self.pipeline(results) + + def prepare_test(self, idx): + """Prepare the frames for testing given the index.""" + results = copy.deepcopy(self.info[idx % self._ori_len]) + results['modality'] = self.modality + results['start_index'] = self.start_index + + return self.pipeline(results) + + def __len__(self): + """get the size of the dataset.""" + return len(self.info) * self.repeat_times diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/ucf24_dataset.py b/Bank_second_part/detect_process/paddlevideo/loader/dataset/ucf24_dataset.py new file mode 100644 index 0000000..ad2e84e --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/dataset/ucf24_dataset.py @@ -0,0 +1,76 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
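+
+# Index file sketch for UCF24Dataset below (the path pattern comes from the
+# inline comment in load_file; the concrete class/video names are placeholders):
+#   data/ucf24/labels/Basketball/v_Basketball_g01_c01/00009.txt
+# Each label path is mapped to its key frame by replacing 'labels' with
+# 'rgb-images' and '.txt' with '.jpg'.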
+ +import copy +import numpy as np + +from ..registry import DATASETS +from .base import BaseDataset +from ...utils import get_logger + +logger = get_logger("paddlevideo") + + +@DATASETS.register() +class UCF24Dataset(BaseDataset): + """Dataset for YOWO + The dataset loads raw videos and apply specified transforms on them. + The index file is a file with multiple lines, and each line indicates + a sample video with the filepath and label, which are split with a whitesapce. + Example of a inde file: + .. code-block:: txt + + Args: + file_path(str): Path to the index file. + pipeline(XXX): A sequence of data transforms. + **kwargs: Keyword arguments for ```BaseDataset```. + """ + + def __init__(self, file_path, pipeline, num_retries=5, **kwargs): + self.num_retries = num_retries + super().__init__(file_path, pipeline, **kwargs) + + def load_file(self): + """Load index file to get video information.""" + info = [] + with open(self.file_path, 'r') as fin: + lines = fin.readlines() + for line in lines: + line = line.strip() # 'data/ucf24/labels/class_name/video_name/key_frame.txt' + filename = line.replace('txt', 'jpg').replace( + 'labels', 'rgb-images') # key frame path + + info.append(dict(filename=filename)) + return info + + def prepare_train(self, idx): + """TRAIN & VALID. Prepare the data for training/valid given the index.""" + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + im_path = results['filename'] + im_path = im_path.replace('jpg', 'txt') + im_split = im_path.split('/') + frame_index = im_split[3] + '_' + im_split[4] + '_' + im_split[5] + return results['imgs'], np.array([results['labels']]), frame_index + + def prepare_test(self, idx): + """TEST. Prepare the data for test given the index.""" + # Try to catch Exception caused by reading corrupted video file + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + im_path = results['filename'] + im_path = im_path.replace('jpg', 'txt') + im_split = im_path.split('/') + frame_index = im_split[3] + '_' + im_split[4] + '_' + im_split[5] + return results['imgs'], np.array([results['labels']]), frame_index diff --git a/Bank_second_part/detect_process/paddlevideo/loader/dataset/video.py b/Bank_second_part/detect_process/paddlevideo/loader/dataset/video.py new file mode 100644 index 0000000..f2d8f89 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/dataset/video.py @@ -0,0 +1,95 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os.path as osp +import copy +import random +import numpy as np + +from ..registry import DATASETS +from .base import BaseDataset +from ...utils import get_logger + +logger = get_logger("paddlevideo") + + +@DATASETS.register() +class VideoDataset(BaseDataset): + """Video dataset for action recognition + The dataset loads raw videos and apply specified transforms on them. 
+ The index file is a file with multiple lines, and each line indicates + a sample video with the filepath and label, which are split with a whitesapce. + Example of a inde file: + .. code-block:: txt + path/000.mp4 1 + path/001.mp4 1 + path/002.mp4 2 + path/003.mp4 2 + Args: + file_path(str): Path to the index file. + pipeline(XXX): A sequence of data transforms. + **kwargs: Keyword arguments for ```BaseDataset```. + """ + def __init__(self, file_path, pipeline, num_retries=5, suffix='', **kwargs): + self.num_retries = num_retries + self.suffix = suffix + super().__init__(file_path, pipeline, **kwargs) + + def load_file(self): + """Load index file to get video information.""" + info = [] + with open(self.file_path, 'r') as fin: + for line in fin: + line_split = line.strip().split() + filename, labels = line_split + #TODO(hj): Required suffix format: may mp4/avi/wmv + filename = filename + self.suffix + if self.data_prefix is not None: + filename = osp.join(self.data_prefix, filename) + info.append(dict(filename=filename, labels=int(labels))) + return info + + def prepare_train(self, idx): + """TRAIN & VALID. Prepare the data for training/valid given the index.""" + #Try to catch Exception caused by reading corrupted video file + for ir in range(self.num_retries): + try: + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + except Exception as e: + #logger.info(e) + if ir < self.num_retries - 1: + logger.info( + "Error when loading {}, have {} trys, will try again". + format(results['filename'], ir)) + idx = random.randint(0, len(self.info) - 1) + continue + return results['imgs'], np.array([results['labels']]) + + def prepare_test(self, idx): + """TEST. Prepare the data for test given the index.""" + #Try to catch Exception caused by reading corrupted video file + for ir in range(self.num_retries): + try: + results = copy.deepcopy(self.info[idx]) + results = self.pipeline(results) + except Exception as e: + #logger.info(e) + if ir < self.num_retries - 1: + logger.info( + "Error when loading {}, have {} trys, will try again". + format(results['filename'], ir)) + idx = random.randint(0, len(self.info) - 1) + continue + return results['imgs'], np.array([results['labels']]) diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__init__.py b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__init__.py new file mode 100644 index 0000000..6e6afdc --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__init__.py @@ -0,0 +1,56 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
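+
+# The ops imported below register themselves into the PIPELINES registry and
+# act as callables mapping a `results` dict to a `results` dict, so they can be
+# chained (usually via Compose). A commented-out, illustrative sketch only;
+# the argument values are placeholders, not the project's real configs:
+# results = {'filename': 'path/000.mp4'}
+# for op in [VideoDecoder(), Sampler(8, 1, valid_mode=True), Scale(256)]:
+#     results = op(results)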
+ +from .anet_pipeline import GetMatchMap, GetVideoLabel, LoadFeat +from .augmentations import (CenterCrop, ColorJitter, GroupRandomFlip, + GroupResize, Image2Array, JitterScale, MultiCrop, + Normalization, PackOutput, RandomCrop, RandomFlip, + RandomResizedCrop, Scale, TenCrop, ToArray, + UniformCrop, RandomGamma, MultiCenterCrop, + RandomBrightness, RandomHue, RandomSaturation, YowoAug) +from .augmentations_ava import * +from .compose import Compose +from .decode import FeatureDecoder, FrameDecoder, VideoDecoder, ActionFeatureDecoder +from .decode_image import ImageDecoder +from .decode_sampler import DecodeSampler +from .mix import Cutmix, Mixup, VideoMix +from .multimodal import FeaturePadding, RandomCap, RandomMask, Tokenize +from .sample import Sampler, SamplerPkl +from .sample_ava import * +from .segmentation import MultiNorm, MultiRestrictSize +from .skeleton_pipeline import AutoPadding, Iden, SkeletonNorm +from .skeleton_pipeline import SketeonCropSample, SketeonModalityTransform, RandomRotation +from .skeleton_pipeline import (UniformSampleFrames, PoseDecode, PoseCompact, + RandomResizedCrop_V2, Flip_V2, CenterCrop_V2, + GeneratePoseTarget, FormatShape, Collect) +from .decode_sampler_MRI import SFMRI_DecodeSampler +from .segmentation_pipline import SegmentationSampler +from .sample_ucf24 import SamplerUCF24 + +__all__ = [ + 'ImageDecoder', 'RandomMask', 'UniformCrop', 'SkeletonNorm', 'Tokenize', + 'Sampler', 'FeatureDecoder', 'DecodeSampler', 'TenCrop', 'Compose', + 'AutoPadding', 'Normalization', 'Mixup', 'Image2Array', 'Scale', + 'GroupResize', 'VideoDecoder', 'FrameDecoder', 'PackOutput', + 'ActionFeatureDecoder', 'GetVideoLabel', 'Cutmix', 'CenterCrop', + 'RandomCrop', 'LoadFeat', 'RandomCap', 'JitterScale', 'Iden', 'VideoMix', + 'ColorJitter', 'RandomFlip', 'ToArray', 'FeaturePadding', 'GetMatchMap', + 'GroupRandomFlip', 'MultiCrop', 'SFMRI_DecodeSampler', 'MultiRestrictSize', + 'MultiNorm', 'RandomResizedCrop', 'SamplerPkl', 'SegmentationSampler', + 'SketeonCropSample', 'SketeonModalityTransform', 'RandomRotation', + 'RandomGamma', 'MultiCenterCrop', 'RandomBrightness', 'RandomHue', + 'RandomSaturation', 'UniformSampleFrames', 'PoseDecode', 'PoseCompact', + 'Resize', 'RandomResizedCrop_V2', 'Flip_V2', 'GeneratePoseTarget', + 'FormatShape', 'Collect', 'RandomSaturation', 'SamplerUCF24', 'YowoAug' +] diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/__init__.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..303f568 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/__init__.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/anet_pipeline.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/anet_pipeline.cpython-310.pyc new file mode 100644 index 0000000..5e4b7eb Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/anet_pipeline.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/augmentations.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/augmentations.cpython-310.pyc new file mode 100644 index 0000000..d785703 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/augmentations.cpython-310.pyc differ 
diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/augmentations_ava.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/augmentations_ava.cpython-310.pyc new file mode 100644 index 0000000..ce471b7 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/augmentations_ava.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/compose.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/compose.cpython-310.pyc new file mode 100644 index 0000000..8a2e06c Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/compose.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/decode.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/decode.cpython-310.pyc new file mode 100644 index 0000000..72e5884 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/decode.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/decode_image.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/decode_image.cpython-310.pyc new file mode 100644 index 0000000..ddf1ab1 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/decode_image.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/decode_sampler.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/decode_sampler.cpython-310.pyc new file mode 100644 index 0000000..99b86d9 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/decode_sampler.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/decode_sampler_MRI.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/decode_sampler_MRI.cpython-310.pyc new file mode 100644 index 0000000..17917fa Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/decode_sampler_MRI.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/mix.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/mix.cpython-310.pyc new file mode 100644 index 0000000..0d47a42 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/mix.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/multimodal.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/multimodal.cpython-310.pyc new file mode 100644 index 0000000..fcf9da5 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/multimodal.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/sample.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/sample.cpython-310.pyc new file mode 100644 index 0000000..b90beeb Binary files /dev/null and 
b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/sample.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/sample_ava.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/sample_ava.cpython-310.pyc new file mode 100644 index 0000000..0187f00 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/sample_ava.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/sample_ucf24.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/sample_ucf24.cpython-310.pyc new file mode 100644 index 0000000..0cf5b4f Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/sample_ucf24.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/segmentation.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/segmentation.cpython-310.pyc new file mode 100644 index 0000000..7dd91b3 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/segmentation.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/segmentation_pipline.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/segmentation_pipline.cpython-310.pyc new file mode 100644 index 0000000..00d87e4 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/segmentation_pipline.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/skeleton_pipeline.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/skeleton_pipeline.cpython-310.pyc new file mode 100644 index 0000000..e4ddc34 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/__pycache__/skeleton_pipeline.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/anet_pipeline.py b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/anet_pipeline.py new file mode 100644 index 0000000..210d733 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/anet_pipeline.py @@ -0,0 +1,150 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import numpy as np +from ..registry import PIPELINES +"""pipeline ops for Activity Net. 
+""" + + +@PIPELINES.register() +class LoadFeat(object): + def __init__(self, feat_path): + self.feat_path = feat_path + + def __call__(self, results): + video_name = results['video_name'] + file_name = video_name + ".npy" + file_path = os.path.join(self.feat_path, file_name) + #TODO: check path + video_feat = np.load(file_path) + video_feat = video_feat.T + video_feat = video_feat.astype("float32") + results['video_feat'] = video_feat + return results + + +@PIPELINES.register() +class GetMatchMap(object): + def __init__(self, tscale): + self.tscale = tscale + self.tgap = 1. / self.tscale + + def __call__(self, results): + match_map = [] + for idx in range(self.tscale): + tmp_match_window = [] + xmin = self.tgap * idx + for jdx in range(1, self.tscale + 1): + xmax = xmin + self.tgap * jdx + tmp_match_window.append([xmin, xmax]) + match_map.append(tmp_match_window) + match_map = np.array(match_map) + match_map = np.transpose(match_map, [1, 0, 2]) + match_map = np.reshape(match_map, [-1, 2]) + + anchor_xmin = [self.tgap * i for i in range(self.tscale)] + anchor_xmax = [self.tgap * i for i in range(1, self.tscale + 1)] + + results['match_map'] = match_map + results['anchor_xmin'] = anchor_xmin + results['anchor_xmax'] = anchor_xmax + return results + + +@PIPELINES.register() +class GetVideoLabel(object): + def __init__(self, tscale, dscale, datatype="float32"): + self.tscale = tscale + self.dscale = dscale + self.tgap = 1. / self.tscale + self.datatype = datatype + + def iou_with_anchors(self, anchors_min, anchors_max, box_min, box_max): + """Compute jaccard score between a box and the anchors. + """ + len_anchors = anchors_max - anchors_min + int_xmin = np.maximum(anchors_min, box_min) + int_xmax = np.minimum(anchors_max, box_max) + inter_len = np.maximum(int_xmax - int_xmin, 0.) + union_len = len_anchors - inter_len + box_max - box_min + jaccard = np.divide(inter_len, union_len) + return jaccard + + def ioa_with_anchors(self, anchors_min, anchors_max, box_min, box_max): + """Compute intersection between score a box and the anchors. + """ + len_anchors = anchors_max - anchors_min + int_xmin = np.maximum(anchors_min, box_min) + int_xmax = np.minimum(anchors_max, box_max) + inter_len = np.maximum(int_xmax - int_xmin, 0.) 
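+ # Unlike the IoU above, IoA normalizes the intersection by the anchor length alone, i.e. the fraction of each anchor covered by the box.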
+ scores = np.divide(inter_len, len_anchors) + return scores + + def __call__(self, results): + video_info = results['video_info'] + match_map = results['match_map'] + anchor_xmin = results['anchor_xmin'] + anchor_xmax = results['anchor_xmax'] + + video_second = video_info['duration_second'] + video_labels = video_info['annotations'] + + gt_bbox = [] + gt_iou_map = [] + for gt in video_labels: + tmp_start = max(min(1, gt["segment"][0] / video_second), 0) + tmp_end = max(min(1, gt["segment"][1] / video_second), 0) + gt_bbox.append([tmp_start, tmp_end]) + tmp_gt_iou_map = self.iou_with_anchors(match_map[:, 0], + match_map[:, 1], tmp_start, + tmp_end) + tmp_gt_iou_map = np.reshape(tmp_gt_iou_map, + [self.dscale, self.tscale]) + gt_iou_map.append(tmp_gt_iou_map) + gt_iou_map = np.array(gt_iou_map) + gt_iou_map = np.max(gt_iou_map, axis=0) + + gt_bbox = np.array(gt_bbox) + gt_xmins = gt_bbox[:, 0] + gt_xmaxs = gt_bbox[:, 1] + gt_len_small = 3 * self.tgap + gt_start_bboxs = np.stack( + (gt_xmins - gt_len_small / 2, gt_xmins + gt_len_small / 2), axis=1) + gt_end_bboxs = np.stack( + (gt_xmaxs - gt_len_small / 2, gt_xmaxs + gt_len_small / 2), axis=1) + + match_score_start = [] + for jdx in range(len(anchor_xmin)): + match_score_start.append( + np.max( + self.ioa_with_anchors(anchor_xmin[jdx], anchor_xmax[jdx], + gt_start_bboxs[:, 0], + gt_start_bboxs[:, 1]))) + match_score_end = [] + for jdx in range(len(anchor_xmin)): + match_score_end.append( + np.max( + self.ioa_with_anchors(anchor_xmin[jdx], anchor_xmax[jdx], + gt_end_bboxs[:, 0], gt_end_bboxs[:, + 1]))) + + gt_start = np.array(match_score_start) + gt_end = np.array(match_score_end) + + results['gt_iou_map'] = gt_iou_map.astype(self.datatype) + results['gt_start'] = gt_start.astype(self.datatype) + results['gt_end'] = gt_end.astype(self.datatype) + return results diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/augmentations.py b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/augmentations.py new file mode 100644 index 0000000..24f3c71 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/augmentations.py @@ -0,0 +1,1427 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import random +from collections.abc import Sequence + +import cv2 +import numpy as np +import paddle +import paddle.nn.functional as F +from PIL import Image + +from ..registry import PIPELINES + + +@PIPELINES.register() +class Scale(object): + """ + Scale images. + Args: + short_size(float | int): Short size of an image will be scaled to the short_size. + fixed_ratio(bool): Set whether to zoom according to a fixed ratio. default: True + do_round(bool): Whether to round up when calculating the zoom ratio. default: False + backend(str): Choose pillow or cv2 as the graphics processing backend. 
default: 'pillow' + """ + def __init__(self, + short_size, + fixed_ratio=True, + keep_ratio=None, + do_round=False, + backend='pillow'): + self.short_size = short_size + assert (fixed_ratio and not keep_ratio) or (not fixed_ratio), \ + f"fixed_ratio and keep_ratio cannot be true at the same time" + self.fixed_ratio = fixed_ratio + self.keep_ratio = keep_ratio + self.do_round = do_round + + assert backend in [ + 'pillow', 'cv2' + ], f"Scale's backend must be pillow or cv2, but get {backend}" + self.backend = backend + + def __call__(self, results): + """ + Performs resize operations. + Args: + imgs (Sequence[PIL.Image]): List where each item is a PIL.Image. + For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...] + return: + resized_imgs: List where each item is a PIL.Image after scaling. + """ + imgs = results['imgs'] + resized_imgs = [] + for i in range(len(imgs)): + img = imgs[i] + if isinstance(img, np.ndarray): + h, w, _ = img.shape + elif isinstance(img, Image.Image): + w, h = img.size + else: + raise NotImplementedError + if (w <= h and w == self.short_size) or (h <= w + and h == self.short_size): + if self.backend == 'pillow' and not isinstance( + img, Image.Image): + img = Image.fromarray(img) + resized_imgs.append(img) + continue + + if w <= h: + ow = self.short_size + if self.fixed_ratio: + oh = int(self.short_size * 4.0 / 3.0) + elif self.keep_ratio is False: + oh = self.short_size + else: + scale_factor = self.short_size / w + oh = int(h * float(scale_factor) + + 0.5) if self.do_round else int(h * + self.short_size / w) + ow = int(w * float(scale_factor) + + 0.5) if self.do_round else self.short_size + else: + oh = self.short_size + if self.fixed_ratio: + ow = int(self.short_size * 4.0 / 3.0) + elif self.keep_ratio is False: + ow = self.short_size + else: + scale_factor = self.short_size / h + oh = int(h * float(scale_factor) + + 0.5) if self.do_round else self.short_size + ow = int(w * float(scale_factor) + + 0.5) if self.do_round else int(w * + self.short_size / h) + if self.backend == 'pillow': + resized_imgs.append(img.resize((ow, oh), Image.BILINEAR)) + elif self.backend == 'cv2' and (self.keep_ratio is not None): + resized_imgs.append( + cv2.resize(img, (ow, oh), interpolation=cv2.INTER_LINEAR)) + else: + resized_imgs.append( + Image.fromarray( + cv2.resize(np.asarray(img), (ow, oh), + interpolation=cv2.INTER_LINEAR))) + results['imgs'] = resized_imgs + return results + + +@PIPELINES.register() +class RandomCrop(object): + """ + Random crop images. + Args: + target_size(int): Random crop a square with the target_size from an image. + """ + def __init__(self, target_size): + self.target_size = target_size + + def __call__(self, results): + """ + Performs random crop operations. + Args: + imgs: List where each item is a PIL.Image. + For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...] + return: + crop_imgs: List where each item is a PIL.Image after random crop. 
+ + """ + imgs = results['imgs'] + if 'backend' in results and results['backend'] == 'pyav': # [c,t,h,w] + h, w = imgs.shape[2:] + else: + w, h = imgs[0].size + th, tw = self.target_size, self.target_size + + assert (w >= self.target_size) and (h >= self.target_size), \ + "image width({}) and height({}) should be larger than crop size".format( + w, h, self.target_size) + + crop_images = [] + if 'backend' in results and results['backend'] == 'pyav': + x1 = np.random.randint(0, w - tw) + y1 = np.random.randint(0, h - th) + crop_images = imgs[:, :, y1:y1 + th, x1:x1 + tw] # [C, T, th, tw] + else: + x1 = random.randint(0, w - tw) + y1 = random.randint(0, h - th) + for img in imgs: + if w == tw and h == th: + crop_images.append(img) + else: + crop_images.append(img.crop((x1, y1, x1 + tw, y1 + th))) + results['imgs'] = crop_images + return results + + +@PIPELINES.register() +class RandomResizedCrop(RandomCrop): + def __init__(self, + area_range=(0.08, 1.0), + aspect_ratio_range=(3 / 4, 4 / 3), + target_size=224, + backend='cv2'): + + self.area_range = area_range + self.aspect_ratio_range = aspect_ratio_range + self.target_size = target_size + self.backend = backend + + @staticmethod + def get_crop_bbox(img_shape, + area_range, + aspect_ratio_range, + max_attempts=10): + + assert 0 < area_range[0] <= area_range[1] <= 1 + assert 0 < aspect_ratio_range[0] <= aspect_ratio_range[1] + + img_h, img_w = img_shape + area = img_h * img_w + + min_ar, max_ar = aspect_ratio_range + aspect_ratios = np.exp( + np.random.uniform(np.log(min_ar), np.log(max_ar), + size=max_attempts)) + target_areas = np.random.uniform(*area_range, size=max_attempts) * area + candidate_crop_w = np.round(np.sqrt(target_areas * + aspect_ratios)).astype(np.int32) + candidate_crop_h = np.round(np.sqrt(target_areas / + aspect_ratios)).astype(np.int32) + + for i in range(max_attempts): + crop_w = candidate_crop_w[i] + crop_h = candidate_crop_h[i] + if crop_h <= img_h and crop_w <= img_w: + x_offset = random.randint(0, img_w - crop_w) + y_offset = random.randint(0, img_h - crop_h) + return x_offset, y_offset, x_offset + crop_w, y_offset + crop_h + + # Fallback + crop_size = min(img_h, img_w) + x_offset = (img_w - crop_size) // 2 + y_offset = (img_h - crop_size) // 2 + return x_offset, y_offset, x_offset + crop_size, y_offset + crop_size + + def __call__(self, results): + imgs = results['imgs'] + if self.backend == 'pillow': + img_w, img_h = imgs[0].size + elif self.backend == 'cv2': + img_h, img_w, _ = imgs[0].shape + elif self.backend == 'pyav': + img_h, img_w = imgs.shape[2:] # [cthw] + else: + raise NotImplementedError + + left, top, right, bottom = self.get_crop_bbox( + (img_h, img_w), self.area_range, self.aspect_ratio_range) + + if self.backend == 'pillow': + img_w, img_h = imgs[0].size + imgs = [img.crop((left, top, right, bottom)) for img in imgs] + elif self.backend == 'cv2': + img_h, img_w, _ = imgs[0].shape + imgs = [img[top:bottom, left:right] for img in imgs] + elif self.backend == 'pyav': + img_h, img_w = imgs.shape[2:] # [cthw] + imgs = imgs[:, :, top:bottom, left:right] + else: + raise NotImplementedError + results['imgs'] = imgs + return results + + +@PIPELINES.register() +class CenterCrop(object): + """ + Center crop images. + Args: + target_size(int): Center crop a square with the target_size from an image. + do_round(bool): Whether to round up the coordinates of the upper left corner of the cropping area.
default: True + """ + def __init__(self, target_size, do_round=True, backend='pillow'): + self.target_size = target_size + self.do_round = do_round + self.backend = backend + + def __call__(self, results): + """ + Performs Center crop operations. + Args: + imgs: List where each item is a PIL.Image. + For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...] + return: + ccrop_imgs: List where each item is a PIL.Image after Center crop. + """ + imgs = results['imgs'] + ccrop_imgs = [] + th, tw = self.target_size, self.target_size + if isinstance(imgs, paddle.Tensor): + h, w = imgs.shape[-2:] + x1 = int(round((w - tw) / 2.0)) if self.do_round else (w - tw) // 2 + y1 = int(round((h - th) / 2.0)) if self.do_round else (h - th) // 2 + ccrop_imgs = imgs[:, :, y1:y1 + th, x1:x1 + tw] + else: + for img in imgs: + if self.backend == 'pillow': + w, h = img.size + elif self.backend == 'cv2': + h, w, _ = img.shape + else: + raise NotImplementedError + assert (w >= self.target_size) and (h >= self.target_size), \ + "image width({}) and height({}) should be larger than crop size".format( + w, h, self.target_size) + x1 = int(round( + (w - tw) / 2.0)) if self.do_round else (w - tw) // 2 + y1 = int(round( + (h - th) / 2.0)) if self.do_round else (h - th) // 2 + if self.backend == 'cv2': + ccrop_imgs.append(img[y1:y1 + th, x1:x1 + tw]) + elif self.backend == 'pillow': + ccrop_imgs.append(img.crop((x1, y1, x1 + tw, y1 + th))) + results['imgs'] = ccrop_imgs + return results + + +@PIPELINES.register() +class MultiScaleCrop(object): + """ + Random crop images in with multiscale sizes + Args: + target_size(int): Random crop a square with the target_size from an image. + scales(int): List of candidate cropping scales. + max_distort(int): Maximum allowable deformation combination distance. + fix_crop(int): Whether to fix the cutting start point. + allow_duplication(int): Whether to allow duplicate candidate crop starting points. + more_fix_crop(int): Whether to allow more cutting starting points. + """ + def __init__( + self, + target_size, # NOTE: named target size now, but still pass short size in it! + scales=None, + max_distort=1, + fix_crop=True, + allow_duplication=False, + more_fix_crop=True, + backend='pillow'): + + self.target_size = target_size + self.scales = scales if scales else [1, .875, .75, .66] + self.max_distort = max_distort + self.fix_crop = fix_crop + self.allow_duplication = allow_duplication + self.more_fix_crop = more_fix_crop + assert backend in [ + 'pillow', 'cv2' + ], f"MultiScaleCrop's backend must be pillow or cv2, but get {backend}" + self.backend = backend + + def __call__(self, results): + """ + Performs MultiScaleCrop operations. + Args: + imgs: List where wach item is a PIL.Image. 
+ XXX: + results: + + """ + imgs = results['imgs'] + + input_size = [self.target_size, self.target_size] + + im_size = imgs[0].size + + # get random crop offset + def _sample_crop_size(im_size): + image_w, image_h = im_size[0], im_size[1] + + base_size = min(image_w, image_h) + crop_sizes = [int(base_size * x) for x in self.scales] + crop_h = [ + input_size[1] if abs(x - input_size[1]) < 3 else x + for x in crop_sizes + ] + crop_w = [ + input_size[0] if abs(x - input_size[0]) < 3 else x + for x in crop_sizes + ] + + pairs = [] + for i, h in enumerate(crop_h): + for j, w in enumerate(crop_w): + if abs(i - j) <= self.max_distort: + pairs.append((w, h)) + crop_pair = random.choice(pairs) + if not self.fix_crop: + w_offset = random.randint(0, image_w - crop_pair[0]) + h_offset = random.randint(0, image_h - crop_pair[1]) + else: + w_step = (image_w - crop_pair[0]) / 4 + h_step = (image_h - crop_pair[1]) / 4 + + ret = list() + ret.append((0, 0)) # upper left + if self.allow_duplication or w_step != 0: + ret.append((4 * w_step, 0)) # upper right + if self.allow_duplication or h_step != 0: + ret.append((0, 4 * h_step)) # lower left + if self.allow_duplication or (h_step != 0 and w_step != 0): + ret.append((4 * w_step, 4 * h_step)) # lower right + if self.allow_duplication or (h_step != 0 or w_step != 0): + ret.append((2 * w_step, 2 * h_step)) # center + + if self.more_fix_crop: + ret.append((0, 2 * h_step)) # center left + ret.append((4 * w_step, 2 * h_step)) # center right + ret.append((2 * w_step, 4 * h_step)) # lower center + ret.append((2 * w_step, 0 * h_step)) # upper center + + ret.append((1 * w_step, 1 * h_step)) # upper left quarter + ret.append((3 * w_step, 1 * h_step)) # upper right quarter + ret.append((1 * w_step, 3 * h_step)) # lower left quarter + ret.append((3 * w_step, 3 * h_step)) # lower righ quarter + + w_offset, h_offset = random.choice(ret) + + return crop_pair[0], crop_pair[1], w_offset, h_offset + + crop_w, crop_h, offset_w, offset_h = _sample_crop_size(im_size) + crop_img_group = [ + img.crop((offset_w, offset_h, offset_w + crop_w, offset_h + crop_h)) + for img in imgs + ] + if self.backend == 'pillow': + ret_img_group = [ + img.resize((input_size[0], input_size[1]), Image.BILINEAR) + for img in crop_img_group + ] + else: + ret_img_group = [ + Image.fromarray( + cv2.resize(np.asarray(img), + dsize=(input_size[0], input_size[1]), + interpolation=cv2.INTER_LINEAR)) + for img in crop_img_group + ] + results['imgs'] = ret_img_group + return results + + +@PIPELINES.register() +class RandomFlip(object): + """ + Random Flip images. + Args: + p(float): Random flip images with the probability p. + """ + def __init__(self, p=0.5): + self.p = p + + def __call__(self, results): + """ + Performs random flip operations. + Args: + imgs: List where each item is a PIL.Image. + For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...] + return: + flip_imgs: List where each item is a PIL.Image after random flip. + """ + imgs = results['imgs'] + v = random.random() + if v < self.p: + if isinstance(imgs, paddle.Tensor): + results['imgs'] = paddle.flip(imgs, axis=[3]) + elif isinstance(imgs[0], np.ndarray): + results['imgs'] = [cv2.flip(img, 1, img) for img in imgs + ] # [[h,w,c], [h,w,c], ..., [h,w,c]] + else: + results['imgs'] = [ + img.transpose(Image.FLIP_LEFT_RIGHT) for img in imgs + ] + else: + results['imgs'] = imgs + return results + + +@PIPELINES.register() +class RandomBrightness(object): + """ + Random Brightness images. 
+ Args: + p(float): Random brightness images with the probability p. + """ + def __init__(self, p=0.1, brightness=1): + self.p = p + self.brightness = brightness + + def __call__(self, results): + """ + Performs random brightness operations. + Args: + imgs: List where each item is a PIL.Image. + For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...] + return: + brightness_imgs: List where each item is a PIL.Image after random brightness. + """ + imgs = results['imgs'] + v = random.random() + + if v < self.p: + transform = ColorJitter(brightness=self.brightness) + results['imgs'] = [transform(img) for img in imgs] + else: + results['imgs'] = imgs + return results + + +@PIPELINES.register() +class RandomSaturation(object): + """ + Random Saturation images. + Args: + p(float): Random saturation images with the probability p. + """ + def __init__(self, p=0.1, saturation=2): + self.p = p + self.saturation = saturation + + def __call__(self, results): + """ + Performs random saturation operations. + Args: + imgs: List where each item is a PIL.Image. + For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...] + return: + saturation_imgs: List where each item is a PIL.Image after random saturation. + """ + imgs = results['imgs'] + v = random.random() + + if v < self.p: + transform = ColorJitter(saturation=self.saturation) + results['imgs'] = [transform(img) for img in imgs] + else: + results['imgs'] = imgs + return results + + +@PIPELINES.register() +class RandomHue(object): + """ + Random Hue images. + Args: + p(float): Random hue images with the probability p. + """ + def __init__(self, p=0.1, hue=0.5): + self.p = p + self.hue = hue + + def __call__(self, results): + """ + Performs random hue operations. + Args: + imgs: List where each item is a PIL.Image. + For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...] + return: + hue_imgs: List where each item is a PIL.Image after random hue. + """ + imgs = results['imgs'] + v = random.random() + + if v < self.p: + transform = ColorJitter(hue=self.hue) + results['imgs'] = [transform(img) for img in imgs] + else: + results['imgs'] = imgs + return results + + +@PIPELINES.register() +class RandomGamma(object): + """ + Random Gamma images. + Args: + p(float): Random gamma images with the probability p. + gamma (float): Non negative real number, same as `\\gamma` in the equation. + gamma larger than 1 make the shadows darker, + while gamma smaller than 1 make dark regions lighter. + """ + def __init__(self, p=0.1, gamma=0.2): + self.p = p + self.value = [1 - gamma, 1 + gamma] + self.value[0] = max(self.value[0], 0) + + def _adust_gamma(self, img, gamma, gain=1.0): + flag = False + if isinstance(img, np.ndarray): + flag = True + img = Image.fromarray(img) + input_mode = img.mode + img = img.convert("RGB") + gamma_map = [ + int((255 + 1 - 1e-3) * gain * pow(ele / 255.0, gamma)) + for ele in range(256) + ] * 3 + img = img.point( + gamma_map) # use PIL's point-function to accelerate this part + img = img.convert(input_mode) + if flag: + img = np.array(img) + return img + + def __call__(self, results): + """ + Performs random gamma operations. + Args: + imgs: List where each item is a PIL.Image. + For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...] + return: + gamma_imgs: List where each item is a PIL.Image after random gamma. 
+ """ + imgs = results['imgs'] + v = random.random() + + if v < self.p: + gamma = random.uniform(self.value[0], self.value[1]) + results['imgs'] = [self._adust_gamma(img, gamma) for img in imgs] + else: + results['imgs'] = imgs + return results + + +@PIPELINES.register() +class Image2Array(object): + """ + transfer PIL.Image to Numpy array and transpose dimensions from 'dhwc' to 'dchw'. + Args: + transpose: whether to transpose or not, default True, False for slowfast. + """ + def __init__(self, transpose=True, data_format='tchw'): + assert data_format in [ + 'tchw', 'cthw' + ], f"Target format must in ['tchw', 'cthw'], but got {data_format}" + self.transpose = transpose + self.data_format = data_format + + def __call__(self, results): + """ + Performs Image to NumpyArray operations. + Args: + imgs: List where each item is a PIL.Image. + For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...] + return: + np_imgs: Numpy array. + """ + imgs = results['imgs'] + if 'backend' in results and results[ + 'backend'] == 'pyav': # [T,H,W,C] in [0, 1] + if self.transpose: + if self.data_format == 'tchw': + t_imgs = imgs.transpose((0, 3, 1, 2)) # tchw + else: + t_imgs = imgs.transpose((3, 0, 1, 2)) # cthw + results['imgs'] = t_imgs + else: + t_imgs = np.stack(imgs).astype('float32') + if self.transpose: + if self.data_format == 'tchw': + t_imgs = t_imgs.transpose(0, 3, 1, 2) # tchw + else: + t_imgs = t_imgs.transpose(3, 0, 1, 2) # cthw + results['imgs'] = t_imgs + return results + + +@PIPELINES.register() +class Normalization(object): + """ + Normalization. + Args: + mean(Sequence[float]): mean values of different channels. + std(Sequence[float]): std values of different channels. + tensor_shape(list): size of mean, default [3,1,1]. For slowfast, [1,1,1,3] + """ + def __init__(self, mean, std, tensor_shape=[3, 1, 1], inplace=False): + if not isinstance(mean, Sequence): + raise TypeError( + f'Mean must be list, tuple or np.ndarray, but got {type(mean)}') + if not isinstance(std, Sequence): + raise TypeError( + f'Std must be list, tuple or np.ndarray, but got {type(std)}') + + self.inplace = inplace + if not inplace: + self.mean = np.array(mean).reshape(tensor_shape).astype(np.float32) + self.std = np.array(std).reshape(tensor_shape).astype(np.float32) + else: + self.mean = np.array(mean, dtype=np.float32) + self.std = np.array(std, dtype=np.float32) + + def __call__(self, results): + """ + Performs normalization operations. + Args: + imgs: Numpy array. + return: + np_imgs: Numpy array after normalization. + """ + if self.inplace: + n = len(results['imgs']) + h, w, c = results['imgs'][0].shape + norm_imgs = np.empty((n, h, w, c), dtype=np.float32) + for i, img in enumerate(results['imgs']): + norm_imgs[i] = img + + for img in norm_imgs: # [n,h,w,c] + mean = np.float64(self.mean.reshape(1, -1)) # [1, 3] + stdinv = 1 / np.float64(self.std.reshape(1, -1)) # [1, 3] + cv2.subtract(img, mean, img) + cv2.multiply(img, stdinv, img) + else: + imgs = results['imgs'] + norm_imgs = imgs / 255.0 + norm_imgs -= self.mean + norm_imgs /= self.std + if 'backend' in results and results['backend'] == 'pyav': + norm_imgs = paddle.to_tensor(norm_imgs, dtype=paddle.float32) + results['imgs'] = norm_imgs + return results + + +@PIPELINES.register() +class JitterScale(object): + """ + Scale image, while the target short size is randomly select between min_size and max_size. + Args: + min_size: Lower bound for random sampler. + max_size: Higher bound for random sampler. 
+ """ + def __init__(self, + min_size, + max_size, + short_cycle_factors=[0.5, 0.7071], + default_min_size=256): + self.default_min_size = default_min_size + self.orig_min_size = self.min_size = min_size + self.max_size = max_size + self.short_cycle_factors = short_cycle_factors + + def __call__(self, results): + """ + Performs jitter resize operations. + Args: + imgs (Sequence[PIL.Image]): List where each item is a PIL.Image. + For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...] + return: + resized_imgs: List where each item is a PIL.Image after scaling. + """ + short_cycle_idx = results.get('short_cycle_idx') + if short_cycle_idx in [0, 1]: + self.min_size = int( + round(self.short_cycle_factors[short_cycle_idx] * + self.default_min_size)) + else: + self.min_size = self.orig_min_size + + imgs = results['imgs'] + size = int(round(np.random.uniform(self.min_size, self.max_size))) + assert (len(imgs) >= 1), \ + "len(imgs):{} should be larger than 1".format(len(imgs)) + + if 'backend' in results and results['backend'] == 'pyav': + height, width = imgs.shape[2:] + else: + width, height = imgs[0].size + if (width <= height and width == size) or (height <= width + and height == size): + return results + + new_width = size + new_height = size + if width < height: + new_height = int(math.floor((float(height) / width) * size)) + else: + new_width = int(math.floor((float(width) / height) * size)) + + if 'backend' in results and results['backend'] == 'pyav': + frames_resize = F.interpolate(imgs, + size=(new_height, new_width), + mode="bilinear", + align_corners=False) # [c,t,h,w] + else: + frames_resize = [] + for j in range(len(imgs)): + img = imgs[j] + scale_img = img.resize((new_width, new_height), Image.BILINEAR) + frames_resize.append(scale_img) + + results['imgs'] = frames_resize + return results + + +@PIPELINES.register() +class MultiCenterCrop(object): + """ + center crop, left center crop right center crop + Args: + target_size(int): Random crop a square with the target_size from an image. + """ + def __init__(self, target_size): + self.target_size = target_size + + def __call__(self, results): + """ + Performs random crop operations. + Args: + imgs: List where each item is a PIL.Image. + For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...] + return: + crop_imgs: List where each item is a PIL.Image after random crop. 
+ """ + imgs = results['imgs'] + if 'backend' in results and results['backend'] == 'pyav': # [c,t,h,w] + h, w = imgs.shape[2:] + else: + w, h = imgs[0].size + th, tw = self.target_size, self.target_size + + assert (w >= self.target_size) and (h >= self.target_size), \ + "image width({}) and height({}) should be larger than crop size".format( + w, h, self.target_size) + + crop_images = [] + #just for tensor + crop_imgs_center = [] + crop_imgs_left = [] + crop_imgs_right = [] + if 'backend' in results and results['backend'] == 'pyav': + #center_corp + x1 = 0 + if w > self.target_size: + x1 = int((w - self.target_size) / 2.0) + y1 = 0 + if h > self.target_size: + y1 = int((h - self.target_size) / 2.0) + crop_imgs_center = imgs[:, :, y1:y1 + th, + x1:x1 + tw].numpy() # [C, T, th, tw] + #left_crop + x1 = 0 + y1 = 0 + if h > self.target_size: + y1 = int((h - self.target_size) / 2.0) + crop_imgs_left = imgs[:, :, y1:y1 + th, x1:x1 + tw].numpy() + #right_crop + x1 = 0 + y1 = 0 + if w > self.target_size: + x1 = w - self.target_size + if h > self.target_size: + y1 = int((h - self.target_size) / 2.0) + crop_imgs_right = imgs[:, :, y1:y1 + th, x1:x1 + tw].numpy() + crop_imgs = np.concatenate( + (crop_imgs_center, crop_imgs_left, crop_imgs_right), axis=1) + crop_images = paddle.to_tensor(crop_imgs) + + else: + x1 = 0 + if w > self.target_size: + x1 = random.randint(0, w - tw) + y1 = 0 + if h > self.target_size: + y1 = random.randint(0, h - th) + for img in imgs: + if w == tw and h == th: + crop_images.append(img) + else: + crop_images.append(img.crop((x1, y1, x1 + tw, y1 + th))) + results['imgs'] = crop_images + return results + + +@PIPELINES.register() +class MultiCrop(object): + """ + Random crop image. + This operation can perform multi-crop during multi-clip test, as in slowfast model. + Args: + target_size(int): Random crop a square with the target_size from an image. + """ + def __init__(self, + target_size, + default_crop_size=224, + short_cycle_factors=[0.5, 0.7071], + test_mode=False): + self.orig_target_size = self.target_size = target_size + self.short_cycle_factors = short_cycle_factors + self.default_crop_size = default_crop_size + self.test_mode = test_mode + + def __call__(self, results): + """ + Performs random crop operations. + Args: + imgs: List where each item is a PIL.Image. + For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...] + return: + crop_imgs: List where each item is a PIL.Image after random crop. 
+ """ + imgs = results['imgs'] + spatial_sample_index = results['spatial_sample_index'] + spatial_num_clips = results['spatial_num_clips'] + + short_cycle_idx = results.get('short_cycle_idx') + if short_cycle_idx in [0, 1]: + self.target_size = int( + round(self.short_cycle_factors[short_cycle_idx] * + self.default_crop_size)) + else: + self.target_size = self.orig_target_size # use saved value before call + + w, h = imgs[0].size + if w == self.target_size and h == self.target_size: + return results + + assert (w >= self.target_size) and (h >= self.target_size), \ + "image width({}) and height({}) should be larger than crop size({},{})".format(w, h, self.target_size, self.target_size) + frames_crop = [] + if not self.test_mode: + x_offset = random.randint(0, w - self.target_size) + y_offset = random.randint(0, h - self.target_size) + else: # multi-crop + x_gap = int( + math.ceil((w - self.target_size) / (spatial_num_clips - 1))) + y_gap = int( + math.ceil((h - self.target_size) / (spatial_num_clips - 1))) + if h > w: + x_offset = int(math.ceil((w - self.target_size) / 2)) + if spatial_sample_index == 0: + y_offset = 0 + elif spatial_sample_index == spatial_num_clips - 1: + y_offset = h - self.target_size + else: + y_offset = y_gap * spatial_sample_index + else: + y_offset = int(math.ceil((h - self.target_size) / 2)) + if spatial_sample_index == 0: + x_offset = 0 + elif spatial_sample_index == spatial_num_clips - 1: + x_offset = w - self.target_size + else: + x_offset = x_gap * spatial_sample_index + + for img in imgs: + nimg = img.crop((x_offset, y_offset, x_offset + self.target_size, + y_offset + self.target_size)) + frames_crop.append(nimg) + results['imgs'] = frames_crop + return results + + +@PIPELINES.register() +class PackOutput(object): + """ + In slowfast model, we want to get slow pathway from fast pathway based on + alpha factor. + Args: + alpha(int): temporal length of fast/slow + """ + def __init__(self, alpha): + self.alpha = alpha + + def __call__(self, results): + fast_pathway = results['imgs'] + + # sample num points between start and end + slow_idx_start = 0 + slow_idx_end = fast_pathway.shape[0] - 1 + slow_idx_num = fast_pathway.shape[0] // self.alpha + slow_idxs_select = np.linspace(slow_idx_start, slow_idx_end, + slow_idx_num).astype("int64") + slow_pathway = fast_pathway[slow_idxs_select] + + # T H W C -> C T H W. 
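+ # Move both pathways to channel-first layout for the SlowFast backbone; the slow pathway keeps only 1/alpha of the frames selected evenly above, while the fast pathway keeps them all.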
+ slow_pathway = slow_pathway.transpose(3, 0, 1, 2) + fast_pathway = fast_pathway.transpose(3, 0, 1, 2) + + # slow + fast + frames_list = [slow_pathway, fast_pathway] + results['imgs'] = frames_list + return results + + +@PIPELINES.register() +class GroupFullResSample(object): + def __init__(self, crop_size, flip=False): + self.crop_size = crop_size if not isinstance(crop_size, int) else ( + crop_size, crop_size) + self.flip = flip + + def __call__(self, results): + img_group = results['imgs'] + + image_w, image_h = img_group[0].size + crop_w, crop_h = self.crop_size + + w_step = (image_w - crop_w) // 4 + h_step = (image_h - crop_h) // 4 + + offsets = list() + offsets.append((0 * w_step, 2 * h_step)) # left + offsets.append((4 * w_step, 2 * h_step)) # right + offsets.append((2 * w_step, 2 * h_step)) # center + + oversample_group = list() + for o_w, o_h in offsets: + normal_group = list() + flip_group = list() + for i, img in enumerate(img_group): + crop = img.crop((o_w, o_h, o_w + crop_w, o_h + crop_h)) + normal_group.append(crop) + if self.flip: + flip_crop = crop.copy().transpose(Image.FLIP_LEFT_RIGHT) + flip_group.append(flip_crop) + + oversample_group.extend(normal_group) + if self.flip: + oversample_group.extend(flip_group) + + results['imgs'] = oversample_group + return results + + +@PIPELINES.register() +class TenCrop: + """ + Crop out 5 regions (4 corner points + 1 center point) from the picture, + and then flip the cropping result to get 10 cropped images, which can make the prediction result more robust. + Args: + target_size(int | tuple[int]): (w, h) of target size for crop. + """ + def __init__(self, target_size): + self.target_size = (target_size, target_size) + + def __call__(self, results): + imgs = results['imgs'] + img_w, img_h = imgs[0].size + crop_w, crop_h = self.target_size + w_step = (img_w - crop_w) // 4 + h_step = (img_h - crop_h) // 4 + offsets = [ + (0, 0), + (4 * w_step, 0), + (0, 4 * h_step), + (4 * w_step, 4 * h_step), + (2 * w_step, 2 * h_step), + ] + img_crops = list() + for x_offset, y_offset in offsets: + crop = [ + img.crop( + (x_offset, y_offset, x_offset + crop_w, y_offset + crop_h)) + for img in imgs + ] + crop_fliped = [ + timg.transpose(Image.FLIP_LEFT_RIGHT) for timg in crop + ] + img_crops.extend(crop) + img_crops.extend(crop_fliped) + + results['imgs'] = img_crops + return results + + +@PIPELINES.register() +class UniformCrop: + """ + Perform uniform spatial sampling on the images, + select the two ends of the long side and the middle position (left middle right or top middle bottom) 3 regions. + Args: + target_size(int | tuple[int]): (w, h) of target size for crop. 
+ """ + def __init__(self, target_size, backend='cv2'): + if isinstance(target_size, tuple): + self.target_size = target_size + elif isinstance(target_size, int): + self.target_size = (target_size, target_size) + else: + raise TypeError( + f'target_size must be int or tuple[int], but got {type(target_size)}' + ) + self.backend = backend + + def __call__(self, results): + + imgs = results['imgs'] + if 'backend' in results and results['backend'] == 'pyav': # [c,t,h,w] + img_h, img_w = imgs.shape[2:] + elif self.backend == 'pillow': + img_w, img_h = imgs[0].size + else: + img_h, img_w = imgs[0].shape[:2] + + crop_w, crop_h = self.target_size + if crop_h == img_h: + w_step = (img_w - crop_w) // 2 + offsets = [ + (0, 0), + (w_step * 2, 0), + (w_step, 0), + ] + elif crop_w == img_w: + h_step = (img_h - crop_h) // 2 + offsets = [ + (0, 0), + (0, h_step * 2), + (0, h_step), + ] + else: + raise ValueError( + f"img_w({img_w}) == crop_w({crop_w}) or img_h({img_h}) == crop_h({crop_h})" + ) + img_crops = [] + if 'backend' in results and results['backend'] == 'pyav': # [c,t,h,w] + for x_offset, y_offset in offsets: + crop = imgs[:, :, y_offset:y_offset + crop_h, + x_offset:x_offset + crop_w] + img_crops.append(crop) + img_crops = paddle.concat(img_crops, axis=1) + else: + if self.backend == 'pillow': + for x_offset, y_offset in offsets: + crop = [ + img.crop((x_offset, y_offset, x_offset + crop_w, + y_offset + crop_h)) for img in imgs + ] + img_crops.extend(crop) + else: + for x_offset, y_offset in offsets: + crop = [ + img[y_offset:y_offset + crop_h, + x_offset:x_offset + crop_w] for img in imgs + ] + img_crops.extend(crop) + results['imgs'] = img_crops + return results + + +@PIPELINES.register() +class GroupResize(object): + def __init__(self, height, width, scale, K, mode='train'): + self.height = height + self.width = width + self.scale = scale + self.resize = {} + self.K = np.array(K, dtype=np.float32) + self.mode = mode + for i in range(self.scale): + s = 2**i + self.resize[i] = paddle.vision.transforms.Resize( + (self.height // s, self.width // s), interpolation='lanczos') + + def __call__(self, results): + if self.mode == 'infer': + imgs = results['imgs'] + for k in list(imgs): # ("color", 0, -1) + if "color" in k or "color_n" in k: + n, im, _ = k + for i in range(self.scale): + imgs[(n, im, i)] = self.resize[i](imgs[(n, im, i - 1)]) + else: + imgs = results['imgs'] + for scale in range(self.scale): + K = self.K.copy() + + K[0, :] *= self.width // (2**scale) + K[1, :] *= self.height // (2**scale) + + inv_K = np.linalg.pinv(K) + imgs[("K", scale)] = K + imgs[("inv_K", scale)] = inv_K + + for k in list(imgs): + if "color" in k or "color_n" in k: + n, im, i = k + for i in range(self.scale): + imgs[(n, im, i)] = self.resize[i](imgs[(n, im, i - 1)]) + + results['imgs'] = imgs + return results + + +@PIPELINES.register() +class ColorJitter(object): + """Randomly change the brightness, contrast, saturation and hue of an image. + """ + def __init__(self, + brightness=0, + contrast=0, + saturation=0, + hue=0, + mode='train', + p=0.5, + keys=None): + self.mode = mode + self.colorjitter = paddle.vision.transforms.ColorJitter( + brightness, contrast, saturation, hue) + self.p = p + + def __call__(self, results): + """ + Args: + results (PIL Image): Input image. + + Returns: + PIL Image: Color jittered image. 
+ """ + + do_color_aug = random.random() > self.p + imgs = results['imgs'] + for k in list(imgs): + f = imgs[k] + if "color" in k or "color_n" in k: + n, im, i = k + imgs[(n, im, i)] = f + if do_color_aug: + imgs[(n + "_aug", im, i)] = self.colorjitter(f) + else: + imgs[(n + "_aug", im, i)] = f + if self.mode == "train": + for i in results['frame_idxs']: + del imgs[("color", i, -1)] + del imgs[("color_aug", i, -1)] + del imgs[("color_n", i, -1)] + del imgs[("color_n_aug", i, -1)] + else: + for i in results['frame_idxs']: + del imgs[("color", i, -1)] + del imgs[("color_aug", i, -1)] + + results['img'] = imgs + return results + + +@PIPELINES.register() +class GroupRandomFlip(object): + def __init__(self, p=0.5): + self.p = p + + def __call__(self, results): + + imgs = results['imgs'] + do_flip = random.random() > self.p + if do_flip: + for k in list(imgs): + if "color" in k or "color_n" in k: + n, im, i = k + imgs[(n, im, + i)] = imgs[(n, im, + i)].transpose(Image.FLIP_LEFT_RIGHT) + if "depth_gt" in imgs: + imgs['depth_gt'] = np.array(np.fliplr(imgs['depth_gt'])) + + results['imgs'] = imgs + return results + + +@PIPELINES.register() +class ToArray(object): + def __init__(self): + pass + + def __call__(self, results): + imgs = results['imgs'] + for k in list(imgs): + if "color" in k or "color_n" in k or "color_aug" in k or "color_n_aug" in k: + n, im, i = k + imgs[(n, im, + i)] = np.array(imgs[(n, im, i)]).astype('float32') / 255.0 + imgs[(n, im, i)] = imgs[(n, im, i)].transpose((2, 0, 1)) + if "depth_gt" in imgs: + imgs['depth_gt'] = np.array(imgs['depth_gt']).astype('float32') + + results['imgs'] = imgs + return results + + +@PIPELINES.register() +class YowoAug(object): + def __init__(self, target_size=224, jitter=0.2, hue=0.1, saturation=1.5, exposure=1.5, valid_mode=False): + self.shape = (target_size, target_size) + self.jitter = jitter + self.hue = hue + self.saturation = saturation + self.exposure = exposure + self.valid_mode = valid_mode + + def _rand_scale(self, s): + scale = random.uniform(1, s) + if (random.randint(1, 10000) % 2): + return scale + return 1. 
/ scale + + def _distort_image(self, im, hue, sat, val): + im = im.convert('HSV') + cs = list(im.split()) + cs[1] = cs[1].point(lambda i: i * sat) + cs[2] = cs[2].point(lambda i: i * val) + + def _change_hue(x): + x += hue * 255 + if x > 255: + x -= 255 + if x < 0: + x += 255 + return x + + cs[0] = cs[0].point(_change_hue) + im = Image.merge(im.mode, tuple(cs)) + + im = im.convert('RGB') + # constrain_image(im) + return im + + def _random_distort_image(self, im, dhue, dsat, dexp): + res = self._distort_image(im, dhue, dsat, dexp) + return res + + def _read_truths_args(self, lab_path, min_box_scale): + truths = np.loadtxt(lab_path) + truths = np.reshape(truths, (truths.size // 5, 5)) + new_truths = [] + for i in range(truths.shape[0]): + cx = (truths[i][1] + truths[i][3]) / (2 * 320) + cy = (truths[i][2] + truths[i][4]) / (2 * 240) + imgw = (truths[i][3] - truths[i][1]) / 320 + imgh = (truths[i][4] - truths[i][2]) / 240 + truths[i][0] = truths[i][0] - 1 + truths[i][1] = cx + truths[i][2] = cy + truths[i][3] = imgw + truths[i][4] = imgh + + if truths[i][3] < min_box_scale: + continue + new_truths.append([truths[i][0], truths[i][1], truths[i][2], truths[i][3], truths[i][4]]) + return np.array(new_truths) + + def _fill_truth_detection(self, labpath, flip, dx, dy, sx, sy): + max_boxes = 50 + label = np.zeros((max_boxes, 5)) + bs = np.loadtxt(labpath) + bs = np.reshape(bs, (-1, 5)) + + for i in range(bs.shape[0]): + cx = (bs[i][1] + bs[i][3]) / (2 * 320) + cy = (bs[i][2] + bs[i][4]) / (2 * 240) + imgw = (bs[i][3] - bs[i][1]) / 320 + imgh = (bs[i][4] - bs[i][2]) / 240 + bs[i][0] = bs[i][0] - 1 + bs[i][1] = cx + bs[i][2] = cy + bs[i][3] = imgw + bs[i][4] = imgh + + cc = 0 + for i in range(bs.shape[0]): + x1 = bs[i][1] - bs[i][3] / 2 + y1 = bs[i][2] - bs[i][4] / 2 + x2 = bs[i][1] + bs[i][3] / 2 + y2 = bs[i][2] + bs[i][4] / 2 + + x1 = min(0.999, max(0, x1 * sx - dx)) + y1 = min(0.999, max(0, y1 * sy - dy)) + x2 = min(0.999, max(0, x2 * sx - dx)) + y2 = min(0.999, max(0, y2 * sy - dy)) + + bs[i][1] = (x1 + x2) / 2 + bs[i][2] = (y1 + y2) / 2 + bs[i][3] = (x2 - x1) + bs[i][4] = (y2 - y1) + + if flip: + bs[i][1] = 0.999 - bs[i][1] + + if bs[i][3] < 0.001 or bs[i][4] < 0.001: + continue + label[cc] = bs[i] + cc += 1 + if cc >= 50: + break + + label = np.reshape(label, (-1)) + return label + + def __call__(self, results): + clip = results['imgs'] + frame_num = len(clip) + oh = clip[0].height + ow = clip[0].width + labpath = results['filename'].replace('jpg', 'txt').replace('rgb-images', 'labels') + if not self.valid_mode: + dw = int(ow * self.jitter) + dh = int(oh * self.jitter) + + pleft = random.randint(-dw, dw) + pright = random.randint(-dw, dw) + ptop = random.randint(-dh, dh) + pbot = random.randint(-dh, dh) + + swidth = ow - pleft - pright + sheight = oh - ptop - pbot + + sx = float(swidth) / ow + sy = float(sheight) / oh + + dx = (float(pleft) / ow) / sx + dy = (float(ptop) / oh) / sy + + flip = random.randint(1, 10000) % 2 + + dhue = random.uniform(-self.hue, self.hue) + dsat = self._rand_scale(self.saturation) + dexp = self._rand_scale(self.exposure) + + # Augment + cropped = [img.crop((pleft, ptop, pleft + swidth - 1, ptop + sheight - 1)) for img in clip] + + sized = [img.resize(self.shape) for img in cropped] + + if flip: + sized = [img.transpose(Image.FLIP_LEFT_RIGHT) for img in sized] + + clip = [self._random_distort_image(img, dhue, dsat, dexp) for img in sized] + + label = self._fill_truth_detection(labpath, flip, dx, dy, 1. / sx, 1. 
/ sy) + + else: + label = np.zeros([50 * 5]) + tmp = self._read_truths_args(labpath, 8.0 / clip[0].width).astype('float32') + tmp = np.reshape(tmp, [-1]) + tsz = tmp.size + if tsz > 50 * 5: + label = tmp[0:50 * 5] + elif tsz > 0: + label[0:tsz] = tmp + clip = [img.resize(self.shape) for img in clip] + + clip = [np.asarray(img).astype('float32') / 255.0 for img in clip] + clip = np.concatenate(clip, 0).reshape([frame_num, 224, 224, 3]) + clip = np.transpose(clip, [3, 0, 1, 2]) + results['imgs'] = clip + results['labels'] = label + return results diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/augmentations_ava.py b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/augmentations_ava.py new file mode 100644 index 0000000..4f0c43d --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/augmentations_ava.py @@ -0,0 +1,749 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import random +import numpy as np +import math +from PIL import Image +from ..registry import PIPELINES +from collections.abc import Sequence +import cv2 + +pillow_interp_codes = { + 'nearest': Image.NEAREST, + 'bilinear': Image.BILINEAR, + 'bicubic': Image.BICUBIC, + 'box': Image.BOX, + 'lanczos': Image.LANCZOS, + 'hamming': Image.HAMMING +} + +cv2_interp_codes = { + 'nearest': cv2.INTER_NEAREST, + 'bilinear': cv2.INTER_LINEAR, + 'bicubic': cv2.INTER_CUBIC, + 'area': cv2.INTER_AREA, + 'lanczos': cv2.INTER_LANCZOS4 +} + + +def _init_lazy_if_proper(results, lazy): + """Initialize lazy operation properly. + + Make sure that a lazy operation is properly initialized, + and avoid a non-lazy operation accidentally getting mixed in. + + Required keys in results are "imgs" if "img_shape" not in results, + otherwise, Required keys in results are "img_shape", add or modified keys + are "img_shape", "lazy". + Add or modified keys in "lazy" are "original_shape", "crop_bbox", "flip", + "flip_direction", "interpolation". + + Args: + results (dict): A dict stores data pipeline result. + lazy (bool): Determine whether to apply lazy operation. Default: False. + """ + + if 'img_shape' not in results: + results['img_shape'] = results['imgs'][0].shape[:2] + if lazy: + if 'lazy' not in results: + img_h, img_w = results['img_shape'] + lazyop = dict() + lazyop['original_shape'] = results['img_shape'] + lazyop['crop_bbox'] = np.array([0, 0, img_w, img_h], + dtype=np.float32) + lazyop['flip'] = False + lazyop['flip_direction'] = None + lazyop['interpolation'] = None + results['lazy'] = lazyop + else: + assert 'lazy' not in results, 'Use Fuse after lazy operations' + + +def _scale_size(size, scale): + """Rescale a size by a ratio. + + Args: + size (tuple[int]): (w, h). + scale (float): Scaling factor. + + Returns: + tuple[int]: scaled size. + """ + w, h = size + return int(w * float(scale) + 0.5), int(h * float(scale) + 0.5) + + +def rescale_size(old_size, scale, return_scale=False): + """Calculate the new size to be rescaled to. 
+ + Args: + old_size (tuple[int]): The old size (w, h) of image. + scale (float | tuple[int]): The scaling factor or maximum size. + If it is a float number, then the image will be rescaled by this + factor, else if it is a tuple of 2 integers, then the image will + be rescaled as large as possible within the scale. + return_scale (bool): Whether to return the scaling factor besides the + rescaled image size. + + Returns: + tuple[int]: The new rescaled image size. + """ + w, h = old_size + if isinstance(scale, (float, int)): + if scale <= 0: + raise ValueError(f'Invalid scale {scale}, must be positive.') + scale_factor = scale + elif isinstance(scale, tuple): + max_long_edge = max(scale) + max_short_edge = min(scale) + scale_factor = min(max_long_edge / max(h, w), + max_short_edge / min(h, w)) + else: + raise TypeError( + f'Scale must be a number or tuple of int, but got {type(scale)}') + + new_size = _scale_size((w, h), scale_factor) + + if return_scale: + return new_size, scale_factor + else: + return new_size + + +def imresize(img, + size, + return_scale=False, + interpolation='bilinear', + out=None, + backend=None): + """Resize image to a given size. """ + h, w = img.shape[:2] + if backend is None: + backend = 'cv2' + if backend not in ['cv2', 'pillow']: + raise ValueError(f'backend: {backend} is not supported for resize.' + f"Supported backends are 'cv2', 'pillow'") + + if backend == 'pillow': + assert img.dtype == np.uint8, 'Pillow backend only support uint8 type' + pil_image = Image.fromarray(img) + pil_image = pil_image.resize(size, pillow_interp_codes[interpolation]) + resized_img = np.array(pil_image) + else: + resized_img = cv2.resize( + img, size, dst=out, interpolation=cv2_interp_codes[interpolation]) + if not return_scale: + return resized_img + else: + w_scale = size[0] / w + h_scale = size[1] / h + return resized_img, w_scale, h_scale + + +@PIPELINES.register() +class EntityBoxRescale: + """Rescale the entity box and proposals according to the image shape. + + Required keys are "proposals", "gt_bboxes", added or modified keys are + "gt_bboxes". If original "proposals" is not None, "proposals" and + will be added or modified. + + Args: + scale_factor (np.ndarray): The scale factor used entity_box rescaling. + """ + + def __init__(self, scale_factor): + self.scale_factor = scale_factor + + def __call__(self, results): + scale_factor = np.concatenate([self.scale_factor, self.scale_factor]) + + if 'gt_bboxes' in results: + gt_bboxes = results['gt_bboxes'] + results['gt_bboxes'] = gt_bboxes * scale_factor + + if 'proposals' in results: + proposals = results['proposals'] + if proposals is not None: + assert proposals.shape[1] == 4, ( + 'proposals shape should be in ' + f'(n, 4), but got {proposals.shape}') + results['proposals'] = proposals * scale_factor + + return results + + def __repr__(self): + return f'{self.__class__.__name__}(scale_factor={self.scale_factor})' + + +@PIPELINES.register() +class EntityBoxCrop: + """Crop the entity boxes and proposals according to the cropped images. + + Required keys are "proposals", "gt_bboxes", added or modified keys are + "gt_bboxes". If original "proposals" is not None, "proposals" will be + modified. + + Args: + crop_bbox(np.ndarray | None): The bbox used to crop the original image. 
+ """ + + def __init__(self, crop_bbox): + self.crop_bbox = crop_bbox + + def __call__(self, results): + proposals = results['proposals'] + gt_bboxes = results['gt_bboxes'] + + if self.crop_bbox is None: + return results + + x1, y1, x2, y2 = self.crop_bbox + img_w, img_h = x2 - x1, y2 - y1 + + assert gt_bboxes.shape[-1] == 4 + gt_bboxes_ = gt_bboxes.copy() + gt_bboxes_[..., 0::2] = np.clip(gt_bboxes[..., 0::2] - x1, 0, img_w - 1) + gt_bboxes_[..., 1::2] = np.clip(gt_bboxes[..., 1::2] - y1, 0, img_h - 1) + results['gt_bboxes'] = gt_bboxes_ + + if proposals is not None: + assert proposals.shape[-1] == 4 + proposals_ = proposals.copy() + proposals_[..., 0::2] = np.clip(proposals[..., 0::2] - x1, 0, + img_w - 1) + proposals_[..., 1::2] = np.clip(proposals[..., 1::2] - y1, 0, + img_h - 1) + results['proposals'] = proposals_ + return results + + def __repr__(self): + return f'{self.__class__.__name__}(crop_bbox={self.crop_bbox})' + + +@PIPELINES.register() +class EntityBoxFlip: + """Flip the entity boxes and proposals with a probability. + + Reverse the order of elements in the given bounding boxes and proposals + with a specific direction. The shape of them are preserved, but the + elements are reordered. Only the horizontal flip is supported (seems + vertical flipping makes no sense). Required keys are "proposals", + "gt_bboxes", added or modified keys are "gt_bboxes". If "proposals" + is not None, it will also be modified. + + Args: + img_shape (tuple[int]): The img shape. + """ + + def __init__(self, img_shape): + self.img_shape = img_shape + + def __call__(self, results): + proposals = results['proposals'] + gt_bboxes = results['gt_bboxes'] + img_h, img_w = self.img_shape + + assert gt_bboxes.shape[-1] == 4 + gt_bboxes_ = gt_bboxes.copy() + gt_bboxes_[..., 0::4] = img_w - gt_bboxes[..., 2::4] - 1 + gt_bboxes_[..., 2::4] = img_w - gt_bboxes[..., 0::4] - 1 + if proposals is not None: + assert proposals.shape[-1] == 4 + proposals_ = proposals.copy() + proposals_[..., 0::4] = img_w - proposals[..., 2::4] - 1 + proposals_[..., 2::4] = img_w - proposals[..., 0::4] - 1 + else: + proposals_ = None + + results['proposals'] = proposals_ + results['gt_bboxes'] = gt_bboxes_ + + return results + + def __repr__(self): + repr_str = f'{self.__class__.__name__}(img_shape={self.img_shape})' + return repr_str + + +@PIPELINES.register() +class Resize: + """Resize images to a specific size. + + Required keys are "imgs", "img_shape", "modality", added or modified + keys are "imgs", "img_shape", "keep_ratio", "scale_factor", "lazy", + "resize_size". Required keys in "lazy" is None, added or modified key is + "interpolation". + + Args: + scale (float | Tuple[int]): If keep_ratio is True, it serves as scaling + factor or maximum size: + If it is a float number, the image will be rescaled by this + factor, else if it is a tuple of 2 integers, the image will + be rescaled as large as possible within the scale. + Otherwise, it serves as (w, h) of output size. + keep_ratio (bool): If set to True, Images will be resized without + changing the aspect ratio. Otherwise, it will resize images to a + given size. Default: True. + interpolation (str): Algorithm used for interpolation: + "nearest" | "bilinear". Default: "bilinear". + lazy (bool): Determine whether to apply lazy operation. Default: False. 
+ """ + + def __init__(self, + scale, + keep_ratio=True, + interpolation='bilinear', + lazy=False): + if isinstance(scale, str): + scale = eval(scale) + if isinstance(scale, float): + if scale <= 0: + raise ValueError(f'Invalid scale {scale}, must be positive.') + elif isinstance(scale, tuple): + max_long_edge = max(scale) + max_short_edge = min(scale) + if max_short_edge == -1: + # assign np.inf to long edge for rescaling short edge later. + scale = (np.inf, max_long_edge) + else: + raise TypeError( + f'Scale must be float or tuple of int, but got {type(scale)}') + self.scale = scale + self.keep_ratio = keep_ratio + self.interpolation = interpolation + self.lazy = lazy + + def __call__(self, results): + """Performs the Resize augmentation. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + + _init_lazy_if_proper(results, self.lazy) + + if 'scale_factor' not in results: + results['scale_factor'] = np.array([1, 1], dtype=np.float32) + img_h, img_w = results['img_shape'] + + if self.keep_ratio: + new_w, new_h = rescale_size((img_w, img_h), self.scale) + else: + new_w, new_h = self.scale + + self.scale_factor = np.array([new_w / img_w, new_h / img_h], + dtype=np.float32) + results['img_shape'] = (new_h, new_w) + results['keep_ratio'] = self.keep_ratio + results['scale_factor'] = results['scale_factor'] * self.scale_factor + + if not self.lazy: + if 'imgs' in results: + results['imgs'] = [ + imresize( + img, (new_w, new_h), interpolation=self.interpolation) + for img in results['imgs'] + ] + if 'keypoint' in results: + results['keypoint'] = results['keypoint'] * self.scale_factor + else: + lazyop = results['lazy'] + if lazyop['flip']: + raise NotImplementedError('Put Flip at last for now') + lazyop['interpolation'] = self.interpolation + + #if 'gt_bboxes' in results: + assert not self.lazy + entity_box_rescale = EntityBoxRescale(self.scale_factor) + results = entity_box_rescale(results) + + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'scale={self.scale}, keep_ratio={self.keep_ratio}, ' + f'interpolation={self.interpolation}, ' + f'lazy={self.lazy})') + return repr_str + + +@PIPELINES.register() +class RandomRescale: + """Randomly resize images so that the short_edge is resized to a specific + size in a given range. The scale ratio is unchanged after resizing. + """ + + def __init__(self, scale_range, interpolation='bilinear'): + scale_range = eval(scale_range) + self.scale_range = scale_range + + assert len(scale_range) == 2 + assert scale_range[0] < scale_range[1] + assert np.all([x > 0 for x in scale_range]) + + self.keep_ratio = True + self.interpolation = interpolation + + def __call__(self, results): + """Performs the Resize augmentation. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + short_edge = np.random.randint(self.scale_range[0], + self.scale_range[1] + 1) + resize = Resize((-1, short_edge), + keep_ratio=True, + interpolation=self.interpolation, + lazy=False) + results = resize(results) + + results['short_edge'] = short_edge + return results + + def __repr__(self): + scale_range = self.scale_range + repr_str = (f'{self.__class__.__name__}(' + f'scale_range=({scale_range[0]}, {scale_range[1]}), ' + f'interpolation={self.interpolation})') + return repr_str + + +@PIPELINES.register() +class Rescale: + """resize images so that the short_edge is resized to a specific + size in a given range. 
The scale ratio is unchanged after resizing. + + Required keys are "imgs", "img_shape", "modality", added or modified + keys are "imgs", "img_shape", "keep_ratio", "scale_factor", "resize_size", + "short_edge". + + Args: + scale_range (tuple[int]): The range of short edge length. A closed + interval. + interpolation (str): Algorithm used for interpolation: + "nearest" | "bilinear". Default: "bilinear". + """ + + def __init__(self, scale_range, interpolation='bilinear'): + scale_range = eval(scale_range) + self.scale_range = scale_range + + self.keep_ratio = True + self.interpolation = interpolation + + def __call__(self, results): + """Performs the Resize augmentation. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + resize = Resize( + self.scale_range, + keep_ratio=True, + interpolation=self.interpolation, + lazy=False) + results = resize(results) + return results + + def __repr__(self): + scale_range = self.scale_range + repr_str = (f'{self.__class__.__name__}(' + f'scale_range=({scale_range[0]}, {scale_range[1]}), ' + f'interpolation={self.interpolation})') + return repr_str + + +@PIPELINES.register() +class RandomCrop_v2: + """Vanilla square random crop that specifics the output size. + + Required keys in results are "imgs" and "img_shape", added or + modified keys are "imgs", "lazy"; Required keys in "lazy" are "flip", + "crop_bbox", added or modified key is "crop_bbox". + + Args: + size (int): The output size of the images. + lazy (bool): Determine whether to apply lazy operation. Default: False. + """ + + def __init__(self, size, lazy=False): + if not isinstance(size, int): + raise TypeError(f'Size must be an int, but got {type(size)}') + self.size = size + self.lazy = lazy + + def __call__(self, results): + """Performs the RandomCrop augmentation. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. 
+ """ + _init_lazy_if_proper(results, self.lazy) + + img_h, img_w = results['img_shape'] + assert self.size <= img_h and self.size <= img_w + + y_offset = 0 + x_offset = 0 + if img_h > self.size: + y_offset = int(np.random.randint(0, img_h - self.size)) + if img_w > self.size: + x_offset = int(np.random.randint(0, img_w - self.size)) + if 'crop_quadruple' not in results: + results['crop_quadruple'] = np.array( + [0, 0, 1, 1], # x, y, w, h + dtype=np.float32) + + x_ratio, y_ratio = x_offset / img_w, y_offset / img_h + w_ratio, h_ratio = self.size / img_w, self.size / img_h + + old_crop_quadruple = results['crop_quadruple'] + old_x_ratio, old_y_ratio = old_crop_quadruple[0], old_crop_quadruple[1] + old_w_ratio, old_h_ratio = old_crop_quadruple[2], old_crop_quadruple[3] + new_crop_quadruple = [ + old_x_ratio + x_ratio * old_w_ratio, + old_y_ratio + y_ratio * old_h_ratio, w_ratio * old_w_ratio, + h_ratio * old_x_ratio + ] + results['crop_quadruple'] = np.array( + new_crop_quadruple, dtype=np.float32) + + new_h, new_w = self.size, self.size + + results['crop_bbox'] = np.array( + [x_offset, y_offset, x_offset + new_w, y_offset + new_h]) + results['img_shape'] = (new_h, new_w) + + if not self.lazy: + results['imgs'] = [ + img[y_offset:y_offset + new_h, x_offset:x_offset + new_w] + for img in results['imgs'] + ] + else: + lazyop = results['lazy'] + if lazyop['flip']: + raise NotImplementedError('Put Flip at last for now') + + # record crop_bbox in lazyop dict to ensure only crop once in Fuse + lazy_left, lazy_top, lazy_right, lazy_bottom = lazyop['crop_bbox'] + left = x_offset * (lazy_right - lazy_left) / img_w + right = (x_offset + new_w) * (lazy_right - lazy_left) / img_w + top = y_offset * (lazy_bottom - lazy_top) / img_h + bottom = (y_offset + new_h) * (lazy_bottom - lazy_top) / img_h + lazyop['crop_bbox'] = np.array( + [(lazy_left + left), (lazy_top + top), (lazy_left + right), + (lazy_top + bottom)], + dtype=np.float32) + + # Process entity boxes + if 'gt_bboxes' in results: + assert not self.lazy + entity_box_crop = EntityBoxCrop(results['crop_bbox']) + results = entity_box_crop(results) + + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(size={self.size}, ' + f'lazy={self.lazy})') + return repr_str + + +def imflip_(img, direction='horizontal'): + """Inplace flip an image horizontally or vertically. + + Args: + img (ndarray): Image to be flipped. + direction (str): The flip direction, either "horizontal" or + "vertical" or "diagonal". + + Returns: + ndarray: The flipped image (inplace). + """ + assert direction in ['horizontal', 'vertical', 'diagonal'] + if direction == 'horizontal': + return cv2.flip(img, 1, img) + elif direction == 'vertical': + return cv2.flip(img, 0, img) + else: + return cv2.flip(img, -1, img) + + +def iminvert(img): + """Invert (negate) an image. + + Args: + img (ndarray): Image to be inverted. + + Returns: + ndarray: The inverted image. + """ + return np.full_like(img, 255) - img + + +@PIPELINES.register() +class Flip: + """Flip the input images with a probability. + + Reverse the order of elements in the given imgs with a specific direction. + The shape of the imgs is preserved, but the elements are reordered. + Required keys are "imgs", "img_shape", "modality", added or modified + keys are "imgs", "lazy" and "flip_direction". Required keys in "lazy" is + None, added or modified key are "flip" and "flip_direction". 
The Flip + augmentation should be placed after any cropping / reshaping augmentations, + to make sure crop_quadruple is calculated properly. + + Args: + flip_ratio (float): Probability of implementing flip. Default: 0.5. + direction (str): Flip imgs horizontally or vertically. Options are + "horizontal" | "vertical". Default: "horizontal". + lazy (bool): Determine whether to apply lazy operation. Default: False. + """ + _directions = ['horizontal', 'vertical'] + + def __init__(self, flip_ratio=0.5, direction='horizontal', lazy=False): + if direction not in self._directions: + raise ValueError(f'Direction {direction} is not supported. ' + f'Currently support ones are {self._directions}') + self.flip_ratio = flip_ratio + self.direction = direction + self.lazy = lazy + + def __call__(self, results): + """Performs the Flip augmentation. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + _init_lazy_if_proper(results, self.lazy) + flip = np.random.rand() < self.flip_ratio + + results['flip'] = flip + results['flip_direction'] = self.direction + + if not self.lazy: + if flip: + for i, img in enumerate(results['imgs']): + imflip_(img, self.direction) + lt = len(results['imgs']) + else: + results['imgs'] = list(results['imgs']) + else: + lazyop = results['lazy'] + if lazyop['flip']: + raise NotImplementedError('Use one Flip please') + lazyop['flip'] = flip + lazyop['flip_direction'] = self.direction + + if 'gt_bboxes' in results and flip: + assert not self.lazy and self.direction == 'horizontal' + entity_box_flip = EntityBoxFlip(results['img_shape']) + results = entity_box_flip(results) + + return results + + def __repr__(self): + repr_str = ( + f'{self.__class__.__name__}(' + f'flip_ratio={self.flip_ratio}, direction={self.direction}, ' + f'lazy={self.lazy})') + return repr_str + + +def imnormalize_(img, mean, std, to_rgb=True): + """Inplace normalize an image with mean and std. + + Args: + img (ndarray): Image to be normalized. + mean (ndarray): The mean to be used for normalize. + std (ndarray): The std to be used for normalize. + to_rgb (bool): Whether to convert to rgb. + + Returns: + ndarray: The normalized image. + """ + # cv2 inplace normalization does not accept uint8 + assert img.dtype != np.uint8 + mean = np.float64(mean.reshape(1, -1)) + stdinv = 1 / np.float64(std.reshape(1, -1)) + if to_rgb: + cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img) # inplace + cv2.subtract(img, mean, img) # inplace + cv2.multiply(img, stdinv, img) # inplace + return img + + +@PIPELINES.register() +class Normalize: + """Normalize images with the given mean and std value. + + Required keys are "imgs", "img_shape", "modality", added or modified + keys are "imgs" and "img_norm_cfg". If modality is 'Flow', additional + keys "scale_factor" is required + + Args: + mean (Sequence[float]): Mean values of different channels. + std (Sequence[float]): Std values of different channels. + to_bgr (bool): Whether to convert channels from RGB to BGR. + Default: False. + adjust_magnitude (bool): Indicate whether to adjust the flow magnitude + on 'scale_factor' when modality is 'Flow'. Default: False. 
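        Example (editor's sketch; the mean/std values below are the common
        ImageNet statistics, used here only for illustration):
            >>> import numpy as np
            >>> norm = Normalize(mean=[123.675, 116.28, 103.53],
            ...                  std=[58.395, 57.12, 57.375])
            >>> out = norm({'imgs': [np.full((8, 8, 3), 128, dtype=np.uint8)]})
            >>> out['imgs'].shape   # frames stacked into a single float32 array
            (1, 8, 8, 3)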
+ """ + + def __init__(self, mean, std, to_bgr=False, adjust_magnitude=False): + if not isinstance(mean, Sequence): + raise TypeError( + f'Mean must be list, tuple or np.ndarray, but got {type(mean)}') + + if not isinstance(std, Sequence): + raise TypeError( + f'Std must be list, tuple or np.ndarray, but got {type(std)}') + + self.mean = np.array(mean, dtype=np.float32) + self.std = np.array(std, dtype=np.float32) + self.to_bgr = to_bgr + self.adjust_magnitude = adjust_magnitude + + def __call__(self, results): + n = len(results['imgs']) + h, w, c = results['imgs'][0].shape + imgs = np.empty((n, h, w, c), dtype=np.float32) + for i, img in enumerate(results['imgs']): + imgs[i] = img + + for img in imgs: + imnormalize_(img, self.mean, self.std, self.to_bgr) + + results['imgs'] = imgs + results['img_norm_cfg'] = dict( + mean=self.mean, std=self.std, to_bgr=self.to_bgr) + + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'mean={self.mean}, ' + f'std={self.std}, ' + f'to_bgr={self.to_bgr}, ' + f'adjust_magnitude={self.adjust_magnitude})') + return repr_str diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/compose.py b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/compose.py new file mode 100644 index 0000000..76eb4ed --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/compose.py @@ -0,0 +1,76 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections.abc import Sequence +from ..registry import PIPELINES +import traceback +from ...utils import build +from ...utils import get_logger + + +@PIPELINES.register() +class Compose(object): + """ + Composes several pipelines(include decode func, sample func, and transforms) together. + + Note: To deal with ```list``` type cfg temporaray, like: + + transform: + - Crop: # A list + attribute: 10 + - Resize: # A list + attribute: 20 + + every key of list will pass as the key name to build a module. + XXX: will be improved in the future. + + Args: + pipelines (list): List of transforms to compose. + Returns: + A compose object which is callable, __call__ for this Compose + object will call each given :attr:`transforms` sequencely. + """ + def __init__(self, pipelines): + #assert isinstance(pipelines, Sequence) + self.pipelines = [] + for p in pipelines.values(): + if isinstance(p, dict): + p = build(p, PIPELINES) + self.pipelines.append(p) + elif isinstance(p, list): + for t in p: + #XXX: to deal with old format cfg, ugly code here! 
+ temp_dict = dict(name=list(t.keys())[0]) + for all_sub_t in t.values(): + if all_sub_t is not None: + temp_dict.update(all_sub_t) + + t = build(temp_dict, PIPELINES) + self.pipelines.append(t) + elif callable(p): + self.pipelines.append(p) + else: + raise TypeError(f'pipelines must be callable or a dict,' + f'but got {type(p)}') + def __call__(self, data): + for p in self.pipelines: + try: + data = p(data) + except Exception as e: + stack_info = traceback.format_exc() + logger = get_logger("paddlevideo") + logger.info("fail to perform transform [{}] with error: " + "{} and stack:\n{}".format(p, e, str(stack_info))) + raise e + return data diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/decode.py b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/decode.py new file mode 100644 index 0000000..2611272 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/decode.py @@ -0,0 +1,348 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +try: + import av +except ImportError as e: + print( + f"Warning! {e}, [av] package and it's dependencies is required for TimeSformer and other models." + ) +import cv2 +import pickle +import decord as de +import math +import random +from ..registry import PIPELINES + + +def get_start_end_idx(video_size, clip_size, clip_idx, num_clips): + delta = max(video_size - clip_size, 0) + if clip_idx == -1: # here + # Random temporal sampling. + start_idx = random.uniform(0, delta) + else: # ignore + # Uniformly sample the clip with the given index. + start_idx = delta * clip_idx / num_clips + end_idx = start_idx + clip_size - 1 + return start_idx, end_idx + + +@PIPELINES.register() +class VideoDecoder(object): + """ + Decode mp4 file to frames. + Args: + filepath: the file path of mp4 file + """ + def __init__(self, + backend='cv2', + mode='train', + sampling_rate=32, + num_seg=8, + num_clips=1, + target_fps=30): + + self.backend = backend + # params below only for TimeSformer + self.mode = mode + self.sampling_rate = sampling_rate + self.num_seg = num_seg + self.num_clips = num_clips + self.target_fps = target_fps + + def __call__(self, results): + """ + Perform mp4 decode operations. + return: + List where each item is a numpy array after decoder. 
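        Example (editor's sketch; 'example.mp4' is a placeholder path):
            >>> decoder = VideoDecoder(backend='cv2')
            >>> out = decoder({'filename': 'example.mp4'})
            >>> out['format'], len(out['frames']) == out['frames_len']
            ('video', True)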
+ """ + file_path = results['filename'] + results['format'] = 'video' + results['backend'] = self.backend + + if self.backend == 'cv2': + cap = cv2.VideoCapture(file_path) + videolen = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + sampledFrames = [] + for i in range(videolen): + ret, frame = cap.read() + # maybe first frame is empty + if ret == False: + continue + img = frame[:, :, ::-1] + sampledFrames.append(img) + results['frames'] = sampledFrames + results['frames_len'] = len(sampledFrames) + + elif self.backend == 'decord': + container = de.VideoReader(file_path) + frames_len = len(container) + results['frames'] = container + results['frames_len'] = frames_len + + elif self.backend == 'pyav': # for TimeSformer + if self.mode in ["train", "valid"]: + clip_idx = -1 + elif self.mode in ["test"]: + clip_idx = 0 + else: + raise NotImplementedError + + container = av.open(file_path) + + num_clips = 1 # always be 1 + + # decode process + fps = float(container.streams.video[0].average_rate) + + frames_length = container.streams.video[0].frames + duration = container.streams.video[0].duration + + if duration is None: + # If failed to fetch the decoding information, decode the entire video. + decode_all_video = True + video_start_pts, video_end_pts = 0, math.inf + else: + decode_all_video = False + start_idx, end_idx = get_start_end_idx( + frames_length, + self.sampling_rate * self.num_seg / self.target_fps * fps, + clip_idx, num_clips) + timebase = duration / frames_length + video_start_pts = int(start_idx * timebase) + video_end_pts = int(end_idx * timebase) + + frames = None + # If video stream was found, fetch video frames from the video. + if container.streams.video: + margin = 1024 + seek_offset = max(video_start_pts - margin, 0) + + container.seek(seek_offset, + any_frame=False, + backward=True, + stream=container.streams.video[0]) + tmp_frames = {} + buffer_count = 0 + max_pts = 0 + for frame in container.decode(**{"video": 0}): + max_pts = max(max_pts, frame.pts) + if frame.pts < video_start_pts: + continue + if frame.pts <= video_end_pts: + tmp_frames[frame.pts] = frame + else: + buffer_count += 1 + tmp_frames[frame.pts] = frame + if buffer_count >= 0: + break + video_frames = [tmp_frames[pts] for pts in sorted(tmp_frames)] + + container.close() + + frames = [frame.to_rgb().to_ndarray() for frame in video_frames] + clip_sz = self.sampling_rate * self.num_seg / self.target_fps * fps + + start_idx, end_idx = get_start_end_idx( + len(frames), # frame_len + clip_sz, + clip_idx if decode_all_video else + 0, # If decode all video, -1 in train and valid, 0 in test; + # else, always 0 in train, valid and test, as we has selected clip size frames when decode. 
+ 1) + results['frames'] = frames + results['frames_len'] = len(frames) + results['start_idx'] = start_idx + results['end_idx'] = end_idx + else: + raise NotImplementedError + # pass + return results + + +@PIPELINES.register() +class FrameDecoder(object): + """just parse results + """ + def __init__(self): + pass + + def __call__(self, results): + results['format'] = 'frame' + return results + + +@PIPELINES.register() +class MRIDecoder(object): + """just parse results + """ + def __init__(self): + pass + + def __call__(self, results): + results['format'] = 'MRI' + return results + + +@PIPELINES.register() +class FeatureDecoder(object): + """ + Perform feature decode operations.e.g.youtube8m + """ + def __init__(self, num_classes, max_len=512, has_label=True): + self.max_len = max_len + self.num_classes = num_classes + self.has_label = has_label + + def __call__(self, results): + """ + Perform feature decode operations. + return: + List where each item is a numpy array after decoder. + """ + #1. load pkl + #2. parse to rgb/audio/ + #3. padding + + filepath = results['filename'] + data = pickle.load(open(filepath, 'rb'), encoding='bytes') + + record = data + nframes = record['nframes'] if 'nframes' in record else record[ + b'nframes'] + rgb = record['feature'].astype( + float) if 'feature' in record else record[b'feature'].astype(float) + audio = record['audio'].astype( + float) if 'audio' in record else record[b'audio'].astype(float) + if self.has_label: + label = record['label'] if 'label' in record else record[b'label'] + one_hot_label = self.make_one_hot(label, self.num_classes) + + rgb = rgb[0:nframes, :] + audio = audio[0:nframes, :] + + rgb = self.dequantize(rgb, + max_quantized_value=2., + min_quantized_value=-2.) + audio = self.dequantize(audio, + max_quantized_value=2, + min_quantized_value=-2) + + if self.has_label: + results['labels'] = one_hot_label.astype("float32") + + feat_pad_list = [] + feat_len_list = [] + mask_list = [] + vitem = [rgb, audio] + for vi in range(2): #rgb and audio + if vi == 0: + prefix = "rgb_" + else: + prefix = "audio_" + feat = vitem[vi] + results[prefix + 'len'] = feat.shape[0] + #feat pad step 1. padding + feat_add = np.zeros((self.max_len - feat.shape[0], feat.shape[1]), + dtype=np.float32) + feat_pad = np.concatenate((feat, feat_add), axis=0) + results[prefix + 'data'] = feat_pad.astype("float32") + #feat pad step 2. 
mask + feat_mask_origin = np.ones(feat.shape, dtype=np.float32) + feat_mask_add = feat_add + feat_mask = np.concatenate((feat_mask_origin, feat_mask_add), + axis=0) + results[prefix + 'mask'] = feat_mask.astype("float32") + + return results + + def dequantize(self, + feat_vector, + max_quantized_value=2., + min_quantized_value=-2.): + """ + Dequantize the feature from the byte format to the float format + """ + + assert max_quantized_value > min_quantized_value + quantized_range = max_quantized_value - min_quantized_value + scalar = quantized_range / 255.0 + bias = (quantized_range / 512.0) + min_quantized_value + + return feat_vector * scalar + bias + + def make_one_hot(self, label, dim=3862): + one_hot_label = np.zeros(dim) + one_hot_label = one_hot_label.astype(float) + for ind in label: + one_hot_label[int(ind)] = 1 + return one_hot_label + + +@PIPELINES.register() +class ActionFeatureDecoder(object): + """ + Perform feature decode operations on footballaction + """ + def __init__(self, num_classes, max_len=512, has_label=True): + self.max_len = max_len + self.num_classes = num_classes + self.has_label = has_label + + def __call__(self, results): + """ + Perform feature decode operations. + return: + List where each item is a numpy array after decoder. + """ + #1. load pkl + #2. parse to rgb/audio/ + #3. padding + + filepath = results['filename'] + data = pickle.load(open(filepath, 'rb'), encoding='bytes') + + pkl_data = data + rgb = pkl_data['image_feature'].astype(float) + audio = pkl_data['audio_feature'].astype(float) + label_id_info = pkl_data['label_info'] + label_cls = [label_id_info['label']] + label_one = int(label_cls[0]) + if len(label_cls) > 1: + label_index = random.randint(0, 1) + label_one = int(label_cls[label_index]) + iou_norm = float(label_id_info['norm_iou']) + results['labels'] = np.array([label_one]) + results['iou_norm'] = float(iou_norm) + + vitem = [rgb, audio] + for vi in range(2): #rgb and audio + if vi == 0: + prefix = "rgb_" + else: + prefix = "audio_" + feat = vitem[vi] + results[prefix + 'len'] = feat.shape[0] + #feat pad step 1. padding + feat_add = np.zeros((self.max_len - feat.shape[0], feat.shape[1]), + dtype=np.float32) + feat_pad = np.concatenate((feat, feat_add), axis=0) + results[prefix + 'data'] = feat_pad.astype("float32") + #feat pad step 2. mask + feat_mask_origin = np.ones(feat.shape, dtype=np.float32) + feat_mask = np.concatenate((feat_mask_origin, feat_add), axis=0) + results[prefix + 'mask'] = feat_mask.astype("float32") + + return results diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/decode_image.py b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/decode_image.py new file mode 100644 index 0000000..64a7e2f --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/decode_image.py @@ -0,0 +1,206 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
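Editor's aside on the FeatureDecoder.dequantize helper added above in decode.py:
it maps uint8-quantized YouTube-8M style features back to floats. A minimal
standalone sketch of the same arithmetic, using the default range [-2, 2]:

    import numpy as np

    def dequantize(feat, max_q=2., min_q=-2.):
        # same formula as FeatureDecoder.dequantize above
        scalar = (max_q - min_q) / 255.0          # 4/255 per quantization step
        bias = (max_q - min_q) / 512.0 + min_q    # half-step offset: -1.9921875
        return feat * scalar + bias

    print(dequantize(np.array([0., 128., 255.])))  # approx. [-1.992, 0.016, 2.008]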
+ +import os + +import numpy as np +import PIL.Image as pil + +try: + import skimage.transform +except ImportError as e: + print( + f"Warning! {e}, [scikit-image] package and it's dependencies is required for ADDS." + ) +from PIL import Image + +from ..registry import PIPELINES + + +@PIPELINES.register() +class ImageDecoder(object): + """Decode Image + """ + def __init__(self, + dataset, + frame_idxs, + num_scales, + side_map, + full_res_shape, + img_ext, + backend='cv2'): + self.backend = backend + self.dataset = dataset + self.frame_idxs = frame_idxs + self.num_scales = num_scales + self.side_map = side_map + self.full_res_shape = full_res_shape + self.img_ext = img_ext + + def _pil_loader(self, path): + with open(path, 'rb') as f: + with Image.open(f) as img: + return img.convert('RGB') + + def get_color(self, folder, frame_index, side): + color = self._pil_loader( + self.get_image_path(self.dataset, folder, frame_index, side)) + return color + + def get_image_path(self, dataset, folder, frame_index, side): + if dataset == "kitti": + f_str = "{:010d}{}".format(frame_index, self.img_ext) + image_path = os.path.join(self.data_path, folder, f_str) + elif dataset == "kitti_odom": + f_str = "{:06d}{}".format(frame_index, self.img_ext) + image_path = os.path.join(self.data_path, + "sequences/{:02d}".format(int(folder)), + "image_{}".format(self.side_map[side]), + f_str) + elif dataset == "kitti_depth": + f_str = "{:010d}{}".format(frame_index, self.img_ext) + image_path = os.path.join( + self.data_path, folder, + "image_0{}/data".format(self.side_map[side]), f_str) + + return image_path + + def get_depth(self, dataset, folder, frame_index, side): + if dataset == "kitii_depth": + f_str = "{:010d}.png".format(frame_index) + depth_path = os.path.join( + self.data_path, folder, + "proj_depth/groundtruth/image_0{}".format(self.side_map[side]), + f_str) + + depth_gt = pil.open(depth_path) + depth_gt = depth_gt.resize(self.full_res_shape, pil.NEAREST) + depth_gt = np.array(depth_gt).astype(np.float32) / 256 + + else: + f_str = "{:010d}{}".format(frame_index, self.img_ext) + depth_path = os.path.join(self.data_path, folder + '_gt', f_str) + + img_file = Image.open(depth_path) + depth_png = np.array(img_file, dtype=int) + img_file.close() + # make sure we have a proper 16bit depth map here.. not 8bit! + assert np.max(depth_png) > 255, \ + "np.max(depth_png)={}, path={}".format(np.max(depth_png), depth_path) + + depth_gt = depth_png.astype(np.float) / 256. + + depth_gt = depth_gt[160:960 - 160, :] + + depth_gt = skimage.transform.resize(depth_gt, + self.full_res_shape[::-1], + order=0, + preserve_range=True, + mode='constant') + + return depth_gt + + def __call__(self, results): + """ + Perform mp4 decode operations. + return: + List where each item is a numpy array after decoder. 
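        Example (editor's note): for dataset='kitti' with img_ext='.png',
        frame_index=96 resolves via get_image_path to
        '<data_path>/<folder>/0000000096.png' (the index is zero-padded to
        10 digits).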
+ """ + if results.get('mode', None) == 'infer': + imgs = {} + imgs[("color", 0, + -1)] = Image.open(results["filename"]).convert("RGB") + results['imgs'] = imgs + return results + + self.data_path = results['data_path'] + results['backend'] = self.backend + + imgs = {} + + results['frame_idxs'] = self.frame_idxs + results['num_scales'] = self.num_scales + + file_name = results['filename'] + folder = results['folder'] + frame_index = results['frame_index'] + line = file_name.split('/') + istrain = folder.split('_')[1] + if 'mode' not in results: + results['mode'] = istrain + results['day_or_night'] = folder.split('_')[0] + + if istrain == "train": + if folder[0] == 'd': + folder2 = folder + '_fake_night' + flag = 0 + else: + folder2 = folder + '_fake_day' + tmp = folder + folder = folder2 + folder2 = tmp + flag = 1 + + if len(line) == 3: + side = line[2] + else: + side = None + + results['side'] = side + + for i in self.frame_idxs: + + if i == "s": + other_side = {"r": "l", "l": "r"}[side] + imgs[("color", i, + -1)] = self.get_color(folder, frame_index, other_side) + imgs[("color_n", i, + -1)] = self.get_color(folder2, frame_index, + other_side) + else: + imgs[("color", i, + -1)] = self.get_color(folder, frame_index + i, side) + imgs[("color_n", i, + -1)] = self.get_color(folder2, frame_index + i, side) + + istrain = folder.split('_')[1] + if istrain != 'train': + if flag: + depth_gt = self.get_depth(folder2, frame_index, side) + else: + depth_gt = self.get_depth(folder, frame_index, side) + imgs["depth_gt"] = np.expand_dims(depth_gt, 0) + elif istrain == 'val': + if len(line) == 3: + side = line[2] + else: + side = None + + for i in self.frame_idxs: + if i == "s": + other_side = {"r": "l", "l": "r"}[side] + imgs[("color", i, + -1)] = self.get_color(folder, frame_index, other_side) + else: + + imgs[("color", i, + -1)] = self.get_color(folder, frame_index + i, side) + + # adjusting intrinsics to match each scale in the pyramid + + depth_gt = self.get_depth(self.dataset, folder, frame_index, side) + imgs["depth_gt"] = np.expand_dims(depth_gt, 0) + results['imgs'] = imgs + + return results diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/decode_sampler.py b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/decode_sampler.py new file mode 100644 index 0000000..12a8f76 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/decode_sampler.py @@ -0,0 +1,93 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import random +import numpy as np +from PIL import Image +import decord as de +from ..registry import PIPELINES + + +@PIPELINES.register() +class DecodeSampler(object): + """ + We use 'decord' for decode and sampling, which is faster than opencv. + This is used in slowfast model. + Args: + num_frames(int): the number of frames we want to sample. + sampling_rate(int): sampling rate for video data. 
+ target_fps(int): desired fps, default 30 + test_mode(bool): whether test or train/valid. In slowfast, we use multicrop when test. + """ + def __init__(self, + num_frames, + sampling_rate, + default_sampling_rate=2, + target_fps=30, + test_mode=False): + self.num_frames = num_frames + self.orig_sampling_rate = self.sampling_rate = sampling_rate + self.default_sampling_rate = default_sampling_rate + self.target_fps = target_fps + self.test_mode = test_mode + + def get_start_end_idx(self, video_size, clip_size, clip_idx, + temporal_num_clips): + delta = max(video_size - clip_size, 0) + if not self.test_mode: + # Random temporal sampling. + start_idx = random.uniform(0, delta) + else: + # Uniformly sample the clip with the given index. + start_idx = delta * clip_idx / temporal_num_clips + end_idx = start_idx + clip_size - 1 + return start_idx, end_idx + + def __call__(self, results): + """ + Perform mp4 decode operations. + return: + List where each item is a numpy array after decoder. + """ + short_cycle_idx = results.get('short_cycle_idx') + if short_cycle_idx: + self.sampling_rate = random.randint(self.default_sampling_rate, + self.orig_sampling_rate) + + filepath = results['filename'] + temporal_sample_index = results['temporal_sample_index'] + temporal_num_clips = results['temporal_num_clips'] + + vr = de.VideoReader(filepath) + videolen = len(vr) + + # fps = vr.get_avg_fps() + clip_size = self.num_frames * self.sampling_rate * fps / self.target_fps + + start_idx, end_idx = self.get_start_end_idx(videolen, clip_size, + temporal_sample_index, + temporal_num_clips) + index = np.linspace(start_idx, end_idx, self.num_frames).astype("int64") + index = np.clip(index, 0, videolen) + + frames_select = vr.get_batch(index) #1 for buffer + + # dearray_to_img + np_frames = frames_select.asnumpy() + frames_select_list = [] + for i in range(np_frames.shape[0]): + imgbuf = np_frames[i] + frames_select_list.append(Image.fromarray(imgbuf, mode='RGB')) + results['imgs'] = frames_select_list + return results diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/decode_sampler_MRI.py b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/decode_sampler_MRI.py new file mode 100644 index 0000000..08d1dd0 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/decode_sampler_MRI.py @@ -0,0 +1,224 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import random + +import numpy as np +from PIL import Image +try: + import SimpleITK as sitk +except ImportError as e: + print( + f"Warning! {e}, [SimpleITK] package and it's dependencies is required for PP-Care." + ) +import cv2 + +from ..registry import PIPELINES + + +@PIPELINES.register() +class SFMRI_DecodeSampler(object): + """ + Sample frames id. + NOTE: Use PIL to read image here, has diff with CV2 + Args: + num_seg(int): number of segments. + seg_len(int): number of sampled frames in each segment. + valid_mode(bool): True or False. 
+ select_left: Whether to select the frame to the left in the middle when the sampling interval is even in the test mode. + Returns: + frames_idx: the index of sampled #frames. + """ + def __init__(self, + num_seg, + seg_len, + valid_mode=False, + select_left=False, + dense_sample=False, + linspace_sample=False): + self.num_seg = num_seg + self.seg_len = seg_len + self.valid_mode = valid_mode + self.select_left = select_left + self.dense_sample = dense_sample + self.linspace_sample = linspace_sample + + def _get(self, frames_idx_s, frames_idx_f, results): + + frame_dir = results['frame_dir'] + imgs_s = [] + imgs_f = [] + MRI = sitk.GetArrayFromImage(sitk.ReadImage(frame_dir)) + for idx in frames_idx_s: + item = MRI[idx] + item = cv2.resize(item, (224, 224)) + imgs_s.append(item) + + for idx in frames_idx_f: + item = MRI[idx] + item = cv2.resize(item, (224, 224)) + imgs_f.append(item) + + results['imgs'] = [imgs_s, imgs_f] + return results + + def __call__(self, results): + """ + Args: + frames_len: length of frames. + return: + sampling id. + """ + frames_len = int(results['frames_len']) + average_dur1 = int(frames_len / self.num_seg[0]) + average_dur2 = int(frames_len / self.num_seg[1]) + frames_idx_s = [] + frames_idx_f = [] + if self.linspace_sample: + if 'start_idx' in results and 'end_idx' in results: + offsets_s = np.linspace(results['start_idx'], + results['end_idx'], self.num_seg[0]) + offsets_f = np.linspace(results['start_idx'], + results['end_idx'], self.num_seg[1]) + else: + offsets_s = np.linspace(0, frames_len - 1, self.num_seg[0]) + offsets_f = np.linspace(0, frames_len - 1, self.num_seg[1]) + offsets_s = np.clip(offsets_s, 0, frames_len - 1).astype(np.int64) + offsets_f = np.clip(offsets_f, 0, frames_len - 1).astype(np.int64) + + frames_idx_s = list(offsets_s) + frames_idx_f = list(offsets_f) + + return self._get(frames_idx_s, frames_idx_f, results) + + if not self.select_left: + if self.dense_sample: # For ppTSM + if not self.valid_mode: # train + sample_pos = max(1, 1 + frames_len - 64) + t_stride1 = 64 // self.num_seg[0] + t_stride2 = 64 // self.num_seg[1] + start_idx = 0 if sample_pos == 1 else np.random.randint( + 0, sample_pos - 1) + offsets_s = [(idx * t_stride1 + start_idx) % frames_len + 1 + for idx in range(self.num_seg[0])] + offsets_f = [(idx * t_stride2 + start_idx) % frames_len + 1 + for idx in range(self.num_seg[1])] + frames_idx_s = offsets_s + frames_idx_f = offsets_f + else: + sample_pos = max(1, 1 + frames_len - 64) + t_stride1 = 64 // self.num_seg[0] + t_stride2 = 64 // self.num_seg[1] + start_list = np.linspace(0, + sample_pos - 1, + num=10, + dtype=int) + offsets_s = [] + offsets_f = [] + for start_idx in start_list.tolist(): + offsets_s += [ + (idx * t_stride1 + start_idx) % frames_len + 1 + for idx in range(self.num_seg[0]) + ] + for start_idx in start_list.tolist(): + offsets_f += [ + (idx * t_stride2 + start_idx) % frames_len + 1 + for idx in range(self.num_seg[1]) + ] + frames_idx_s = offsets_s + frames_idx_f = offsets_f + else: + for i in range(self.num_seg[0]): + idx = 0 + if not self.valid_mode: + if average_dur1 >= self.seg_len: + idx = random.randint(0, average_dur1 - self.seg_len) + idx += i * average_dur1 + elif average_dur1 >= 1: + idx += i * average_dur1 + else: + idx = i + else: + if average_dur1 >= self.seg_len: + idx = (average_dur1 - 1) // 2 + idx += i * average_dur1 + elif average_dur1 >= 1: + idx += i * average_dur1 + else: + idx = i + for jj in range(idx, idx + self.seg_len): + frames_idx_s.append(jj) + + for i in 
range(self.num_seg[1]): + idx = 0 + if not self.valid_mode: + if average_dur2 >= self.seg_len: + idx = random.randint(0, average_dur2 - self.seg_len) + idx += i * average_dur2 + elif average_dur2 >= 1: + idx += i * average_dur2 + else: + idx = i + else: + if average_dur2 >= self.seg_len: + idx = (average_dur2 - 1) // 2 + idx += i * average_dur2 + elif average_dur2 >= 1: + idx += i * average_dur2 + else: + idx = i + for jj in range(idx, idx + self.seg_len): + frames_idx_f.append(jj) + + return self._get(frames_idx_s, frames_idx_f, results) + + else: # for TSM + if not self.valid_mode: + if average_dur2 > 0: + offsets_s = np.multiply(list(range( + self.num_seg[0])), average_dur1) + np.random.randint( + average_dur1, size=self.num_seg[0]) + + offsets_f = np.multiply(list(range( + self.num_seg[1])), average_dur2) + np.random.randint( + average_dur2, size=self.num_seg[1]) + elif frames_len > self.num_seg[1]: + offsets_s = np.sort( + np.random.randint(frames_len, size=self.num_seg[0])) + offsets_f = np.sort( + np.random.randint(frames_len, size=self.num_seg[1])) + else: + offsets_s = np.zeros(shape=(self.num_seg[0], )) + offsets_f = np.zeros(shape=(self.num_seg[1], )) + else: + if frames_len > self.num_seg[1]: + average_dur_float_s = frames_len / self.num_seg[0] + offsets_s = np.array([ + int(average_dur_float_s / 2.0 + average_dur_float_s * x) + for x in range(self.num_seg[0]) + ]) + average_dur_float_f = frames_len / self.num_seg[1] + offsets_f = np.array([ + int(average_dur_float_f / 2.0 + average_dur_float_f * x) + for x in range(self.num_seg[1]) + ]) + else: + offsets_s = np.zeros(shape=(self.num_seg[0], )) + offsets_f = np.zeros(shape=(self.num_seg[1], )) + + frames_idx_s = list(offsets_s) + frames_idx_f = list(offsets_f) + + return self._get(frames_idx_s, frames_idx_f, results) diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/mix.py b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/mix.py new file mode 100644 index 0000000..ccc5f98 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/mix.py @@ -0,0 +1,116 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from ..registry import PIPELINES + + +@PIPELINES.register() +class Mixup(object): + """ + Mixup operator. + Args: + alpha(float): alpha value. + """ + def __init__(self, alpha=0.2): + assert alpha > 0., \ + 'parameter alpha[%f] should > 0.0' % (alpha) + self.alpha = alpha + + def __call__(self, batch): + imgs, labels = list(zip(*batch)) + imgs = np.array(imgs) + labels = np.array(labels) + bs = len(batch) + idx = np.random.permutation(bs) + lam = np.random.beta(self.alpha, self.alpha) + lams = np.array([lam] * bs, dtype=np.float32) + imgs = lam * imgs + (1 - lam) * imgs[idx] + return list(zip(imgs, labels, labels[idx], lams)) + + +@PIPELINES.register() +class Cutmix(object): + """ Cutmix operator + Args: + alpha(float): alpha value. 
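    Example (editor's sketch; note that rand_bbox uses np.int, so this runs as
    written only with NumPy < 1.24):
        >>> import numpy as np
        >>> batch = [(np.random.rand(3, 32, 32).astype('float32'), 1),
        ...          (np.random.rand(3, 32, 32).astype('float32'), 0)]
        >>> mixed = Cutmix(alpha=1.0)(batch)
        >>> len(mixed), len(mixed[0])   # each item is (img, label_a, label_b, lam)
        (2, 4)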
+ """ + def __init__(self, alpha=0.2): + assert alpha > 0., \ + 'parameter alpha[%f] should > 0.0' % (alpha) + self.alpha = alpha + + def rand_bbox(self, size, lam): + """ rand_bbox """ + w = size[2] + h = size[3] + cut_rat = np.sqrt(1. - lam) + cut_w = np.int(w * cut_rat) + cut_h = np.int(h * cut_rat) + + # uniform + cx = np.random.randint(w) + cy = np.random.randint(h) + + bbx1 = np.clip(cx - cut_w // 2, 0, w) + bby1 = np.clip(cy - cut_h // 2, 0, h) + bbx2 = np.clip(cx + cut_w // 2, 0, w) + bby2 = np.clip(cy + cut_h // 2, 0, h) + + return bbx1, bby1, bbx2, bby2 + + def __call__(self, batch): + imgs, labels = list(zip(*batch)) + imgs = np.array(imgs) + labels = np.array(labels) + + bs = len(batch) + idx = np.random.permutation(bs) + lam = np.random.beta(self.alpha, self.alpha) + + bbx1, bby1, bbx2, bby2 = self.rand_bbox(imgs.shape, lam) + imgs[:, :, bbx1:bbx2, bby1:bby2] = imgs[idx, :, bbx1:bbx2, bby1:bby2] + lam = 1 - (float(bbx2 - bbx1) * (bby2 - bby1) / + (imgs.shape[-2] * imgs.shape[-1])) + lams = np.array([lam] * bs, dtype=np.float32) + + return list(zip(imgs, labels, labels[idx], lams)) + + +@PIPELINES.register() +class VideoMix(object): + """ + VideoMix operator. + Args: + cutmix_prob(float): prob choose cutmix + mixup_alpha(float): alpha for mixup aug + cutmix_alpha(float): alpha for cutmix aug + """ + def __init__(self, cutmix_prob=0.5, mixup_alpha=0.2, cutmix_alpha=1.0): + assert cutmix_prob > 0., \ + 'parameter cutmix_prob[%f] should > 0.0' % (cutmix_prob) + assert mixup_alpha > 0., \ + 'parameter mixup_alpha[%f] should > 0.0' % (mixup_alpha) + assert cutmix_alpha > 0., \ + 'parameter cutmix_alpha[%f] should > 0.0' % (cutmix_alpha) + self.cutmix_prob = cutmix_prob + self.mixup = Mixup(mixup_alpha) + self.cutmix = Cutmix(cutmix_alpha) + + def __call__(self, batch): + if np.random.random() < self.cutmix_prob: + return self.cutmix(batch) + else: + return self.mixup(batch) diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/multimodal.py b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/multimodal.py new file mode 100644 index 0000000..39ced5d --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/multimodal.py @@ -0,0 +1,380 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import random +import numpy as np +from PIL import Image +# import decord as de +import copy +import json +from ..registry import PIPELINES + +try: + from paddlenlp.transformers import BertTokenizer +except ImportError as e: + print( + f"Warning! {e}, [paddlenlp] package and it's dependencies is required for ActBERT." + ) + + +@PIPELINES.register() +class FeaturePadding(object): + """ + Padding feature to target shape. + """ + def __init__(self, max_region_num=36, max_action_num=5): + self.max_region_num = max_region_num + self.max_action_num = max_action_num + + def __call__(self, results): + """ + Padding feature. 
+ """ + pack_feature = results['feature'] + tokenizer = results['tokenizer'] + image_feature_wp, image_target_wp, image_location_wp, \ + num_boxes, image_h, image_w, image_id, caption, \ + action_feature_wp, action_target_wp, num_actions = pack_feature + + image_feature = np.zeros((self.max_region_num, 2048), dtype=np.float32) + image_target = np.zeros((self.max_region_num, 1601), dtype=np.float32) + image_location = np.zeros((self.max_region_num, 5), dtype=np.float32) + + action_feature = np.zeros((self.max_action_num, 2048), dtype=np.float32) + action_target = np.zeros((self.max_action_num, ), dtype=np.int64) + + num_boxes = int(num_boxes) + image_feature[:num_boxes] = image_feature_wp + image_target[:num_boxes] = image_target_wp + image_location[:num_boxes, :4] = image_location_wp + + image_location[:, 4] = (image_location[:, 3] - image_location[:, 1]) * ( + image_location[:, 2] - image_location[:, 0]) / (float(image_w) * + float(image_h)) + + image_location[:, 0] = image_location[:, 0] / float(image_w) + image_location[:, 1] = image_location[:, 1] / float(image_h) + image_location[:, 2] = image_location[:, 2] / float(image_w) + image_location[:, 3] = image_location[:, 3] / float(image_h) + + image_feature = copy.deepcopy(image_feature) + image_target = copy.deepcopy(image_target) + + num_actions = int(num_actions) + action_feature[:num_actions] = action_feature_wp + action_target[:num_actions] = action_target_wp + action_feature = copy.deepcopy(action_feature) + action_target = copy.deepcopy(action_target) + + results = dict(image_feat=image_feature, + image_target=image_target, + caption=caption, + image_loc=image_location, + num_boxes=int(num_boxes), + action_feat=action_feature, + action_target=action_target, + num_actions=int(num_actions), + tokenizer=tokenizer) + return results + + +@PIPELINES.register() +class RandomCap(object): + def __init__(self, caption_path): + """ + Random Caption for NSP task + """ + self.caption_path = caption_path + + def select_caption(self, caption): + captions = caption.split('!') + rind = random.randint(0, len(captions) - 1) + caption = captions[rind] + return caption + + def get_random_caption(self, all_captions): + num_caps = len(all_captions) + rand_doc_idx = random.randint(0, num_caps - 1) + caption = all_captions[rand_doc_idx] + caption = self.select_caption(caption) + return caption + + def random_cap(self, caption, all_captions): + if random.random() > 0.5: + label = 0 + else: + caption = self.get_random_caption(all_captions) + label = 1 + return caption, label + + def __call__(self, results): + caption = results['caption'] + all_captions = list(json.load(open(self.caption_path, 'r'))) + caption = self.select_caption(caption) + caption, label = self.random_cap(caption, all_captions) + results['caption'] = caption + results['is_next'] = label + return results + + +@PIPELINES.register() +class Tokenize(object): + def __init__(self, ): + """ + Tokenize caption + """ + pass + + def __call__(self, results): + caption = results['caption'] + tokenizer = results['tokenizer'] + tokens_caption = tokenizer.tokenize(caption) + results['caption'] = tokens_caption + return results + + +@PIPELINES.register() +class RandomMask(object): + def __init__(self, + max_seq_length=36, + max_action_length=5, + max_region_length=36): + self.max_seq_length = max_seq_length + self.max_action_length = max_action_length + self.max_region_length = max_region_length + + def get_image_global_feature(self, image_feat, image_loc, image_mask): + g_image_feat = np.sum(image_feat, 
axis=0) / np.sum( + image_mask, axis=0, keepdims=True) + image_feat = np.concatenate( + [np.expand_dims(g_image_feat, axis=0), image_feat], + axis=0).astype("float32") + + g_image_loc = np.array([0, 0, 1, 1, 1]).astype("float32") + image_loc = np.concatenate( + [np.expand_dims(g_image_loc, axis=0), image_loc], axis=0) + + g_image_mask = np.array([1]) + image_mask = np.concatenate([g_image_mask, image_mask], axis=0) + + return image_feat, image_loc, image_mask + + def _truncate_seq_pair(self, tokens_b, max_length): + """Truncates a sequence pair in place to the maximum length. + This is a simple heuristic which will always truncate the longer sequence + one token at a time. This makes more sense than truncating an equal percent + of tokens from each, since if one sequence is very short then each token + that's truncated likely contains more information than a longer sequence. + """ + while True: + total_length = len(tokens_b) + if total_length <= max_length: + break + tokens_b.pop() + + def random_word(self, tokens, tokenizer): + """ + Masking some random tokens for Language Model task with probabilities as in the original BERT paper. + Args: + tokens: list of str, tokenized sentence. + tokenizer: Tokenizer, object used for tokenization (we need it's vocab here) + Return: + (list of str, list of int), masked tokens and related labels for LM prediction + """ + output_label = [] + + for i, token in enumerate(tokens): + prob = random.random() + # mask token with 15% probability + + if prob < 0.15: + prob /= 0.15 + + # 80% randomly change token to mask token + if prob < 0.8: + tokens[i] = "[MASK]" + + # 10% randomly change token to random token + elif prob < 0.9: + #tok = random.choice(list(tokenizer.vocab.items()))[0] + tok = tokenizer.vocab.idx_to_token[random.randint( + 0, + tokenizer.vocab_size, + )] + tokens[i] = tok + + # rest 10% randomly keep current token + # append current token to output (we will predict these later) + try: + output_label.append(tokenizer.vocab[token]) + except KeyError: + # For unknown words (should not occur with BPE vocab) + output_label.append(tokenizer.vocab["[UNK]"]) + print( + "Cannot find token '{}' in vocab. Using [UNK] insetad". 
+ format(token)) + else: + # no masking token (will be ignored by loss function later) + output_label.append(-1) + + return tokens, output_label + + def random_region(self, image_feat, image_loc, num_boxes): + output_label = [] + + for i in range(num_boxes): + prob = random.random() + # mask token with 15% probability + if prob < 0.15: + prob /= 0.15 + + # 80% randomly change token to mask token + if prob < 0.9: + image_feat[i] = 0 + + # rest 20% randomly keep current token + # append current token to output (we will predict these later) + output_label.append(1) + else: + # no masking token (will be ignored by loss function later) + output_label.append(-1) + + return image_feat, image_loc, output_label + + def random_action(self, action_feat, action_target, num_actions): + output_label = [] + + for i in range(num_actions): + prob = random.random() + # mask token with 15% probability + if prob < 0.15: + prob /= 0.15 + + # 90% randomly change token to mask token + if prob < 0.9: + action_feat[i] = 0 + + # rest 10% randomly keep current token + # append current token to output (we will predict these later) + output_label.append(action_target[i]) + else: + # no masking token (will be ignored by loss function later) + output_label.append(-1) + + return action_feat, output_label + + def __call__(self, results): + caption = results['caption'] + tokenizer = results['tokenizer'] + image_feat = results['image_feat'] + image_loc = results['image_loc'] + num_boxes = results['num_boxes'] + action_feat = results['action_feat'] + action_target = results['action_target'] + num_actions = results['num_actions'] + is_next = results['is_next'] + image_target = results['image_target'] + + self._truncate_seq_pair(caption, self.max_seq_length - 2) + caption, caption_label = self.random_word(caption, tokenizer) + + image_feat, image_loc, image_label = self.random_region( + image_feat, image_loc, num_boxes) + action_feat, action_label = self.random_action(action_feat, + action_target, + num_actions) + + # concatenate lm labels and account for CLS, SEP, SEP + lm_label_ids = [-1] + caption_label + [-1] + + # The convention in BERT is: + # (a) For sequence pairs: + # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] + # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 + # (b) For single sequences: + # tokens: [CLS] the dog is hairy . [SEP] + # type_ids: 0 0 0 0 0 0 0 + # + # Where "type_ids" are used to indicate whether this is the first + # sequence or the second sequence. The embedding vectors for `type=0` and + # `type=1` were learned during pre-training and are added to the wordpiece + # embedding vector (and position vector). This is not *strictly* necessary + # since the [SEP] token unambigiously separates the sequences, but it makes + # it easier for the model to learn the concept of sequences. + # + # For classification tasks, the first vector (corresponding to [CLS]) is + # used as as the "sentence vector". Note that this only makes sense because + # the entire model is fine-tuned. + + tokens = [] + segment_ids = [] + + tokens.append("[CLS]") + segment_ids.append(0) + + for token in caption: + tokens.append(token) + segment_ids.append(0) + tokens.append("[SEP]") + segment_ids.append(0) + input_ids = tokenizer.convert_tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real tokens are attended to. + input_mask = [1] * (len(input_ids)) + image_mask = [1] * (num_boxes) + action_mask = [1] * (num_actions) + + # Zero-pad up to the visual sequence length. 
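        # Editor's note: the loops below pad image_mask/image_label up to
        # max_region_length and action_mask/action_label up to
        # max_action_length; padded positions carry mask 0 and label -1, so
        # they are ignored by the masked-prediction losses.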
+ while len(image_mask) < self.max_region_length: + image_mask.append(0) + image_label.append(-1) + while len(action_mask) < self.max_action_length: + action_mask.append(0) + action_label.append(-1) + + # Zero-pad up to the sequence length. + while len(input_ids) < self.max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + lm_label_ids.append(-1) + + assert len(input_ids) == self.max_seq_length + assert len(input_mask) == self.max_seq_length + assert len(segment_ids) == self.max_seq_length + assert len(lm_label_ids) == self.max_seq_length + assert len(image_mask) == self.max_region_length + assert len(image_label) == self.max_region_length + assert len(action_mask) == self.max_action_length + assert len(action_label) == self.max_action_length + + image_feat, image_loc, image_mask = self.get_image_global_feature( + image_feat, image_loc, np.array(image_mask)) + features = [ + np.array(input_ids), + action_feat, + image_feat, + image_loc, + np.array(segment_ids), + np.array(input_mask), + image_mask, + np.array(action_mask), + np.array(lm_label_ids), + np.array(action_label), + np.array(is_next), + np.array(image_label), + image_target, + ] + results['features'] = features + return results diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/sample.py b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/sample.py new file mode 100644 index 0000000..0a1d068 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/sample.py @@ -0,0 +1,382 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import random + +import numpy as np +from PIL import Image +try: + import SimpleITK as sitk +except ImportError as e: + print( + f"Warning! {e}, [SimpleITK] package and it's dependencies is required for PP-Care." + ) +import cv2 + +from ..registry import PIPELINES + +try: + import cPickle as pickle + from cStringIO import StringIO +except ImportError: + import pickle + from io import BytesIO + + +@PIPELINES.register() +class Sampler(object): + """ + Sample frames id. + NOTE: Use PIL to read image here, has diff with CV2 + Args: + num_seg(int): number of segments. + seg_len(int): number of sampled frames in each segment. + valid_mode(bool): True or False. + select_left: Whether to select the frame to the left in the middle when the sampling interval is even in the test mode. + Returns: + frames_idx: the index of sampled #frames. 
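        Example (editor's illustration of the default uniform sampling): with
        num_seg=8, seg_len=1, valid_mode=True and a 100-frame video, each of
        the 8 equal segments contributes its centre frame, i.e. 0-based
        indices 5, 17, 29, 41, 53, 65, 77, 89 (shifted by +1 for the 'frame'
        format).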
+ """ + def __init__(self, + num_seg, + seg_len, + frame_interval=None, + valid_mode=False, + select_left=False, + dense_sample=False, + linspace_sample=False, + use_pil=True): + self.num_seg = num_seg + self.seg_len = seg_len + self.frame_interval = frame_interval + self.valid_mode = valid_mode + self.select_left = select_left + self.dense_sample = dense_sample + self.linspace_sample = linspace_sample + self.use_pil = use_pil + + def _get(self, frames_idx, results): + data_format = results['format'] + + if data_format == "frame": + frame_dir = results['frame_dir'] + imgs = [] + for idx in frames_idx: + img = Image.open( + os.path.join(frame_dir, + results['suffix'].format(idx))).convert('RGB') + imgs.append(img) + + elif data_format == "MRI": + frame_dir = results['frame_dir'] + imgs = [] + MRI = sitk.GetArrayFromImage(sitk.ReadImage(frame_dir)) + for idx in frames_idx: + item = MRI[idx] + item = cv2.resize(item, (224, 224)) + imgs.append(item) + + elif data_format == "video": + if results['backend'] == 'cv2': + frames = np.array(results['frames']) + imgs = [] + for idx in frames_idx: + imgbuf = frames[idx] + img = Image.fromarray(imgbuf, mode='RGB') + imgs.append(img) + elif results['backend'] == 'decord': + container = results['frames'] + if self.use_pil: + frames_select = container.get_batch(frames_idx) + # dearray_to_img + np_frames = frames_select.asnumpy() + imgs = [] + for i in range(np_frames.shape[0]): + imgbuf = np_frames[i] + imgs.append(Image.fromarray(imgbuf, mode='RGB')) + else: + if frames_idx.ndim != 1: + frames_idx = np.squeeze(frames_idx) + frame_dict = { + idx: container[idx].asnumpy() + for idx in np.unique(frames_idx) + } + imgs = [frame_dict[idx] for idx in frames_idx] + elif results['backend'] == 'pyav': + imgs = [] + frames = np.array(results['frames']) + for idx in frames_idx: + if self.dense_sample: + idx = idx - 1 + imgbuf = frames[idx] + imgs.append(imgbuf) + imgs = np.stack(imgs) # thwc + else: + raise NotImplementedError + else: + raise NotImplementedError + results['imgs'] = imgs + return results + + def _get_train_clips(self, num_frames): + ori_seg_len = self.seg_len * self.frame_interval + avg_interval = (num_frames - ori_seg_len + 1) // self.num_seg + + if avg_interval > 0: + base_offsets = np.arange(self.num_seg) * avg_interval + clip_offsets = base_offsets + np.random.randint(avg_interval, + size=self.num_seg) + elif num_frames > max(self.num_seg, ori_seg_len): + clip_offsets = np.sort( + np.random.randint(num_frames - ori_seg_len + 1, + size=self.num_seg)) + elif avg_interval == 0: + ratio = (num_frames - ori_seg_len + 1.0) / self.num_seg + clip_offsets = np.around(np.arange(self.num_seg) * ratio) + else: + clip_offsets = np.zeros((self.num_seg, ), dtype=np.int) + return clip_offsets + + def _get_test_clips(self, num_frames): + ori_seg_len = self.seg_len * self.frame_interval + avg_interval = (num_frames - ori_seg_len + 1) / float(self.num_seg) + if num_frames > ori_seg_len - 1: + base_offsets = np.arange(self.num_seg) * avg_interval + clip_offsets = (base_offsets + avg_interval / 2.0).astype(np.int) + else: + clip_offsets = np.zeros((self.num_seg, ), dtype=np.int) + return clip_offsets + + def __call__(self, results): + """ + Args: + frames_len: length of frames. + return: + sampling id. 
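+
+        Note (added for clarity): the branches below are mutually exclusive
+        and are tried in this order: frame_interval-based sampling,
+        linspace_sample, dense_sample / uniform segment sampling
+        (select_left=False), then the TSM-style branch (select_left=True);
+        each branch returns through self._get().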
+ """ + frames_len = int(results['frames_len']) + frames_idx = [] + if self.frame_interval is not None: + assert isinstance(self.frame_interval, int) + if not self.valid_mode: + offsets = self._get_train_clips(frames_len) + else: + offsets = self._get_test_clips(frames_len) + + offsets = offsets[:, None] + np.arange( + self.seg_len)[None, :] * self.frame_interval + offsets = np.concatenate(offsets) + + offsets = offsets.reshape((-1, self.seg_len)) + offsets = np.mod(offsets, frames_len) + offsets = np.concatenate(offsets) + + if results['format'] == 'video': + frames_idx = offsets + elif results['format'] == 'frame': + frames_idx = list(offsets + 1) + else: + raise NotImplementedError + + return self._get(frames_idx, results) + + if self.linspace_sample: + if 'start_idx' in results and 'end_idx' in results: + offsets = np.linspace(results['start_idx'], results['end_idx'], + self.num_seg) + else: + offsets = np.linspace(0, frames_len - 1, self.num_seg) + offsets = np.clip(offsets, 0, frames_len - 1).astype(np.int64) + if results['format'] == 'video': + frames_idx = list(offsets) + frames_idx = [x % frames_len for x in frames_idx] + elif results['format'] == 'frame': + frames_idx = list(offsets + 1) + + elif results['format'] == 'MRI': + frames_idx = list(offsets) + + else: + raise NotImplementedError + return self._get(frames_idx, results) + + average_dur = int(frames_len / self.num_seg) + if not self.select_left: + if self.dense_sample: # For ppTSM + if not self.valid_mode: # train + sample_pos = max(1, 1 + frames_len - 64) + t_stride = 64 // self.num_seg + start_idx = 0 if sample_pos == 1 else np.random.randint( + 0, sample_pos - 1) + offsets = [(idx * t_stride + start_idx) % frames_len + 1 + for idx in range(self.num_seg)] + frames_idx = offsets + else: + sample_pos = max(1, 1 + frames_len - 64) + t_stride = 64 // self.num_seg + start_list = np.linspace(0, + sample_pos - 1, + num=10, + dtype=int) + offsets = [] + for start_idx in start_list.tolist(): + offsets += [ + (idx * t_stride + start_idx) % frames_len + 1 + for idx in range(self.num_seg) + ] + frames_idx = offsets + else: + for i in range(self.num_seg): + idx = 0 + if not self.valid_mode: + if average_dur >= self.seg_len: + idx = random.randint(0, average_dur - self.seg_len) + idx += i * average_dur + elif average_dur >= 1: + idx += i * average_dur + else: + idx = i + else: + if average_dur >= self.seg_len: + idx = (average_dur - 1) // 2 + idx += i * average_dur + elif average_dur >= 1: + idx += i * average_dur + else: + idx = i + for jj in range(idx, idx + self.seg_len): + if results['format'] == 'video': + frames_idx.append(int(jj % frames_len)) + elif results['format'] == 'frame': + frames_idx.append(jj + 1) + + elif results['format'] == 'MRI': + frames_idx.append(jj) + else: + raise NotImplementedError + return self._get(frames_idx, results) + + else: # for TSM + if not self.valid_mode: + if average_dur > 0: + offsets = np.multiply(list(range(self.num_seg)), + average_dur) + np.random.randint( + average_dur, size=self.num_seg) + elif frames_len > self.num_seg: + offsets = np.sort( + np.random.randint(frames_len, size=self.num_seg)) + else: + offsets = np.zeros(shape=(self.num_seg, )) + else: + if frames_len > self.num_seg: + average_dur_float = frames_len / self.num_seg + offsets = np.array([ + int(average_dur_float / 2.0 + average_dur_float * x) + for x in range(self.num_seg) + ]) + else: + offsets = np.zeros(shape=(self.num_seg, )) + + if results['format'] == 'video': + frames_idx = list(offsets) + frames_idx = [x % 
frames_len for x in frames_idx] + elif results['format'] == 'frame': + frames_idx = list(offsets + 1) + + elif results['format'] == 'MRI': + frames_idx = list(offsets) + + else: + raise NotImplementedError + + return self._get(frames_idx, results) + + +@PIPELINES.register() +class SamplerPkl(object): + """ + Sample frames id. + NOTE: Use PIL to read image here, has diff with CV2 + Args: + num_seg(int): number of segments. + seg_len(int): number of sampled frames in each segment. + mode(str): 'train', 'valid' + Returns: + frames_idx: the index of sampled #frames. + """ + def __init__(self, num_seg, seg_len, backend='pillow', valid_mode=False): + self.num_seg = num_seg + self.seg_len = seg_len + self.valid_mode = valid_mode + self.backend = backend + + def _get(self, buf): + if isinstance(buf, str): + img = Image.open(StringIO(buf)) + else: + img = Image.open(BytesIO(buf)) + img = img.convert('RGB') + if self.backend != 'pillow': + img = np.array(img) + return img + + def __call__(self, results): + """ + Args: + frames_len: length of frames. + return: + sampling id. + """ + filename = results['frame_dir'] + data_loaded = pickle.load(open(filename, 'rb'), encoding='bytes') + video_name, label, frames = data_loaded + if isinstance(label, dict): + label = label['动作类型'] + results['labels'] = label + elif len(label) == 1: + results['labels'] = int(label[0]) + else: + results['labels'] = int(label[0]) if random.random() < 0.5 else int( + label[1]) + results['frames_len'] = len(frames) + frames_len = results['frames_len'] + average_dur = int(int(frames_len) / self.num_seg) + imgs = [] + for i in range(self.num_seg): + idx = 0 + if not self.valid_mode: + if average_dur >= self.seg_len: + idx = random.randint(0, average_dur - self.seg_len) + idx += i * average_dur + elif average_dur >= 1: + idx += i * average_dur + else: + idx = i + else: + if average_dur >= self.seg_len: + idx = (average_dur - 1) // 2 + idx += i * average_dur + elif average_dur >= 1: + idx += i * average_dur + else: + idx = i + + for jj in range(idx, idx + self.seg_len): + imgbuf = frames[int(jj % results['frames_len'])] + img = self._get(imgbuf) + imgs.append(img) + results['backend'] = self.backend + results['imgs'] = imgs + + return results diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/sample_ava.py b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/sample_ava.py new file mode 100644 index 0000000..39e90a2 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/sample_ava.py @@ -0,0 +1,375 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
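+
+# Overview (added note): this file provides SampleFrames (generic clip-offset
+# sampling), a minimal FileClient / HardDiskBackend pair for reading raw
+# frame files from disk, RawFrameDecode (decode the sampled frames and
+# rescale gt_bboxes / proposals to pixel coordinates) and SampleAVAFrames
+# (sample a clip of indices centred on an AVA keyframe timestamp).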
+import random +from PIL import Image +from ..registry import PIPELINES +import os +import numpy as np +import io +import os.path as osp +from abc import ABCMeta, abstractmethod +import cv2 +from cv2 import IMREAD_COLOR, IMREAD_GRAYSCALE, IMREAD_UNCHANGED +import inspect + +imread_backend = 'cv2' +imread_flags = { + 'color': IMREAD_COLOR, + 'grayscale': IMREAD_GRAYSCALE, + 'unchanged': IMREAD_UNCHANGED +} + + +@PIPELINES.register() +class SampleFrames: + """Sample frames from the video. """ + + def __init__(self, + clip_len, + frame_interval=1, + num_clips=1, + temporal_jitter=False, + twice_sample=False, + out_of_bound_opt='loop', + test_mode=False): + self.clip_len = clip_len + self.frame_interval = frame_interval + self.num_clips = num_clips + self.temporal_jitter = temporal_jitter + self.twice_sample = twice_sample + self.out_of_bound_opt = out_of_bound_opt + self.test_mode = test_mode + assert self.out_of_bound_opt in ['loop', 'repeat_last'] + + def _get_train_clips(self, num_frames): + """Get clip offsets in train mode. """ + ori_clip_len = self.clip_len * self.frame_interval + avg_interval = (num_frames - ori_clip_len + 1) // self.num_clips + if avg_interval > 0: + base_offsets = np.arange(self.num_clips) * avg_interval + clip_offsets = base_offsets + np.random.randint( + avg_interval, size=self.num_clips) + elif num_frames > max(self.num_clips, ori_clip_len): + clip_offsets = np.sort( + np.random.randint( + num_frames - ori_clip_len + 1, size=self.num_clips)) + elif avg_interval == 0: + ratio = (num_frames - ori_clip_len + 1.0) / self.num_clips + clip_offsets = np.around(np.arange(self.num_clips) * ratio) + else: + clip_offsets = np.zeros((self.num_clips, ), dtype=np.int) + return clip_offsets + + def _get_test_clips(self, num_frames): + """Get clip offsets in test mode. """ + ori_clip_len = self.clip_len * self.frame_interval + avg_interval = (num_frames - ori_clip_len + 1) / float(self.num_clips) + if num_frames > ori_clip_len - 1: + base_offsets = np.arange(self.num_clips) * avg_interval + clip_offsets = (base_offsets + avg_interval / 2.0).astype(np.int) + if self.twice_sample: + clip_offsets = np.concatenate([clip_offsets, base_offsets]) + else: + clip_offsets = np.zeros((self.num_clips, ), dtype=np.int) + return clip_offsets + + def _sample_clips(self, num_frames): + """Choose clip offsets for the video in a given mode. """ + if self.test_mode: + clip_offsets = self._get_test_clips(num_frames) + else: + clip_offsets = self._get_train_clips(num_frames) + return clip_offsets + + def __call__(self, results): + """Perform the SampleFrames loading. 
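+
+        Illustrative example (assumed numbers): with clip_len=8,
+        frame_interval=2, num_clips=1 and total_frames=64 in training mode,
+        a single offset is drawn from [0, 49) and the returned frame_inds are
+        offset, offset + 2, ..., offset + 14, shifted by
+        results['start_index'].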
""" + total_frames = results['total_frames'] + clip_offsets = self._sample_clips(total_frames) + frame_inds = clip_offsets[:, None] + np.arange( + self.clip_len)[None, :] * self.frame_interval + frame_inds = np.concatenate(frame_inds) + if self.temporal_jitter: + perframe_offsets = np.random.randint( + self.frame_interval, size=len(frame_inds)) + frame_inds += perframe_offsets + frame_inds = frame_inds.reshape((-1, self.clip_len)) + if self.out_of_bound_opt == 'loop': + frame_inds = np.mod(frame_inds, total_frames) + elif self.out_of_bound_opt == 'repeat_last': + safe_inds = frame_inds < total_frames + unsafe_inds = 1 - safe_inds + last_ind = np.max(safe_inds * frame_inds, axis=1) + new_inds = (safe_inds * frame_inds + (unsafe_inds.T * last_ind).T) + frame_inds = new_inds + else: + raise ValueError('Illegal out_of_bound option.') + start_index = results['start_index'] + frame_inds = np.concatenate(frame_inds) + start_index + results['frame_inds'] = frame_inds.astype(np.int) + results['clip_len'] = self.clip_len + results['frame_interval'] = self.frame_interval + results['num_clips'] = self.num_clips + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'clip_len={self.clip_len}, ' + f'frame_interval={self.frame_interval}, ' + f'num_clips={self.num_clips}, ' + f'temporal_jitter={self.temporal_jitter}, ' + f'twice_sample={self.twice_sample}, ' + f'out_of_bound_opt={self.out_of_bound_opt}, ' + f'test_mode={self.test_mode})') + return repr_str + +class BaseStorageBackend(metaclass=ABCMeta): + """Abstract class of storage backends. """ + + @abstractmethod + def get(self, filepath): + pass + + @abstractmethod + def get_text(self, filepath): + pass + +class HardDiskBackend(BaseStorageBackend): + """Raw hard disks storage backend.""" + + def get(self, filepath): + filepath = str(filepath) + with open(filepath, 'rb') as f: + value_buf = f.read() + return value_buf + + def get_text(self, filepath): + filepath = str(filepath) + with open(filepath, 'r') as f: + value_buf = f.read() + return value_buf + +class FileClient: + """A general file client to access files in different backend. """ + + _backends = { + 'disk': HardDiskBackend, + } + + def __init__(self, backend='disk', **kwargs): + if backend not in self._backends: + raise ValueError( + f'Backend {backend} is not supported. Currently supported ones' + f' are {list(self._backends.keys())}') + self.backend = backend + self.client = self._backends[backend](**kwargs) + + @classmethod + def _register_backend(cls, name, backend, force=False): + if not isinstance(name, str): + raise TypeError('the backend name should be a string, ' + f'but got {type(name)}') + if not inspect.isclass(backend): + raise TypeError( + f'backend should be a class but got {type(backend)}') + if not issubclass(backend, BaseStorageBackend): + raise TypeError( + f'backend {backend} is not a subclass of BaseStorageBackend') + if not force and name in cls._backends: + raise KeyError( + f'{name} is already registered as a storage backend, ' + 'add "force=True" if you want to override it') + + cls._backends[name] = backend + + @classmethod + def register_backend(cls, name, backend=None, force=False): + """Register a backend to FileClient. 
""" + + if backend is not None: + cls._register_backend(name, backend, force=force) + return + + def _register(backend_cls): + cls._register_backend(name, backend_cls, force=force) + return backend_cls + + return _register + + def get(self, filepath): + return self.client.get(filepath) + + def get_text(self, filepath): + return self.client.get_text(filepath) + +@PIPELINES.register() +class RawFrameDecode: + """Load and decode frames with given indices. """ + + def __init__(self, io_backend='disk', decoding_backend='cv2', **kwargs): + self.io_backend = io_backend + self.decoding_backend = decoding_backend + self.kwargs = kwargs + self.file_client = None + + def _pillow2array(self,img, flag='color', channel_order='bgr'): + """Convert a pillow image to numpy array. """ + + channel_order = channel_order.lower() + if channel_order not in ['rgb', 'bgr']: + raise ValueError('channel order must be either "rgb" or "bgr"') + + if flag == 'unchanged': + array = np.array(img) + if array.ndim >= 3 and array.shape[2] >= 3: # color image + array[:, :, :3] = array[:, :, (2, 1, 0)] # RGB to BGR + else: + # If the image mode is not 'RGB', convert it to 'RGB' first. + if img.mode != 'RGB': + if img.mode != 'LA': + # Most formats except 'LA' can be directly converted to RGB + img = img.convert('RGB') + else: + # When the mode is 'LA', the default conversion will fill in + # the canvas with black, which sometimes shadows black objects + # in the foreground. + # + # Therefore, a random color (124, 117, 104) is used for canvas + img_rgba = img.convert('RGBA') + img = Image.new('RGB', img_rgba.size, (124, 117, 104)) + img.paste(img_rgba, mask=img_rgba.split()[3]) # 3 is alpha + if flag == 'color': + array = np.array(img) + if channel_order != 'rgb': + array = array[:, :, ::-1] # RGB to BGR + elif flag == 'grayscale': + img = img.convert('L') + array = np.array(img) + else: + raise ValueError( + 'flag must be "color", "grayscale" or "unchanged", ' + f'but got {flag}') + return array + + def _imfrombytes(self,content, flag='color', channel_order='bgr'):#, backend=None): + """Read an image from bytes. """ + + img_np = np.frombuffer(content, np.uint8) + flag = imread_flags[flag] if isinstance(flag, str) else flag + img = cv2.imdecode(img_np, flag) + if flag == IMREAD_COLOR and channel_order == 'rgb': + cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img) + return img + + def __call__(self, results): + """Perform the ``RawFrameDecode`` to pick frames given indices. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + # mmcv.use_backend(self.decoding_backend) + + directory = results['frame_dir'] + suffix = results['suffix'] + #modality = results['modality'] + + if self.file_client is None: + self.file_client = FileClient(self.io_backend, **self.kwargs) + + imgs = list() + + if results['frame_inds'].ndim != 1: + results['frame_inds'] = np.squeeze(results['frame_inds']) + + offset = results.get('offset', 0) + + for frame_idx in results['frame_inds']: + frame_idx += offset + filepath = osp.join(directory, suffix.format(frame_idx)) + img_bytes = self.file_client.get(filepath) #以二进制方式读取图片 + # Get frame with channel order RGB directly. 
+ + cur_frame = self._imfrombytes(img_bytes, channel_order='rgb') + imgs.append(cur_frame) + + results['imgs'] = imgs + results['original_shape'] = imgs[0].shape[:2] + results['img_shape'] = imgs[0].shape[:2] + + # we resize the gt_bboxes and proposals to their real scale + h, w = results['img_shape'] + scale_factor = np.array([w, h, w, h]) + if 'gt_bboxes' in results: + gt_bboxes = results['gt_bboxes'] + gt_bboxes_new = (gt_bboxes * scale_factor).astype(np.float32) + results['gt_bboxes'] = gt_bboxes_new + if 'proposals' in results and results['proposals'] is not None: + proposals = results['proposals'] + proposals = (proposals * scale_factor).astype(np.float32) + results['proposals'] = proposals + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'io_backend={self.io_backend}, ' + f'decoding_backend={self.decoding_backend})') + return repr_str + +@PIPELINES.register() +class SampleAVAFrames(SampleFrames): + + def __init__(self, clip_len, frame_interval=2, test_mode=False): + + super().__init__(clip_len, frame_interval, test_mode=test_mode) + + def _get_clips(self, center_index, skip_offsets, shot_info): + start = center_index - (self.clip_len // 2) * self.frame_interval + end = center_index + ((self.clip_len + 1) // 2) * self.frame_interval + frame_inds = list(range(start, end, self.frame_interval)) + frame_inds = frame_inds + skip_offsets + frame_inds = np.clip(frame_inds, shot_info[0], shot_info[1] - 1) + + return frame_inds + + def __call__(self, results): + fps = results['fps'] + timestamp = results['timestamp'] + timestamp_start = results['timestamp_start'] + shot_info = results['shot_info'] + + #delta=(timestamp - timestamp_start) 为该帧距离15min视频开头有几秒 + #center_index=fps*delta为该帧距离15min视频开头有几帧 + #center_index+1是为了避免后续采样时出现负数? + #后续需要以center_index为中心前后采样视频帧片段 + center_index = fps * (timestamp - timestamp_start) + 1 + + skip_offsets = np.random.randint( + -self.frame_interval // 2, (self.frame_interval + 1) // 2, + size=self.clip_len) + frame_inds = self._get_clips(center_index, skip_offsets, shot_info) + + results['frame_inds'] = np.array(frame_inds, dtype=np.int) + results['clip_len'] = self.clip_len + results['frame_interval'] = self.frame_interval + results['num_clips'] = 1 + results['crop_quadruple'] = np.array([0, 0, 1, 1], dtype=np.float32) + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'clip_len={self.clip_len}, ' + f'frame_interval={self.frame_interval}, ' + f'test_mode={self.test_mode})') + return repr_str + diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/sample_ucf24.py b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/sample_ucf24.py new file mode 100644 index 0000000..7d9e904 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/sample_ucf24.py @@ -0,0 +1,69 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
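+
+# Added illustrative note: SamplerUCF24 below builds a clip that ends at the
+# annotated key frame. With assumed values num_frames=16, frame_interval=1,
+# valid_mode=True and key frame '00009.jpg' in a folder of 40 frames,
+# _make_clip(9, 40) returns [1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 4, 5, 6, 7, 8, 9]
+# because indices below 1 are clamped to the first frame.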
+ +import os +import random + +from PIL import Image + +from ..registry import PIPELINES + + +@PIPELINES.register() +class SamplerUCF24(object): + """ + Sample frames id. + NOTE: Use PIL to read image here, has diff with CV2 + Args: + num_frames(int): The amount of frames used in a video + frame_interval(int): Sampling rate + valid_mode(bool): True or False. + Returns: + frames_idx: the index of sampled #frames. + """ + def __init__(self, + num_frames=16, + frame_interval=1, + valid_mode=False): + self.num_frames = num_frames + self.frame_interval = frame_interval if valid_mode else random.randint(1, 2) + self.valid_mode = valid_mode + + def _get(self, frames_idxs, img_folder, results): + imgs = [] + for idx in frames_idxs: + img = Image.open( + os.path.join(img_folder, '{:05d}.jpg'.format(idx))).convert('RGB') + imgs.append(img) + results['imgs'] = imgs + return results + + def _make_clip(self, im_ind, max_num): + frame_idxs = [] + for i in reversed(range(self.num_frames)): + # make it as a loop + i_temp = im_ind - i * self.frame_interval + if i_temp < 1: + i_temp = 1 + elif i_temp > max_num: + i_temp = max_num + frame_idxs.append(i_temp) + return frame_idxs + + def __call__(self, results): + img_folder, key_frame = os.path.split(results['filename']) + frame_len = len(os.listdir(img_folder)) + key_idx = int(key_frame[0:5]) + frame_idxs = self._make_clip(key_idx, frame_len) + return self._get(frame_idxs, img_folder, results) diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/segmentation.py b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/segmentation.py new file mode 100644 index 0000000..2471442 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/segmentation.py @@ -0,0 +1,130 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +from PIL import Image +import copy +import cv2 +from ..registry import PIPELINES + + +@PIPELINES.register() +class MultiRestrictSize(object): + def __init__(self, + min_size=None, + max_size=800, + flip=False, + multi_scale=[1.3]): + self.min_size = min_size + self.max_size = max_size + self.multi_scale = multi_scale + self.flip = flip + assert ((min_size is None)) or ((max_size is None)) + + def __call__(self, sample): + samples = [] + image = sample['current_img'] + h, w = image.shape[:2] + for scale in self.multi_scale: + # Fixed range of scales + sc = None + # Align short edge + if not (self.min_size is None): + if h > w: + short_edge = w + else: + short_edge = h + if short_edge > self.min_size: + sc = float(self.min_size) / short_edge + else: + if h > w: + long_edge = h + else: + long_edge = w + if long_edge > self.max_size: + sc = float(self.max_size) / long_edge + + if sc is None: + new_h = h + new_w = w + else: + new_h = sc * h + new_w = sc * w + new_h = int(new_h * scale) + new_w = int(new_w * scale) + + if (new_h - 1) % 16 != 0: + new_h = int(np.around((new_h - 1) / 16.) 
* 16 + 1) + if (new_w - 1) % 16 != 0: + new_w = int(np.around((new_w - 1) / 16.) * 16 + 1) + + if new_h == h and new_w == w: + samples.append(sample) + else: + new_sample = {} + for elem in sample.keys(): + if 'meta' in elem: + new_sample[elem] = sample[elem] + continue + tmp = sample[elem] + if 'label' in elem: + new_sample[elem] = sample[elem] + continue + else: + flagval = cv2.INTER_CUBIC + tmp = cv2.resize(tmp, + dsize=(new_w, new_h), + interpolation=flagval) + new_sample[elem] = tmp + samples.append(new_sample) + + if self.flip: + now_sample = samples[-1] + new_sample = {} + for elem in now_sample.keys(): + if 'meta' in elem: + new_sample[elem] = now_sample[elem].copy() + new_sample[elem]['flip'] = True + continue + tmp = now_sample[elem] + tmp = tmp[:, ::-1].copy() + new_sample[elem] = tmp + samples.append(new_sample) + + return samples + + +@PIPELINES.register() +class MultiNorm(object): + def __call__(self, samples): + for idx in range(len(samples)): + sample = samples[idx] + for elem in sample.keys(): + if 'meta' in elem: + continue + tmp = sample[elem] + if tmp is None: + continue + + if tmp.ndim == 2: + tmp = tmp[:, :, np.newaxis] + else: + tmp = tmp / 255. + tmp -= (0.485, 0.456, 0.406) + tmp /= (0.229, 0.224, 0.225) + + tmp = tmp.transpose((2, 0, 1)) + samples[idx][elem] = tmp + + return samples diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/segmentation_pipline.py b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/segmentation_pipline.py new file mode 100644 index 0000000..dda6dee --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/segmentation_pipline.py @@ -0,0 +1,40 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import copy + +import os +import numpy as np +import random +import paddle +from ..registry import PIPELINES +""" +pipeline ops for Action Segmentation Dataset. +""" + + +@PIPELINES.register() +class SegmentationSampler(object): + + def __init__(self, sample_rate): + self.sample_rate = sample_rate + + def __call__(self, results): + for key, data in results.items(): + if len(data.shape) == 1: + data = data[::self.sample_rate] + results[key] = copy.deepcopy(data) + else: + data = data[:, ::self.sample_rate] + results[key] = copy.deepcopy(data) + return results diff --git a/Bank_second_part/detect_process/paddlevideo/loader/pipelines/skeleton_pipeline.py b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/skeleton_pipeline.py new file mode 100644 index 0000000..d31c816 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/pipelines/skeleton_pipeline.py @@ -0,0 +1,1554 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import collections +from itertools import repeat +import copy as cp +from collections import abc +import numpy as np +import paddle.nn.functional as F +import random +import paddle +from ..registry import PIPELINES +from .augmentations_ava import iminvert, imflip_ +"""pipeline ops for Activity Net. +""" + + +def _ntuple(n): + def parse(x): + if isinstance(x, collections.abc.Iterable): + return tuple(x) + return tuple(repeat(x, n)) + + return parse + + +_single = _ntuple(1) +_pair = _ntuple(2) +_triple = _ntuple(3) +_quadruple = _ntuple(4) + + +def _init_lazy_if_proper(results, lazy): + """Initialize lazy operation properly. + + Make sure that a lazy operation is properly initialized, + and avoid a non-lazy operation accidentally getting mixed in. + + Required keys in results are "imgs" if "img_shape" not in results, + otherwise, Required keys in results are "img_shape", add or modified keys + are "img_shape", "lazy". + Add or modified keys in "lazy" are "original_shape", "crop_bbox", "flip", + "flip_direction", "interpolation". + + Args: + results (dict): A dict stores data pipeline result. + lazy (bool): Determine whether to apply lazy operation. Default: False. + """ + + if 'img_shape' not in results: + results['img_shape'] = results['imgs'][0].shape[:2] + if lazy: + if 'lazy' not in results: + img_h, img_w = results['img_shape'] + lazyop = dict() + lazyop['original_shape'] = results['img_shape'] + lazyop['crop_bbox'] = np.array([0, 0, img_w, img_h], + dtype=np.float32) + lazyop['flip'] = False + lazyop['flip_direction'] = None + lazyop['interpolation'] = None + results['lazy'] = lazyop + else: + assert 'lazy' not in results, 'Use Fuse after lazy operations' + + +@PIPELINES.register() +class AutoPadding(object): + """ + Sample or Padding frame skeleton feature. + Args: + window_size: int, temporal size of skeleton feature. + random_pad: bool, whether do random padding when frame length < window size. Default: False. + """ + + def __init__(self, window_size, random_pad=False): + self.window_size = window_size + self.random_pad = random_pad + + def get_frame_num(self, data): + C, T, V, M = data.shape + for i in range(T - 1, -1, -1): + tmp = np.sum(data[:, i, :, :]) + if tmp > 0: + T = i + 1 + break + return T + + def __call__(self, results): + data = results['data'] + + C, T, V, M = data.shape + T = self.get_frame_num(data) + if T == self.window_size: + data_pad = data[:, :self.window_size, :, :] + elif T < self.window_size: + begin = random.randint( + 0, self.window_size - T) if self.random_pad else 0 + data_pad = np.zeros((C, self.window_size, V, M)) + data_pad[:, begin:begin + T, :, :] = data[:, :T, :, :] + else: + if self.random_pad: + index = np.random.choice( + T, self.window_size, replace=False).astype('int64') + else: + index = np.linspace(0, T, self.window_size).astype("int64") + data_pad = data[:, index, :, :] + + results['data'] = data_pad + return results + + +@PIPELINES.register() +class SkeletonNorm(object): + """ + Normalize skeleton feature. + Args: + aixs: dimensions of vertex coordinate. 2 for (x,y), 3 for (x,y,z). Default: 2. 
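+        squeeze: bool, whether to reshape the data to (C, T, V), assuming a single person (M=1). Default: False.
+
+    Note (added for clarity): coordinates are centralised by subtracting the
+    coordinates of joint index 8 (used as the reference joint) and only the
+    first `axis` coordinate channels are kept, e.g. (x, y) out of (x, y, acc).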
+ """ + + def __init__(self, axis=2, squeeze=False): + self.axis = axis + self.squeeze = squeeze + + def __call__(self, results): + data = results['data'] + + # Centralization + data = data - data[:, :, 8:9, :] + data = data[:self.axis, :, :, :] # get (x,y) from (x,y, acc) + C, T, V, M = data.shape + if self.squeeze: + data = data.reshape((C, T, V)) # M = 1 + + results['data'] = data.astype('float32') + if 'label' in results: + label = results['label'] + results['label'] = np.expand_dims(label, 0).astype('int64') + return results + + +@PIPELINES.register() +class Iden(object): + """ + Wrapper Pipeline + """ + + def __init__(self, label_expand=True): + self.label_expand = label_expand + + def __call__(self, results): + data = results['data'] + results['data'] = data.astype('float32') + + if 'label' in results and self.label_expand: + label = results['label'] + results['label'] = np.expand_dims(label, 0).astype('int64') + return results + + +@PIPELINES.register() +class RandomRotation(object): + """ + Random rotation sketeton. + Args: + argument: bool, if rotation. + theta: float, rotation rate. + """ + + def __init__(self, argument, theta=0.3): + self.theta = theta + self.argument = argument + + def _rot(self, rot): + """ + rot: T,3 + """ + cos_r, sin_r = np.cos(rot), np.sin(rot) # T,3 + zeros = np.zeros((rot.shape[0], 1)) # T,1 + ones = np.ones((rot.shape[0], 1)) # T,1 + + r1 = np.stack((ones, zeros, zeros), axis=-1) # T,1,3 + rx2 = np.stack((zeros, cos_r[:, 0:1], sin_r[:, 0:1]), axis=-1) # T,1,3 + rx3 = np.stack((zeros, -sin_r[:, 0:1], cos_r[:, 0:1]), axis=-1) # T,1,3 + rx = np.concatenate((r1, rx2, rx3), axis=1) # T,3,3 + + ry1 = np.stack((cos_r[:, 1:2], zeros, -sin_r[:, 1:2]), axis=-1) + r2 = np.stack((zeros, ones, zeros), axis=-1) + ry3 = np.stack((sin_r[:, 1:2], zeros, cos_r[:, 1:2]), axis=-1) + ry = np.concatenate((ry1, r2, ry3), axis=1) + + rz1 = np.stack((cos_r[:, 2:3], sin_r[:, 2:3], zeros), axis=-1) + r3 = np.stack((zeros, zeros, ones), axis=-1) + rz2 = np.stack((-sin_r[:, 2:3], cos_r[:, 2:3], zeros), axis=-1) + rz = np.concatenate((rz1, rz2, r3), axis=1) + + rot = np.matmul(np.matmul(rz, ry), rx) + return rot + + def __call__(self, results): + # C,T,V,M + data = results['data'] + if self.argument: + C, T, V, M = data.shape + data_numpy = np.transpose(data, (1, 0, 2, 3)).conjugate().reshape( + T, C, V * M) # T,3,V*M + rot = np.random.uniform(-self.theta, self.theta, 3) + rot = np.stack( + [ + rot, + ] * T, axis=0) + rot = self._rot(rot) # T,3,3 + data_numpy = np.matmul(rot, data_numpy) + data_numpy = data_numpy.reshape(T, C, V, M) + data_numpy = np.transpose(data_numpy, (1, 0, 2, 3)) + data = data_numpy + results['data'] = data.astype(np.float32) + return results + + +@PIPELINES.register() +class SketeonCropSample(object): + """ + Sketeon Crop Sampler. + Args: + crop_model: str, crop model, support: ['center']. + p_interval: list, crop len + window_size: int, sample windows size. 
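+
+    Note (added): trailing frames whose coordinates are all zero are treated
+    as padding; the valid span is cropped according to p_interval (a
+    symmetric centre crop for a single ratio, a random crop for a range) and
+    then resampled to `window_size` frames with bilinear interpolation.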
+ """ + + def __init__(self, window_size, crop_model='center', p_interval=1): + assert crop_model in ['center'], "Don't support :" + crop_model + + self.crop_model = crop_model + self.window_size = window_size + self.p_interval = p_interval + + def __call__(self, results): + if self.crop_model == 'center': + # input: C,T,V,M + data = results['data'] + valid_frame_num = np.sum(data.sum(0).sum(-1).sum(-1) != 0) + + C, T, V, M = data.shape + begin = 0 + end = valid_frame_num + valid_size = end - begin + + #crop + if len(self.p_interval) == 1: + p = self.p_interval[0] + bias = int((1 - p) * valid_size / 2) + data = data[:, begin + bias:end - bias, :, :] # center_crop + cropped_length = data.shape[1] + else: + p = np.random.rand(1) * (self.p_interval[1] - self.p_interval[0] + ) + self.p_interval[0] + # constraint cropped_length lower bound as 64 + cropped_length = np.minimum( + np.maximum(int(np.floor(valid_size * p)), 64), valid_size) + bias = np.random.randint(0, valid_size - cropped_length + 1) + data = data[:, begin + bias:begin + bias + cropped_length, :, :] + + # resize + data = np.transpose(data, (0, 2, 3, 1)).conjugate().reshape( + C * V * M, cropped_length) + data = data[None, None, :, :] + # could perform both up sample and down sample + data_tensor = paddle.to_tensor(data) + data_tensor = F.interpolate( + data_tensor, + size=(C * V * M, self.window_size), + mode='bilinear', + align_corners=False).squeeze() + data = paddle.transpose( + paddle.reshape(data_tensor, (C, V, M, self.window_size)), + (0, 3, 1, 2)).numpy() + else: + raise NotImplementedError + results['data'] = data + return results + + +@PIPELINES.register() +class SketeonModalityTransform(object): + """ + Sketeon Crop Sampler. + Args: + crop_model: str, crop model, support: ['center']. + p_interval: list, crop len + window_size: int, sample windows size. + """ + + def __init__(self, bone, motion, joint=True, graph='ntu_rgb_d'): + + self.joint = joint + self.bone = bone + self.motion = motion + self.graph = graph + if self.graph == "ntu_rgb_d": + self.bone_pairs = ((1, 2), (2, 21), (3, 21), (4, 3), (5, 21), + (6, 5), (7, 6), (8, 7), (9, 21), (10, 9), + (11, 10), (12, 11), (13, 1), (14, 13), (15, 14), + (16, 15), (17, 1), (18, 17), (19, 18), (20, 19), + (22, 23), (21, 21), (23, 8), (24, 25), (25, 12)) + else: + raise NotImplementedError + + def __call__(self, results): + if self.joint: + return results + data_numpy = results['data'] + if self.bone: + bone_data_numpy = np.zeros_like(data_numpy) + for v1, v2 in self.bone_pairs: + bone_data_numpy[:, :, v1 - + 1] = data_numpy[:, :, v1 - + 1] - data_numpy[:, :, v2 - 1] + data_numpy = bone_data_numpy + if self.motion: + data_numpy[:, :-1] = data_numpy[:, 1:] - data_numpy[:, :-1] + data_numpy[:, -1] = 0 + results['data'] = data_numpy + return results + + +@PIPELINES.register() +class UniformSampleFrames: + """Uniformly sample frames from the video. + + To sample an n-frame clip from the video. UniformSampleFrames basically + divide the video into n segments of equal length and randomly sample one + frame from each segment. To make the testing results reproducible, a + random seed is set during testing, to make the sampling results + deterministic. + + Required keys are "total_frames", "start_index" , added or modified keys + are "frame_inds", "clip_len", "frame_interval" and "num_clips". + + Args: + clip_len (int): Frames of each sampled output clip. + num_clips (int): Number of clips to be sampled. Default: 1. 
+ test_mode (bool): Store True when building test or validation dataset. + Default: False. + seed (int): The random seed used during test time. Default: 255. + """ + + def __init__(self, clip_len, num_clips=1, test_mode=False, seed=255): + + self.clip_len = clip_len + self.num_clips = num_clips + self.test_mode = test_mode + self.seed = seed + + def _get_train_clips(self, num_frames, clip_len): + """Uniformly sample indices for training clips. + + Args: + num_frames (int): The number of frames. + clip_len (int): The length of the clip. + """ + + assert self.num_clips == 1 + if num_frames < clip_len: + start = np.random.randint(0, num_frames) + inds = np.arange(start, start + clip_len) + elif clip_len <= num_frames < 2 * clip_len: + basic = np.arange(clip_len) + inds = np.random.choice( + clip_len + 1, num_frames - clip_len, replace=False) + offset = np.zeros(clip_len + 1, dtype=np.int64) + offset[inds] = 1 + offset = np.cumsum(offset) + inds = basic + offset[:-1] + else: + bids = np.array( + [i * num_frames // clip_len for i in range(clip_len + 1)]) + bsize = np.diff(bids) + bst = bids[:clip_len] + offset = np.random.randint(bsize) + inds = bst + offset + return inds + + def _get_test_clips(self, num_frames, clip_len): + """Uniformly sample indices for testing clips. + + Args: + num_frames (int): The number of frames. + clip_len (int): The length of the clip. + """ + + np.random.seed(self.seed) + if num_frames < clip_len: + # Then we use a simple strategy + if num_frames < self.num_clips: + start_inds = list(range(self.num_clips)) + else: + start_inds = [ + i * num_frames // self.num_clips + for i in range(self.num_clips) + ] + inds = np.concatenate( + [np.arange(i, i + clip_len) for i in start_inds]) + elif clip_len <= num_frames < clip_len * 2: + all_inds = [] + for i in range(self.num_clips): + basic = np.arange(clip_len) + inds = np.random.choice( + clip_len + 1, num_frames - clip_len, replace=False) + offset = np.zeros(clip_len + 1, dtype=np.int64) + offset[inds] = 1 + offset = np.cumsum(offset) + inds = basic + offset[:-1] + all_inds.append(inds) + inds = np.concatenate(all_inds) + else: + bids = np.array( + [i * num_frames // clip_len for i in range(clip_len + 1)]) + bsize = np.diff(bids) + bst = bids[:clip_len] + all_inds = [] + for i in range(self.num_clips): + offset = np.random.randint(bsize) + all_inds.append(bst + offset) + inds = np.concatenate(all_inds) + return inds + + def __call__(self, results): + num_frames = results['total_frames'] + + if self.test_mode: + inds = self._get_test_clips(num_frames, self.clip_len) + else: + inds = self._get_train_clips(num_frames, self.clip_len) + + inds = np.mod(inds, num_frames) + start_index = results['start_index'] + inds = inds + start_index + + results['frame_inds'] = inds.astype(np.int) + results['clip_len'] = self.clip_len + results['frame_interval'] = None + results['num_clips'] = self.num_clips + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'clip_len={self.clip_len}, ' + f'num_clips={self.num_clips}, ' + f'test_mode={self.test_mode}, ' + f'seed={self.seed})') + return repr_str + + +@PIPELINES.register() +class PoseDecode: + """Load and decode pose with given indices. + + Required keys are "keypoint", "frame_inds" (optional), "keypoint_score" + (optional), added or modified keys are "keypoint", "keypoint_score" (if + applicable). + """ + + @staticmethod + def _load_kp(kp, frame_inds): + """Load keypoints given frame indices. + + Args: + kp (np.ndarray): The keypoint coordinates. 
+ frame_inds (np.ndarray): The frame indices. + """ + + return [x[frame_inds].astype(np.float32) for x in kp] + + @staticmethod + def _load_kpscore(kpscore, frame_inds): + """Load keypoint scores given frame indices. + + Args: + kpscore (np.ndarray): The confidence scores of keypoints. + frame_inds (np.ndarray): The frame indices. + """ + + return [x[frame_inds].astype(np.float32) for x in kpscore] + + def __call__(self, results): + + if 'frame_inds' not in results: + results['frame_inds'] = np.arange(results['total_frames']) + + if results['frame_inds'].ndim != 1: + results['frame_inds'] = np.squeeze(results['frame_inds']) + + offset = results.get('offset', 0) + frame_inds = results['frame_inds'] + offset + + if 'keypoint_score' in results: + kpscore = results['keypoint_score'] + results['keypoint_score'] = kpscore[:, frame_inds].astype( + np.float32) + + if 'keypoint' in results: + results['keypoint'] = results['keypoint'][:, frame_inds].astype( + np.float32) + + return results + + def __repr__(self): + repr_str = f'{self.__class__.__name__}()' + return repr_str + + +@PIPELINES.register() +class PoseCompact: + """Convert the coordinates of keypoints to make it more compact. + Specifically, it first find a tight bounding box that surrounds all joints + in each frame, then we expand the tight box by a given padding ratio. For + example, if 'padding == 0.25', then the expanded box has unchanged center, + and 1.25x width and height. + + Required keys in results are "img_shape", "keypoint", add or modified keys + are "img_shape", "keypoint", "crop_quadruple". + + Args: + padding (float): The padding size. Default: 0.25. + threshold (int): The threshold for the tight bounding box. If the width + or height of the tight bounding box is smaller than the threshold, + we do not perform the compact operation. Default: 10. + hw_ratio (float | tuple[float] | None): The hw_ratio of the expanded + box. Float indicates the specific ratio and tuple indicates a + ratio range. If set as None, it means there is no requirement on + hw_ratio. Default: None. + allow_imgpad (bool): Whether to allow expanding the box outside the + image to meet the hw_ratio requirement. Default: True. + + Returns: + type: Description of returned object. + """ + + def __init__(self, + padding=0.25, + threshold=10, + hw_ratio=None, + allow_imgpad=True): + + self.padding = padding + self.threshold = threshold + if hw_ratio is not None: + hw_ratio = _pair(hw_ratio) + + self.hw_ratio = hw_ratio + + self.allow_imgpad = allow_imgpad + assert self.padding >= 0 + + def _combine_quadruple(self, a, b): + return (a[0] + a[2] * b[0], a[1] + a[3] * b[1], a[2] * b[2], + a[3] * b[3]) + + def __call__(self, results): + img_shape = results['img_shape'] + h, w = img_shape + kp = results['keypoint'] + + # Make NaN zero + kp[np.isnan(kp)] = 0. 
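+        # Added note: the block below finds the tight bounding box of all
+        # non-zero keypoints, enlarges it by `padding` (and enforces
+        # `hw_ratio` if set), shifts the keypoints into the new box and
+        # records the change in results['img_shape'] and
+        # results['crop_quadruple'].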
+ kp_x = kp[..., 0] + kp_y = kp[..., 1] + + min_x = np.min(kp_x[kp_x != 0], initial=np.Inf) + min_y = np.min(kp_y[kp_y != 0], initial=np.Inf) + max_x = np.max(kp_x[kp_x != 0], initial=-np.Inf) + max_y = np.max(kp_y[kp_y != 0], initial=-np.Inf) + + # The compact area is too small + if max_x - min_x < self.threshold or max_y - min_y < self.threshold: + return results + + center = ((max_x + min_x) / 2, (max_y + min_y) / 2) + half_width = (max_x - min_x) / 2 * (1 + self.padding) + half_height = (max_y - min_y) / 2 * (1 + self.padding) + + if self.hw_ratio is not None: + half_height = max(self.hw_ratio[0] * half_width, half_height) + half_width = max(1 / self.hw_ratio[1] * half_height, half_width) + + min_x, max_x = center[0] - half_width, center[0] + half_width + min_y, max_y = center[1] - half_height, center[1] + half_height + + # hot update + if not self.allow_imgpad: + min_x, min_y = int(max(0, min_x)), int(max(0, min_y)) + max_x, max_y = int(min(w, max_x)), int(min(h, max_y)) + else: + min_x, min_y = int(min_x), int(min_y) + max_x, max_y = int(max_x), int(max_y) + + kp_x[kp_x != 0] -= min_x + kp_y[kp_y != 0] -= min_y + + new_shape = (max_y - min_y, max_x - min_x) + results['img_shape'] = new_shape + + # the order is x, y, w, h (in [0, 1]), a tuple + crop_quadruple = results.get('crop_quadruple', (0., 0., 1., 1.)) + new_crop_quadruple = (min_x / w, min_y / h, (max_x - min_x) / w, + (max_y - min_y) / h) + crop_quadruple = self._combine_quadruple(crop_quadruple, + new_crop_quadruple) + results['crop_quadruple'] = crop_quadruple + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(padding={self.padding}, ' + f'threshold={self.threshold}, ' + f'hw_ratio={self.hw_ratio}, ' + f'allow_imgpad={self.allow_imgpad})') + return repr_str + + +class CropBase: + @staticmethod + def _crop_kps(kps, crop_bbox): + return kps - crop_bbox[:2] + + @staticmethod + def _crop_imgs(imgs, crop_bbox): + x1, y1, x2, y2 = crop_bbox + return [img[y1:y2, x1:x2] for img in imgs] + + @staticmethod + def _box_crop(box, crop_bbox): + """Crop the bounding boxes according to the crop_bbox. + + Args: + box (np.ndarray): The bounding boxes. + crop_bbox(np.ndarray): The bbox used to crop the original image. + """ + + x1, y1, x2, y2 = crop_bbox + img_w, img_h = x2 - x1, y2 - y1 + + box_ = box.copy() + box_[..., 0::2] = np.clip(box[..., 0::2] - x1, 0, img_w - 1) + box_[..., 1::2] = np.clip(box[..., 1::2] - y1, 0, img_h - 1) + return box_ + + def _all_box_crop(self, results, crop_bbox): + """Crop the gt_bboxes and proposals in results according to crop_bbox. + + Args: + results (dict): All information about the sample, which contain + 'gt_bboxes' and 'proposals' (optional). + crop_bbox(np.ndarray): The bbox used to crop the original image. + """ + results['gt_bboxes'] = self._box_crop(results['gt_bboxes'], crop_bbox) + if 'proposals' in results and results['proposals'] is not None: + assert results['proposals'].shape[1] == 4 + results['proposals'] = self._box_crop(results['proposals'], + crop_bbox) + return results + + def __call__(self, results): + raise NotImplementedError + + +@PIPELINES.register() +class RandomResizedCrop_V2(CropBase): + """Random crop that specifics the area and height-weight ratio range. + + Required keys in results are "img_shape", "crop_bbox", "imgs" (optional), + "keypoint" (optional), added or modified keys are "imgs", "keypoint", + "crop_bbox" and "lazy"; Required keys in "lazy" are "flip", "crop_bbox", + added or modified key is "crop_bbox". 
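+
+    Note (added): in this implementation `area_range` is parsed with eval(),
+    so configs are expected to pass it as a string such as "(0.08, 1.0)";
+    `aspect_ratio_range` is used as given.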
+ + Args: + area_range (Tuple[float]): The candidate area scales range of + output cropped images. Default: (0.08, 1.0). + aspect_ratio_range (Tuple[float]): The candidate aspect ratio range of + output cropped images. Default: (3 / 4, 4 / 3). + lazy (bool): Determine whether to apply lazy operation. Default: False. + """ + + def __init__(self, + area_range=(0.08, 1.0), + aspect_ratio_range=(3 / 4, 4 / 3), + lazy=False): + self.area_range = eval(area_range) + self.aspect_ratio_range = aspect_ratio_range + self.lazy = lazy + if not is_tuple_of(self.area_range, float): + raise TypeError(f'Area_range must be a tuple of float, ' + f'but got {type(area_range)}') + if not is_tuple_of(self.aspect_ratio_range, float): + raise TypeError(f'Aspect_ratio_range must be a tuple of float, ' + f'but got {type(aspect_ratio_range)}') + + @staticmethod + def get_crop_bbox(img_shape, + area_range, + aspect_ratio_range, + max_attempts=10): + """Get a crop bbox given the area range and aspect ratio range. + + Args: + img_shape (Tuple[int]): Image shape + area_range (Tuple[float]): The candidate area scales range of + output cropped images. Default: (0.08, 1.0). + aspect_ratio_range (Tuple[float]): The candidate aspect + ratio range of output cropped images. Default: (3 / 4, 4 / 3). + max_attempts (int): The maximum of attempts. Default: 10. + max_attempts (int): Max attempts times to generate random candidate + bounding box. If it doesn't qualified one, the center bounding + box will be used. + Returns: + (list[int]) A random crop bbox within the area range and aspect + ratio range. + """ + assert 0 < area_range[0] <= area_range[1] <= 1 + assert 0 < aspect_ratio_range[0] <= aspect_ratio_range[1] + + img_h, img_w = img_shape + area = img_h * img_w + + min_ar, max_ar = aspect_ratio_range + aspect_ratios = np.exp( + np.random.uniform( + np.log(min_ar), np.log(max_ar), size=max_attempts)) + target_areas = np.random.uniform(*area_range, size=max_attempts) * area + candidate_crop_w = np.round(np.sqrt( + target_areas * aspect_ratios)).astype(np.int32) + candidate_crop_h = np.round(np.sqrt( + target_areas / aspect_ratios)).astype(np.int32) + + for i in range(max_attempts): + crop_w = candidate_crop_w[i] + crop_h = candidate_crop_h[i] + if crop_h <= img_h and crop_w <= img_w: + x_offset = random.randint(0, img_w - crop_w) + y_offset = random.randint(0, img_h - crop_h) + return x_offset, y_offset, x_offset + crop_w, y_offset + crop_h + + # Fallback + crop_size = min(img_h, img_w) + x_offset = (img_w - crop_size) // 2 + y_offset = (img_h - crop_size) // 2 + return x_offset, y_offset, x_offset + crop_size, y_offset + crop_size + + def __call__(self, results): + """Performs the RandomResizeCrop augmentation. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. 
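+
+        Note (added): a crop box is drawn with get_crop_bbox(), the running
+        results['crop_quadruple'] is composed with the new relative crop, and
+        the keypoints / imgs (or the lazy-op record) plus any gt_bboxes and
+        proposals are cropped accordingly.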
+ """ + _init_lazy_if_proper(results, self.lazy) + if 'keypoint' in results: + assert not self.lazy, ('Keypoint Augmentations are not compatible ' + 'with lazy == True') + + img_h, img_w = results['img_shape'] + + left, top, right, bottom = self.get_crop_bbox( + (img_h, img_w), self.area_range, self.aspect_ratio_range) + new_h, new_w = bottom - top, right - left + + if 'crop_quadruple' not in results: + results['crop_quadruple'] = np.array( + [0, 0, 1, 1], # x, y, w, h + dtype=np.float32) + + x_ratio, y_ratio = left / img_w, top / img_h + w_ratio, h_ratio = new_w / img_w, new_h / img_h + + old_crop_quadruple = results['crop_quadruple'] + old_x_ratio, old_y_ratio = old_crop_quadruple[0], old_crop_quadruple[1] + old_w_ratio, old_h_ratio = old_crop_quadruple[2], old_crop_quadruple[3] + new_crop_quadruple = [ + old_x_ratio + x_ratio * old_w_ratio, + old_y_ratio + y_ratio * old_h_ratio, w_ratio * old_w_ratio, + h_ratio * old_h_ratio + ] + results['crop_quadruple'] = np.array( + new_crop_quadruple, dtype=np.float32) + + crop_bbox = np.array([left, top, right, bottom]) + results['crop_bbox'] = crop_bbox + results['img_shape'] = (new_h, new_w) + + if not self.lazy: + if 'keypoint' in results: + results['keypoint'] = self._crop_kps(results['keypoint'], + crop_bbox) + if 'imgs' in results: + results['imgs'] = self._crop_imgs(results['imgs'], crop_bbox) + else: + lazyop = results['lazy'] + if lazyop['flip']: + raise NotImplementedError('Put Flip at last for now') + + # record crop_bbox in lazyop dict to ensure only crop once in Fuse + lazy_left, lazy_top, lazy_right, lazy_bottom = lazyop['crop_bbox'] + left = left * (lazy_right - lazy_left) / img_w + right = right * (lazy_right - lazy_left) / img_w + top = top * (lazy_bottom - lazy_top) / img_h + bottom = bottom * (lazy_bottom - lazy_top) / img_h + lazyop['crop_bbox'] = np.array( + [(lazy_left + left), (lazy_top + top), (lazy_left + right), + (lazy_top + bottom)], + dtype=np.float32) + + if 'gt_bboxes' in results: + assert not self.lazy + results = self._all_box_crop(results, results['crop_bbox']) + + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'area_range={self.area_range}, ' + f'aspect_ratio_range={self.aspect_ratio_range}, ' + f'lazy={self.lazy})') + return repr_str + + +def is_seq_of(seq, expected_type, seq_type=None): + """Check whether it is a sequence of some type. + + Args: + seq (Sequence): The sequence to be checked. + expected_type (type): Expected type of sequence items. + seq_type (type, optional): Expected sequence type. + + Returns: + bool: Whether the sequence is valid. + """ + if seq_type is None: + exp_seq_type = abc.Sequence + else: + assert isinstance(seq_type, type) + exp_seq_type = seq_type + if not isinstance(seq, exp_seq_type): + return False + for item in seq: + if not isinstance(item, expected_type): + return False + return True + + +def is_tuple_of(seq, expected_type): + """Check whether it is a tuple of some type. + + A partial method of :func:`is_seq_of`. + """ + return is_seq_of(seq, expected_type, seq_type=tuple) + + +@PIPELINES.register() +class CenterCrop_V2(CropBase): + """Crop the center area from images. + + Required keys are "img_shape", "imgs" (optional), "keypoint" (optional), + added or modified keys are "imgs", "keypoint", "crop_bbox", "lazy" and + "img_shape". Required keys in "lazy" is "crop_bbox", added or modified key + is "crop_bbox". + + Args: + crop_size (int | tuple[int]): (w, h) of crop size. + lazy (bool): Determine whether to apply lazy operation. 
Default: False. + """ + + def __init__(self, crop_size, lazy=False): + self.crop_size = _pair(crop_size) + self.lazy = lazy + if not is_tuple_of(self.crop_size, int): + raise TypeError(f'Crop_size must be int or tuple of int, ' + f'but got {type(crop_size)}') + + def __call__(self, results): + """Performs the CenterCrop augmentation. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + _init_lazy_if_proper(results, self.lazy) + if 'keypoint' in results: + assert not self.lazy, ('Keypoint Augmentations are not compatible ' + 'with lazy == True') + + img_h, img_w = results['img_shape'] + crop_w, crop_h = self.crop_size + + left = (img_w - crop_w) // 2 + top = (img_h - crop_h) // 2 + right = left + crop_w + bottom = top + crop_h + new_h, new_w = bottom - top, right - left + + crop_bbox = np.array([left, top, right, bottom]) + results['crop_bbox'] = crop_bbox + results['img_shape'] = (new_h, new_w) + + if 'crop_quadruple' not in results: + results['crop_quadruple'] = np.array( + [0, 0, 1, 1], # x, y, w, h + dtype=np.float32) + + x_ratio, y_ratio = left / img_w, top / img_h + w_ratio, h_ratio = new_w / img_w, new_h / img_h + + old_crop_quadruple = results['crop_quadruple'] + old_x_ratio, old_y_ratio = old_crop_quadruple[0], old_crop_quadruple[1] + old_w_ratio, old_h_ratio = old_crop_quadruple[2], old_crop_quadruple[3] + new_crop_quadruple = [ + old_x_ratio + x_ratio * old_w_ratio, + old_y_ratio + y_ratio * old_h_ratio, w_ratio * old_w_ratio, + h_ratio * old_h_ratio + ] + results['crop_quadruple'] = np.array( + new_crop_quadruple, dtype=np.float32) + + if not self.lazy: + if 'keypoint' in results: + results['keypoint'] = self._crop_kps(results['keypoint'], + crop_bbox) + if 'imgs' in results: + results['imgs'] = self._crop_imgs(results['imgs'], crop_bbox) + else: + lazyop = results['lazy'] + if lazyop['flip']: + raise NotImplementedError('Put Flip at last for now') + + # record crop_bbox in lazyop dict to ensure only crop once in Fuse + lazy_left, lazy_top, lazy_right, lazy_bottom = lazyop['crop_bbox'] + left = left * (lazy_right - lazy_left) / img_w + right = right * (lazy_right - lazy_left) / img_w + top = top * (lazy_bottom - lazy_top) / img_h + bottom = bottom * (lazy_bottom - lazy_top) / img_h + lazyop['crop_bbox'] = np.array( + [(lazy_left + left), (lazy_top + top), (lazy_left + right), + (lazy_top + bottom)], + dtype=np.float32) + + if 'gt_bboxes' in results: + assert not self.lazy + results = self._all_box_crop(results, results['crop_bbox']) + + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(crop_size={self.crop_size}, ' + f'lazy={self.lazy})') + return repr_str + + +@PIPELINES.register() +class Flip_V2: + """Flip the input images with a probability. + + Reverse the order of elements in the given imgs with a specific direction. + The shape of the imgs is preserved, but the elements are reordered. + + Required keys are "img_shape", "modality", "imgs" (optional), "keypoint" + (optional), added or modified keys are "imgs", "keypoint", "lazy" and + "flip_direction". Required keys in "lazy" is None, added or modified key + are "flip" and "flip_direction". The Flip augmentation should be placed + after any cropping / reshaping augmentations, to make sure crop_quadruple + is calculated properly. + + Args: + flip_ratio (float): Probability of implementing flip. Default: 0.5. + direction (str): Flip imgs horizontally or vertically. Options are + "horizontal" | "vertical". Default: "horizontal". 
+ flip_label_map (Dict[int, int] | None): Transform the label of the + flipped image with the specific label. Default: None. + left_kp (list[int]): Indexes of left keypoints, used to flip keypoints. + Default: None. + right_kp (list[ind]): Indexes of right keypoints, used to flip + keypoints. Default: None. + lazy (bool): Determine whether to apply lazy operation. Default: False. + """ + _directions = ['horizontal', 'vertical'] + + def __init__(self, + flip_ratio=0.5, + direction='horizontal', + flip_label_map=None, + left_kp=None, + right_kp=None, + lazy=False): + if direction not in self._directions: + raise ValueError(f'Direction {direction} is not supported. ' + f'Currently support ones are {self._directions}') + self.flip_ratio = flip_ratio + self.direction = direction + self.flip_label_map = flip_label_map + self.left_kp = left_kp + self.right_kp = right_kp + self.lazy = lazy + + def _flip_imgs(self, imgs, modality): + _ = [imflip_(img, self.direction) for img in imgs] + lt = len(imgs) + if modality == 'Flow': + # The 1st frame of each 2 frames is flow-x + for i in range(0, lt, 2): + imgs[i] = iminvert(imgs[i]) + return imgs + + def _flip_kps(self, kps, kpscores, img_width): + kp_x = kps[..., 0] + kp_x[kp_x != 0] = img_width - kp_x[kp_x != 0] + new_order = list(range(kps.shape[2])) + if self.left_kp is not None and self.right_kp is not None: + for left, right in zip(self.left_kp, self.right_kp): + new_order[left] = right + new_order[right] = left + kps = kps[:, :, new_order] + if kpscores is not None: + kpscores = kpscores[:, :, new_order] + return kps, kpscores + + @staticmethod + def _box_flip(box, img_width): + """Flip the bounding boxes given the width of the image. + + Args: + box (np.ndarray): The bounding boxes. + img_width (int): The img width. + """ + box_ = box.copy() + box_[..., 0::4] = img_width - box[..., 2::4] + box_[..., 2::4] = img_width - box[..., 0::4] + return box_ + + def __call__(self, results): + """Performs the Flip augmentation. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. 
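+
+        Example (an illustrative sketch with a minimal ``results`` dict;
+        the values are made up)::
+
+            flip = Flip_V2(flip_ratio=1., direction='horizontal')
+            results = dict(imgs=[np.zeros((8, 8, 3), dtype=np.uint8)],
+                           img_shape=(8, 8), modality='RGB')
+            results = flip(results)  # results['flip'] is True for ratio 1.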
+ """ + _init_lazy_if_proper(results, self.lazy) + if 'keypoint' in results: + assert not self.lazy, ('Keypoint Augmentations are not compatible ' + 'with lazy == True') + assert self.direction == 'horizontal', ( + 'Only horizontal flips are' + 'supported for human keypoints') + + modality = results['modality'] + if modality == 'Flow': + assert self.direction == 'horizontal' + + flip = np.random.rand() < self.flip_ratio + + results['flip'] = flip + results['flip_direction'] = self.direction + img_width = results['img_shape'][1] + + if self.flip_label_map is not None and flip: + results['label'] = self.flip_label_map.get(results['label'], + results['label']) + + if not self.lazy: + if flip: + if 'imgs' in results: + results['imgs'] = self._flip_imgs(results['imgs'], modality) + if 'keypoint' in results: + kp = results['keypoint'] + kpscore = results.get('keypoint_score', None) + kp, kpscore = self._flip_kps(kp, kpscore, img_width) + results['keypoint'] = kp + if 'keypoint_score' in results: + results['keypoint_score'] = kpscore + else: + lazyop = results['lazy'] + if lazyop['flip']: + raise NotImplementedError('Use one Flip please') + lazyop['flip'] = flip + lazyop['flip_direction'] = self.direction + + if 'gt_bboxes' in results and flip: + assert not self.lazy and self.direction == 'horizontal' + width = results['img_shape'][1] + results['gt_bboxes'] = self._box_flip(results['gt_bboxes'], width) + if 'proposals' in results and results['proposals'] is not None: + assert results['proposals'].shape[1] == 4 + results['proposals'] = self._box_flip(results['proposals'], + width) + + return results + + def __repr__(self): + repr_str = ( + f'{self.__class__.__name__}(' + f'flip_ratio={self.flip_ratio}, direction={self.direction}, ' + f'flip_label_map={self.flip_label_map}, lazy={self.lazy})') + return repr_str + + +@PIPELINES.register() +class FormatShape: + """Format final imgs shape to the given input_format. + + Required keys are "imgs", "num_clips" and "clip_len", added or modified + keys are "imgs" and "input_shape". + + Args: + input_format (str): Define the final imgs format. + collapse (bool): To collpase input_format N... to ... (NCTHW to CTHW, + etc.) if N is 1. Should be set as True when training and testing + detectors. Default: False. + """ + + def __init__(self, input_format, collapse=False): + self.input_format = input_format + self.collapse = collapse + if self.input_format not in ['NCTHW', 'NCHW', 'NCHW_Flow', 'NPTCHW']: + raise ValueError( + f'The input format {self.input_format} is invalid.') + + def __call__(self, results): + """Performs the FormatShape formating. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. 
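+
+        Example (illustrative shapes): with ``num_clips=2``, ``clip_len=4``
+        and 8 sampled frames of shape (224, 224, 3), ``input_format='NCTHW'``
+        reshapes imgs from (8, 224, 224, 3) to (2, 3, 4, 224, 224),
+        i.e. N_clips x C x T x H x W.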
+        """
+        if not isinstance(results['imgs'], np.ndarray):
+            results['imgs'] = np.array(results['imgs'])
+        imgs = results['imgs']
+        # [M x H x W x C]
+        # M = 1 * N_crops * N_clips * L
+        if self.collapse:
+            assert results['num_clips'] == 1
+
+        if self.input_format == 'NCTHW':
+            num_clips = results['num_clips']
+            clip_len = results['clip_len']
+
+            imgs = imgs.reshape((-1, num_clips, clip_len) + imgs.shape[1:])
+            # N_crops x N_clips x L x H x W x C
+            imgs = np.transpose(imgs, (0, 1, 5, 2, 3, 4))
+            # N_crops x N_clips x C x L x H x W
+            imgs = imgs.reshape((-1, ) + imgs.shape[2:])
+            # M' x C x L x H x W
+            # M' = N_crops x N_clips
+        elif self.input_format == 'NCHW':
+            imgs = np.transpose(imgs, (0, 3, 1, 2))
+            # M x C x H x W
+        elif self.input_format == 'NCHW_Flow':
+            num_clips = results['num_clips']
+            clip_len = results['clip_len']
+            imgs = imgs.reshape((-1, num_clips, clip_len) + imgs.shape[1:])
+            # N_crops x N_clips x L x H x W x C
+            imgs = np.transpose(imgs, (0, 1, 2, 5, 3, 4))
+            # N_crops x N_clips x L x C x H x W
+            imgs = imgs.reshape((-1, imgs.shape[2] * imgs.shape[3]) +
+                                imgs.shape[4:])
+            # M' x C' x H x W
+            # M' = N_crops x N_clips
+            # C' = L x C
+        elif self.input_format == 'NPTCHW':
+            num_proposals = results['num_proposals']
+            num_clips = results['num_clips']
+            clip_len = results['clip_len']
+            imgs = imgs.reshape((num_proposals, num_clips * clip_len) +
+                                imgs.shape[1:])
+            # P x M x H x W x C
+            # M = N_clips x L
+            imgs = np.transpose(imgs, (0, 1, 4, 2, 3))
+            # P x M x C x H x W
+
+        if self.collapse:
+            assert imgs.shape[0] == 1
+            imgs = imgs.squeeze(0)
+
+        results['imgs'] = imgs
+        results['input_shape'] = imgs.shape
+        return results
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        repr_str += f"(input_format='{self.input_format}')"
+        return repr_str
+
+
+@PIPELINES.register()
+class Collect:
+    """Collect data from the loader relevant to the specific task.
+
+    This keeps the items in ``keys`` as they are, and collects items in
+    ``meta_keys`` into a meta item called ``meta_name``. This is usually
+    the last stage of the data loader pipeline.
+    For example, when keys='imgs', meta_keys=('filename', 'label',
+    'original_shape'), meta_name='img_metas', the results will be a dict with
+    keys 'imgs' and 'img_metas', where 'img_metas' is a DataContainer of
+    another dict with keys 'filename', 'label', 'original_shape'.
+
+    Args:
+        keys (Sequence[str]): Required keys to be collected.
+        meta_name (str): The name of the key that contains meta information.
+            This key is always populated. Default: "img_metas".
+        meta_keys (Sequence[str]): Keys that are collected under meta_name.
+            The contents of the ``meta_name`` dictionary depend on
+            ``meta_keys``.
+            By default this includes:
+
+            - "filename": path to the image file
+            - "label": label of the image file
+            - "original_shape": original shape of the image as a tuple
+                (h, w, c)
+            - "img_shape": shape of the image input to the network as a tuple
+                (h, w, c). Note that images may be zero padded on the
+                bottom/right, if the batch tensor is larger than this shape.
+            - "pad_shape": image shape after padding
+            - "flip_direction": a str in ("horizontal", "vertical") to
+                indicate if the image is flipped horizontally or vertically.
+            - "img_norm_cfg": a dict of normalization information:
+                - mean - per channel mean subtraction
+                - std - per channel std divisor
+                - to_rgb - bool indicating if bgr was converted to rgb
+        nested (bool): If set as True, will apply data[x] = [data[x]] to all
+            items in data. The arg is added for compatibility.
Default: False. + """ + + def __init__(self, + keys, + meta_keys=('filename', 'label', 'original_shape', 'img_shape', + 'pad_shape', 'flip_direction', 'img_norm_cfg'), + meta_name='img_metas'): + self.keys = keys + self.meta_keys = meta_keys + self.meta_name = meta_name + + def __call__(self, results): + """Performs the Collect formating. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + data = [] + for key in self.keys: + data.append(results[key]) + + if len(self.meta_keys) != 0: + meta = {} + for key in self.meta_keys: + meta[key] = results[key] + data.append(meta) + + return data + + def __repr__(self): + return (f'{self.__class__.__name__}(' + f'keys={self.keys}, meta_keys={self.meta_keys}, ' + f'nested={self.nested})') + + +@PIPELINES.register() +class GeneratePoseTarget: + """Generate pseudo heatmaps based on joint coordinates and confidence. + + Required keys are "keypoint", "img_shape", "keypoint_score" (optional), + added or modified keys are "imgs". + + Args: + sigma (float): The sigma of the generated gaussian map. Default: 0.6. + use_score (bool): Use the confidence score of keypoints as the maximum + of the gaussian maps. Default: True. + with_kp (bool): Generate pseudo heatmaps for keypoints. Default: True. + with_limb (bool): Generate pseudo heatmaps for limbs. At least one of + 'with_kp' and 'with_limb' should be True. Default: False. + skeletons (tuple[tuple]): The definition of human skeletons. + Default: ((0, 1), (0, 2), (1, 3), (2, 4), (0, 5), (5, 7), (7, 9), + (0, 6), (6, 8), (8, 10), (5, 11), (11, 13), (13, 15), + (6, 12), (12, 14), (14, 16), (11, 12)), + which is the definition of COCO-17p skeletons. + double (bool): Output both original heatmaps and flipped heatmaps. + Default: False. + left_kp (tuple[int]): Indexes of left keypoints, which is used when + flipping heatmaps. Default: (1, 3, 5, 7, 9, 11, 13, 15), + which is left keypoints in COCO-17p. + right_kp (tuple[int]): Indexes of right keypoints, which is used when + flipping heatmaps. Default: (2, 4, 6, 8, 10, 12, 14, 16), + which is right keypoints in COCO-17p. + """ + + def __init__(self, + sigma=0.6, + use_score=True, + with_kp=True, + with_limb=False, + skeletons=((0, 1), (0, 2), (1, 3), (2, 4), (0, 5), (5, 7), + (7, 9), (0, 6), (6, 8), (8, 10), (5, 11), (11, 13), + (13, 15), (6, 12), (12, 14), (14, 16), (11, 12)), + double=False, + left_kp=(1, 3, 5, 7, 9, 11, 13, 15), + right_kp=(2, 4, 6, 8, 10, 12, 14, 16)): + + self.sigma = sigma + self.use_score = use_score + self.with_kp = with_kp + self.with_limb = with_limb + self.double = double + + # an auxiliary const + self.eps = 1e-4 + + assert self.with_kp or self.with_limb, ( + 'At least one of "with_limb" ' + 'and "with_kp" should be set as True.') + self.left_kp = left_kp + self.right_kp = right_kp + self.skeletons = skeletons + + def generate_a_heatmap(self, img_h, img_w, centers, sigma, max_values): + """Generate pseudo heatmap for one keypoint in one frame. + + Args: + img_h (int): The height of the heatmap. + img_w (int): The width of the heatmap. + centers (np.ndarray): The coordinates of corresponding keypoints + (of multiple persons). + sigma (float): The sigma of generated gaussian. + max_values (np.ndarray): The max values of each keypoint. + + Returns: + np.ndarray: The generated pseudo heatmap. 
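+
+        Note: each center with a sufficiently large max value contributes a
+        Gaussian patch ``max_value * exp(-((x - mu_x)**2 + (y - mu_y)**2)
+        / (2 * sigma**2))`` inside a 3-sigma window, and patches from
+        different persons are merged with an element-wise maximum.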
+ """ + + heatmap = np.zeros([img_h, img_w], dtype=np.float32) + + for center, max_value in zip(centers, max_values): + mu_x, mu_y = center[0], center[1] + if max_value < self.eps: + continue + + st_x = max(int(mu_x - 3 * sigma), 0) + ed_x = min(int(mu_x + 3 * sigma) + 1, img_w) + st_y = max(int(mu_y - 3 * sigma), 0) + ed_y = min(int(mu_y + 3 * sigma) + 1, img_h) + x = np.arange(st_x, ed_x, 1, np.float32) + y = np.arange(st_y, ed_y, 1, np.float32) + + # if the keypoint not in the heatmap coordinate system + if not (len(x) and len(y)): + continue + y = y[:, None] + + patch = np.exp(-((x - mu_x)**2 + (y - mu_y)**2) / 2 / sigma**2) + patch = patch * max_value + heatmap[st_y:ed_y, st_x:ed_x] = np.maximum( + heatmap[st_y:ed_y, st_x:ed_x], patch) + + return heatmap + + def generate_a_limb_heatmap(self, img_h, img_w, starts, ends, sigma, + start_values, end_values): + """Generate pseudo heatmap for one limb in one frame. + + Args: + img_h (int): The height of the heatmap. + img_w (int): The width of the heatmap. + starts (np.ndarray): The coordinates of one keypoint in the + corresponding limbs (of multiple persons). + ends (np.ndarray): The coordinates of the other keypoint in the + corresponding limbs (of multiple persons). + sigma (float): The sigma of generated gaussian. + start_values (np.ndarray): The max values of one keypoint in the + corresponding limbs. + end_values (np.ndarray): The max values of the other keypoint in + the corresponding limbs. + + Returns: + np.ndarray: The generated pseudo heatmap. + """ + + heatmap = np.zeros([img_h, img_w], dtype=np.float32) + + for start, end, start_value, end_value in zip(starts, ends, + start_values, end_values): + value_coeff = min(start_value, end_value) + if value_coeff < self.eps: + continue + + min_x, max_x = min(start[0], end[0]), max(start[0], end[0]) + min_y, max_y = min(start[1], end[1]), max(start[1], end[1]) + + min_x = max(int(min_x - 3 * sigma), 0) + max_x = min(int(max_x + 3 * sigma) + 1, img_w) + min_y = max(int(min_y - 3 * sigma), 0) + max_y = min(int(max_y + 3 * sigma) + 1, img_h) + + x = np.arange(min_x, max_x, 1, np.float32) + y = np.arange(min_y, max_y, 1, np.float32) + + if not (len(x) and len(y)): + continue + + y = y[:, None] + x_0 = np.zeros_like(x) + y_0 = np.zeros_like(y) + + # distance to start keypoints + d2_start = ((x - start[0])**2 + (y - start[1])**2) + + # distance to end keypoints + d2_end = ((x - end[0])**2 + (y - end[1])**2) + + # the distance between start and end keypoints. + d2_ab = ((start[0] - end[0])**2 + (start[1] - end[1])**2) + + if d2_ab < 1: + full_map = self.generate_a_heatmap(img_h, img_w, [start], sigma, + [start_value]) + heatmap = np.maximum(heatmap, full_map) + continue + + coeff = (d2_start - d2_end + d2_ab) / 2. / d2_ab + + a_dominate = coeff <= 0 + b_dominate = coeff >= 1 + seg_dominate = 1 - a_dominate - b_dominate + + position = np.stack([x + y_0, y + x_0], axis=-1) + projection = start + np.stack([coeff, coeff], + axis=-1) * (end - start) + d2_line = position - projection + d2_line = d2_line[:, :, 0]**2 + d2_line[:, :, 1]**2 + d2_seg = (a_dominate * d2_start + b_dominate * d2_end + + seg_dominate * d2_line) + + patch = np.exp(-d2_seg / 2. / sigma**2) + patch = patch * value_coeff + + heatmap[min_y:max_y, min_x:max_x] = np.maximum( + heatmap[min_y:max_y, min_x:max_x], patch) + + return heatmap + + def generate_heatmap(self, img_h, img_w, kps, sigma, max_values): + """Generate pseudo heatmap for all keypoints and limbs in one frame (if + needed). 
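+        Keypoint maps and limb maps are stacked along the last axis, so the
+        returned array has shape (img_h, img_w, C), where C is the number of
+        keypoints plus the number of limbs actually generated (depending on
+        ``with_kp`` and ``with_limb``).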
+ + Args: + img_h (int): The height of the heatmap. + img_w (int): The width of the heatmap. + kps (np.ndarray): The coordinates of keypoints in this frame. + sigma (float): The sigma of generated gaussian. + max_values (np.ndarray): The confidence score of each keypoint. + + Returns: + np.ndarray: The generated pseudo heatmap. + """ + + heatmaps = [] + if self.with_kp: + num_kp = kps.shape[1] + for i in range(num_kp): + heatmap = self.generate_a_heatmap(img_h, img_w, kps[:, i], + sigma, max_values[:, i]) + heatmaps.append(heatmap) + + if self.with_limb: + for limb in self.skeletons: + start_idx, end_idx = limb + starts = kps[:, start_idx] + ends = kps[:, end_idx] + + start_values = max_values[:, start_idx] + end_values = max_values[:, end_idx] + heatmap = self.generate_a_limb_heatmap( + img_h, img_w, starts, ends, sigma, start_values, end_values) + heatmaps.append(heatmap) + + return np.stack(heatmaps, axis=-1) + + def gen_an_aug(self, results): + """Generate pseudo heatmaps for all frames. + + Args: + results (dict): The dictionary that contains all info of a sample. + + Returns: + list[np.ndarray]: The generated pseudo heatmaps. + """ + + all_kps = results['keypoint'] + kp_shape = all_kps.shape + + if 'keypoint_score' in results: + all_kpscores = results['keypoint_score'] + else: + all_kpscores = np.ones(kp_shape[:-1], dtype=np.float32) + + img_h, img_w = results['img_shape'] + num_frame = kp_shape[1] + + imgs = [] + for i in range(num_frame): + sigma = self.sigma + kps = all_kps[:, i] + kpscores = all_kpscores[:, i] + + max_values = np.ones(kpscores.shape, dtype=np.float32) + if self.use_score: + max_values = kpscores + + hmap = self.generate_heatmap(img_h, img_w, kps, sigma, max_values) + imgs.append(hmap) + + return imgs + + def __call__(self, results): + if not self.double: + results['imgs'] = np.stack(self.gen_an_aug(results)) + else: + results_ = cp.deepcopy(results) + flip = Flip_V2( + flip_ratio=1, left_kp=self.left_kp, right_kp=self.right_kp) + results_ = flip(results_) + results['imgs'] = np.concatenate( + [self.gen_an_aug(results), + self.gen_an_aug(results_)]) + results['label'] = np.array([results['label']]) + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'sigma={self.sigma}, ' + f'use_score={self.use_score}, ' + f'with_kp={self.with_kp}, ' + f'with_limb={self.with_limb}, ' + f'skeletons={self.skeletons}, ' + f'double={self.double}, ' + f'left_kp={self.left_kp}, ' + f'right_kp={self.right_kp})') + return repr_str diff --git a/Bank_second_part/detect_process/paddlevideo/loader/registry.py b/Bank_second_part/detect_process/paddlevideo/loader/registry.py new file mode 100644 index 0000000..add6631 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/loader/registry.py @@ -0,0 +1,18 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
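+
+# Illustrative usage sketch (hedged: it assumes the Registry class from
+# ``..utils`` exposes ``register``, as used by the pipeline classes in this
+# commit, and a ``get`` lookup for resolving names from config files):
+#
+#     @PIPELINES.register()
+#     class MyTransform:
+#         def __call__(self, results):
+#             return results
+#
+#     transform_cls = PIPELINES.get("MyTransform")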
+ +from ..utils import Registry + +PIPELINES = Registry("pipeline") +DATASETS = Registry("datasets") diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/ActivityNet/__init__.py b/Bank_second_part/detect_process/paddlevideo/metrics/ActivityNet/__init__.py new file mode 100644 index 0000000..eefabbd --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/metrics/ActivityNet/__init__.py @@ -0,0 +1,3 @@ +from .anet_prop import ANETproposal + +__all__ = ['ANETproposal'] diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/ActivityNet/__pycache__/__init__.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/ActivityNet/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..c5a7b5f Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/ActivityNet/__pycache__/__init__.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/ActivityNet/__pycache__/anet_prop.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/ActivityNet/__pycache__/anet_prop.cpython-310.pyc new file mode 100644 index 0000000..5987d5b Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/ActivityNet/__pycache__/anet_prop.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/ActivityNet/anet_prop.py b/Bank_second_part/detect_process/paddlevideo/metrics/ActivityNet/anet_prop.py new file mode 100644 index 0000000..411b164 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/metrics/ActivityNet/anet_prop.py @@ -0,0 +1,359 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +import json +import numpy as np +import pandas as pd +import urllib.request as urllib2 +from paddlevideo.utils import get_logger + +logger = get_logger("paddlevideo") + + +class ANETproposal(object): + """ + This class is used for calculating AR@N and AUC; + Code transfer from ActivityNet Gitub repository](https://github.com/activitynet/ActivityNet.git) + """ + GROUND_TRUTH_FIELDS = ['database', 'taxonomy', 'version'] + PROPOSAL_FIELDS = ['results', 'version', 'external_data'] + API = 'http://ec2-52-25-205-214.us-west-2.compute.amazonaws.com/challenge19/api.py' + + def __init__(self, + ground_truth_filename=None, + proposal_filename=None, + ground_truth_fields=GROUND_TRUTH_FIELDS, + proposal_fields=PROPOSAL_FIELDS, + tiou_thresholds=np.linspace(0.5, 0.95, 10), + max_avg_nr_proposals=None, + subset='validation', + verbose=False, + check_status=True): + if not ground_truth_filename: + raise IOError('Please input a valid ground truth file.') + if not proposal_filename: + raise IOError('Please input a valid proposal file.') + self.subset = subset + self.tiou_thresholds = tiou_thresholds + self.max_avg_nr_proposals = max_avg_nr_proposals + self.verbose = verbose + self.gt_fields = ground_truth_fields + self.pred_fields = proposal_fields + self.recall = None + self.avg_recall = None + self.proposals_per_video = None + self.check_status = check_status + # Retrieve blocked videos from server. 
+ if self.check_status: + self.blocked_videos = self.get_blocked_videos() + else: + self.blocked_videos = list() + # Import ground truth and proposals. + self.ground_truth, self.activity_index = self._import_ground_truth( + ground_truth_filename) + self.proposal = self._import_proposal(proposal_filename) + + if self.verbose: + print('[INIT] Loaded annotations from {} subset.'.format(subset)) + nr_gt = len(self.ground_truth) + print('\tNumber of ground truth instances: {}'.format(nr_gt)) + nr_pred = len(self.proposal) + print('\tNumber of proposals: {}'.format(nr_pred)) + print('\tFixed threshold for tiou score: {}'.format( + self.tiou_thresholds)) + + def _import_ground_truth(self, ground_truth_filename): + """ + Reads ground truth file, checks if it is well formatted, and returns + the ground truth instances and the activity classes. + + Parameters: + ground_truth_filename (str): full path to the ground truth json file. + Returns: + ground_truth (df): Data frame containing the ground truth instances. + activity_index (dict): Dictionary containing class index. + """ + with open(ground_truth_filename, 'r') as fobj: + data = json.load(fobj) + # Checking format + if not all([field in data.keys() for field in self.gt_fields]): + raise IOError('Please input a valid ground truth file.') + + # Read ground truth data. + activity_index, cidx = {}, 0 + video_lst, t_start_lst, t_end_lst, label_lst = [], [], [], [] + for videoid, v in data['database'].items(): + if self.subset != v['subset']: + continue + if videoid in self.blocked_videos: + continue + for ann in v['annotations']: + if ann['label'] not in activity_index: + activity_index[ann['label']] = cidx + cidx += 1 + video_lst.append(videoid) + t_start_lst.append(float(ann['segment'][0])) + t_end_lst.append(float(ann['segment'][1])) + label_lst.append(activity_index[ann['label']]) + + ground_truth = pd.DataFrame({ + 'video-id': video_lst, + 't-start': t_start_lst, + 't-end': t_end_lst, + 'label': label_lst + }) + return ground_truth, activity_index + + def _import_proposal(self, proposal_filename): + """ + Reads proposal file, checks if it is well formatted, and returns + the proposal instances. + + Parameters: + proposal_filename (str): Full path to the proposal json file. + Returns: + proposal (df): Data frame containing the proposal instances. + """ + with open(proposal_filename, 'r') as fobj: + data = json.load(fobj) + # Checking format... + if not all([field in data.keys() for field in self.pred_fields]): + raise IOError('Please input a valid proposal file.') + + # Read predictions. + video_lst, t_start_lst, t_end_lst = [], [], [] + score_lst = [] + for videoid, v in data['results'].items(): + if videoid in self.blocked_videos: + continue + for result in v: + video_lst.append(videoid) + t_start_lst.append(float(result['segment'][0])) + t_end_lst.append(float(result['segment'][1])) + score_lst.append(result['score']) + proposal = pd.DataFrame({ + 'video-id': video_lst, + 't-start': t_start_lst, + 't-end': t_end_lst, + 'score': score_lst + }) + return proposal + + def evaluate(self): + """ + Evaluates a proposal file. To measure the performance of a + method for the proposal task, we computes the area under the + average recall vs average number of proposals per video curve. 
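+
+        Example (an illustrative sketch; the file names are hypothetical)::
+
+            anet = ANETproposal('gt.json', 'proposals.json',
+                                max_avg_nr_proposals=100,
+                                subset='validation', check_status=False)
+            anet.evaluate()
+            recall, avg_recall = anet.recall, anet.avg_recall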
+ """ + recall, avg_recall, proposals_per_video = self.average_recall_vs_avg_nr_proposals( + self.ground_truth, + self.proposal, + max_avg_nr_proposals=self.max_avg_nr_proposals, + tiou_thresholds=self.tiou_thresholds) + + area_under_curve = np.trapz(avg_recall, proposals_per_video) + + if self.verbose: + print('[RESULTS] Performance on ActivityNet proposal task.') + with open("data/bmn/BMN_Test_results/auc_result.txt", + "a") as text_file: + text_file.write( + '\tArea Under the AR vs AN curve: {}% \n'.format( + 100. * float(area_under_curve) / + proposals_per_video[-1])) + print('\tArea Under the AR vs AN curve: {}%'.format( + 100. * float(area_under_curve) / proposals_per_video[-1])) + + self.recall = recall + self.avg_recall = avg_recall + self.proposals_per_video = proposals_per_video + + def average_recall_vs_avg_nr_proposals(self, + ground_truth, + proposals, + max_avg_nr_proposals=None, + tiou_thresholds=np.linspace( + 0.5, 0.95, 10)): + """ + Computes the average recall given an average number of + proposals per video. + + Parameters: + ground_truth(df): Data frame containing the ground truth instances. + Required fields: ['video-id', 't-start', 't-end'] + proposal(df): Data frame containing the proposal instances. + Required fields: ['video-id, 't-start', 't-end', 'score'] + tiou_thresholds(1d-array | optional): array with tiou thresholds. + + Returns: + recall(2d-array): recall[i,j] is recall at ith tiou threshold at the jth + average number of average number of proposals per video. + average_recall(1d-array): recall averaged over a list of tiou threshold. + This is equivalent to recall.mean(axis=0). + proposals_per_video(1d-array): average number of proposals per video. + """ + + # Get list of videos. + video_lst = ground_truth['video-id'].unique() + + if not max_avg_nr_proposals: + max_avg_nr_proposals = float( + proposals.shape[0]) / video_lst.shape[0] + + ratio = max_avg_nr_proposals * float( + video_lst.shape[0]) / proposals.shape[0] + + # Adaptation to query faster + ground_truth_gbvn = ground_truth.groupby('video-id') + proposals_gbvn = proposals.groupby('video-id') + + # For each video, computes tiou scores among the retrieved proposals. + score_lst = [] + total_nr_proposals = 0 + for videoid in video_lst: + # Get ground-truth instances associated to this video. + ground_truth_videoid = ground_truth_gbvn.get_group(videoid) + this_video_ground_truth = ground_truth_videoid.loc[:, [ + 't-start', 't-end' + ]].values + + # Get proposals for this video. + try: + proposals_videoid = proposals_gbvn.get_group(videoid) + except: + n = this_video_ground_truth.shape[0] + score_lst.append(np.zeros((n, 1))) + continue + + this_video_proposals = proposals_videoid.loc[:, + ['t-start', 't-end' + ]].values + + if this_video_proposals.shape[0] == 0: + n = this_video_ground_truth.shape[0] + score_lst.append(np.zeros((n, 1))) + continue + + # Sort proposals by score. + sort_idx = proposals_videoid['score'].argsort()[::-1] + this_video_proposals = this_video_proposals[sort_idx, :] + + if this_video_proposals.ndim != 2: + this_video_proposals = np.expand_dims(this_video_proposals, + axis=0) + if this_video_ground_truth.ndim != 2: + this_video_ground_truth = np.expand_dims( + this_video_ground_truth, axis=0) + + nr_proposals = np.minimum( + int(this_video_proposals.shape[0] * ratio), + this_video_proposals.shape[0]) + total_nr_proposals += nr_proposals + this_video_proposals = this_video_proposals[:nr_proposals, :] + + # Compute tiou scores. 
+ tiou = self.wrapper_segment_iou(this_video_proposals, + this_video_ground_truth) + score_lst.append(tiou) + + # Given that the length of the videos is really varied, we + # compute the number of proposals in terms of a ratio of the total + # proposals retrieved, i.e. average recall at a percentage of proposals + # retrieved per video. + + # Computes average recall. + pcn_lst = np.arange(1, 101) / 100.0 * (max_avg_nr_proposals * float( + video_lst.shape[0]) / total_nr_proposals) + matches = np.empty((video_lst.shape[0], pcn_lst.shape[0])) + positives = np.empty(video_lst.shape[0]) + recall = np.empty((tiou_thresholds.shape[0], pcn_lst.shape[0])) + # Iterates over each tiou threshold. + for ridx, tiou in enumerate(tiou_thresholds): + + # Inspect positives retrieved per video at different + # number of proposals (percentage of the total retrieved). + for i, score in enumerate(score_lst): + # Total positives per video. + positives[i] = score.shape[0] + # Find proposals that satisfies minimum tiou threshold. + true_positives_tiou = score >= tiou + # Get number of proposals as a percentage of total retrieved. + pcn_proposals = np.minimum( + (score.shape[1] * pcn_lst).astype(int), score.shape[1]) + + for j, nr_proposals in enumerate(pcn_proposals): + # Compute the number of matches for each percentage of the proposals + matches[i, j] = np.count_nonzero( + (true_positives_tiou[:, :nr_proposals]).sum(axis=1)) + + # Computes recall given the set of matches per video. + recall[ridx, :] = matches.sum(axis=0) / positives.sum() + + # Recall is averaged. + avg_recall = recall.mean(axis=0) + + # Get the average number of proposals per video. + proposals_per_video = pcn_lst * (float(total_nr_proposals) / + video_lst.shape[0]) + + return recall, avg_recall, proposals_per_video + + def get_blocked_videos(self, api=API): + api_url = '{}?action=get_blocked'.format(api) + req = urllib2.Request(api_url) + response = urllib2.urlopen(req) + return json.loads(response.read()) + + def wrapper_segment_iou(self, target_segments, candidate_segments): + """ + Compute intersection over union btw segments + Parameters: + target_segments(nd-array): 2-dim array in format [m x 2:=[init, end]] + candidate_segments(nd-array): 2-dim array in format [n x 2:=[init, end]] + Returns: + tiou(nd-array): 2-dim array [n x m] with IOU ratio. + Note: It assumes that candidate-segments are more scarce that target-segments + """ + if candidate_segments.ndim != 2 or target_segments.ndim != 2: + raise ValueError('Dimension of arguments is incorrect') + + n, m = candidate_segments.shape[0], target_segments.shape[0] + tiou = np.empty((n, m)) + for i in range(m): + tiou[:, i] = self.segment_iou(target_segments[i, :], + candidate_segments) + + return tiou + + def segment_iou(self, target_segment, candidate_segments): + """ + Compute the temporal intersection over union between a + target segment and all the test segments. + + Parameters: + target_segment(1d-array): Temporal target segment containing [starting, ending] times. + candidate_segments(2d-array): Temporal candidate segments containing N x [starting, ending] times. + + Returns: + tiou(1d-array): Temporal intersection over union score of the N's candidate segments. + """ + tt1 = np.maximum(target_segment[0], candidate_segments[:, 0]) + tt2 = np.minimum(target_segment[1], candidate_segments[:, 1]) + # Intersection including Non-negative overlap score. + segments_intersection = (tt2 - tt1).clip(0) + # Segment union. 
+ segments_union = (candidate_segments[:, 1] - candidate_segments[:, 0]) \ + + (target_segment[1] - target_segment[0]) - segments_intersection + # Compute overlap as the ratio of the intersection + # over union of two segments. + tIoU = segments_intersection.astype(float) / segments_union + return tIoU diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/__init__.py b/Bank_second_part/detect_process/paddlevideo/metrics/__init__.py new file mode 100644 index 0000000..b693b87 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/metrics/__init__.py @@ -0,0 +1,36 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .bmn_metric import BMNMetric +from .build import build_metric +from .center_crop_metric import CenterCropMetric +from .depth_metric import DepthMetric +from .msrvtt_metric import MSRVTTMetric +from .multi_crop_metric import MultiCropMetric +from .registry import METRIC +from .skeleton_metric import SkeletonMetric +from .transnetv2_metric import TransNetV2Metric +from .youtube8m.eval_util import HitOneMetric +from .segmentation_metric import SegmentationMetric +from .ava_metric import AVAMetric +from .vos_metric import VOSMetric +from .center_crop_metric_MRI import CenterCropMetric_MRI +from .yowo_metric import YOWOMetric + +__all__ = [ + 'METRIC', 'build_metric', 'MultiCropMetric', 'BMNMetric', + 'CenterCropMetric', 'SkeletonMetric', 'HitOneMetric', 'TransNetV2Metric', + 'DepthMetric', 'MSRVTTMetric', 'VOSMetric', 'CenterCropMetric_MRI','AVAMetric', + 'SegmentationMetric', 'YOWOMetric' +] diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/__init__.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..9629b72 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/__init__.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/ava_metric.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/ava_metric.cpython-310.pyc new file mode 100644 index 0000000..48d8818 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/ava_metric.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/ava_utils.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/ava_utils.cpython-310.pyc new file mode 100644 index 0000000..2d80209 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/ava_utils.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/base.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/base.cpython-310.pyc new file mode 100644 index 0000000..0e4c617 Binary files /dev/null and 
b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/base.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/bmn_metric.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/bmn_metric.cpython-310.pyc new file mode 100644 index 0000000..9ea8d55 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/bmn_metric.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/build.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/build.cpython-310.pyc new file mode 100644 index 0000000..a53f868 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/build.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/center_crop_metric.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/center_crop_metric.cpython-310.pyc new file mode 100644 index 0000000..bfc8299 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/center_crop_metric.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/center_crop_metric_MRI.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/center_crop_metric_MRI.cpython-310.pyc new file mode 100644 index 0000000..8b1974f Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/center_crop_metric_MRI.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/depth_metric.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/depth_metric.cpython-310.pyc new file mode 100644 index 0000000..cb68274 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/depth_metric.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/msrvtt_metric.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/msrvtt_metric.cpython-310.pyc new file mode 100644 index 0000000..c1574d9 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/msrvtt_metric.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/multi_crop_metric.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/multi_crop_metric.cpython-310.pyc new file mode 100644 index 0000000..66d2ac2 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/multi_crop_metric.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/recall.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/recall.cpython-310.pyc new file mode 100644 index 0000000..e228e1d Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/recall.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/registry.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/registry.cpython-310.pyc new file mode 100644 index 0000000..6563d14 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/registry.cpython-310.pyc differ diff --git 
a/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/segmentation_metric.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/segmentation_metric.cpython-310.pyc new file mode 100644 index 0000000..f2b2e3f Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/segmentation_metric.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/skeleton_metric.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/skeleton_metric.cpython-310.pyc new file mode 100644 index 0000000..10e12cf Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/skeleton_metric.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/transnetv2_metric.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/transnetv2_metric.cpython-310.pyc new file mode 100644 index 0000000..562d0fe Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/transnetv2_metric.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/ucf24_utils.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/ucf24_utils.cpython-310.pyc new file mode 100644 index 0000000..3597d80 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/ucf24_utils.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/vos_metric.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/vos_metric.cpython-310.pyc new file mode 100644 index 0000000..004dc17 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/vos_metric.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/yowo_metric.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/yowo_metric.cpython-310.pyc new file mode 100644 index 0000000..3815b7d Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/__pycache__/yowo_metric.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/README.md b/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/README.md new file mode 100644 index 0000000..7414d0f --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/README.md @@ -0,0 +1,2 @@ +The code under this folder is from the official [ActivityNet repo](https://github.com/activitynet/ActivityNet). +Some unused codes are removed to minimize the length of codes added. 
diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/__init__.py b/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/__pycache__/__init__.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..34f17ef Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/__pycache__/__init__.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/__pycache__/metrics.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/__pycache__/metrics.cpython-310.pyc new file mode 100644 index 0000000..39f6227 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/__pycache__/metrics.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/__pycache__/np_box_list.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/__pycache__/np_box_list.cpython-310.pyc new file mode 100644 index 0000000..24a0843 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/__pycache__/np_box_list.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/__pycache__/np_box_ops.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/__pycache__/np_box_ops.cpython-310.pyc new file mode 100644 index 0000000..7df2341 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/__pycache__/np_box_ops.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/__pycache__/object_detection_evaluation.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/__pycache__/object_detection_evaluation.cpython-310.pyc new file mode 100644 index 0000000..40abd9e Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/__pycache__/object_detection_evaluation.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/__pycache__/per_image_evaluation.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/__pycache__/per_image_evaluation.cpython-310.pyc new file mode 100644 index 0000000..a74a9bb Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/__pycache__/per_image_evaluation.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/__pycache__/standard_fields.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/__pycache__/standard_fields.cpython-310.pyc new file mode 100644 index 0000000..c0609b8 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/__pycache__/standard_fields.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/metrics.py b/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/metrics.py new file mode 100644 index 0000000..13eb034 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/metrics.py @@ -0,0 +1,143 @@ +# copyright (c) 2021 
PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+"""Functions for computing metrics like precision, recall, CorLoc and so on."""
+
+import numpy as np
+
+
+def compute_precision_recall(scores, labels, num_gt):
+    """Compute precision and recall.
+
+    Args:
+        scores: A float numpy array representing detection score
+        labels: A boolean numpy array representing true/false positive labels
+        num_gt: Number of ground truth instances
+
+    Raises:
+        ValueError: if the input is not of the correct format
+
+    Returns:
+        precision: Fraction of positive instances over detected ones. This
+            value is None if no ground truth labels are present.
+        recall: Fraction of detected positive instances over all positive
+            instances. This value is None if no ground truth labels are
+            present.
+    """
+    if (not isinstance(labels, np.ndarray) or labels.dtype != bool
+            or len(labels.shape) != 1):
+        raise ValueError('labels must be single dimension bool numpy array')
+
+    if not isinstance(scores, np.ndarray) or len(scores.shape) != 1:
+        raise ValueError('scores must be single dimension numpy array')
+
+    if num_gt < np.sum(labels):
+        raise ValueError(
+            'Number of true positives must not exceed num_gt.')
+
+    if len(scores) != len(labels):
+        raise ValueError('scores and labels must be of the same size.')
+
+    if num_gt == 0:
+        return None, None
+
+    sorted_indices = np.argsort(scores)
+    sorted_indices = sorted_indices[::-1]
+    labels = labels.astype(int)
+    true_positive_labels = labels[sorted_indices]
+    false_positive_labels = 1 - true_positive_labels
+    cum_true_positives = np.cumsum(true_positive_labels)
+    cum_false_positives = np.cumsum(false_positive_labels)
+    precision = cum_true_positives.astype(float) / (
+        cum_true_positives + cum_false_positives)
+    recall = cum_true_positives.astype(float) / num_gt
+    return precision, recall
+
+
+def compute_average_precision(precision, recall):
+    """Compute Average Precision according to the definition in VOCdevkit.
+
+    Precision is modified to ensure that it does not decrease as recall
+    decreases.
+
+    Args:
+        precision: A float [N, 1] numpy array of precisions
+        recall: A float [N, 1] numpy array of recalls
+
+    Raises:
+        ValueError: if the input is not of the correct format
+
+    Returns:
+        average_precision: The area under the precision recall curve. NaN if
+            precision and recall are None.
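+
+    Example (an illustrative sketch of the interpolated-AP computation; the
+    values are made up)::
+
+        precision = np.array([1.0, 0.5], dtype=float)
+        recall = np.array([0.5, 1.0], dtype=float)
+        # interpolated AP = 0.5 * 1.0 + 0.5 * 0.5 = 0.75
+        compute_average_precision(precision, recall)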
+ """ + if precision is None: + if recall is not None: + raise ValueError('If precision is None, recall must also be None') + return np.NAN + + if not isinstance(precision, np.ndarray) or not isinstance( + recall, np.ndarray): + raise ValueError('precision and recall must be numpy array') + if precision.dtype != np.float or recall.dtype != np.float: + raise ValueError('input must be float numpy array.') + if len(precision) != len(recall): + raise ValueError('precision and recall must be of the same size.') + if not precision.size: + return 0.0 + if np.amin(precision) < 0 or np.amax(precision) > 1: + raise ValueError('Precision must be in the range of [0, 1].') + if np.amin(recall) < 0 or np.amax(recall) > 1: + raise ValueError('recall must be in the range of [0, 1].') + if not all(recall[i] <= recall[i + 1] for i in range(len(recall) - 1)): + raise ValueError('recall must be a non-decreasing array') + + recall = np.concatenate([[0], recall, [1]]) + precision = np.concatenate([[0], precision, [0]]) + + # Preprocess precision to be a non-decreasing array + for i in range(len(precision) - 2, -1, -1): + precision[i] = np.maximum(precision[i], precision[i + 1]) + + indices = np.where(recall[1:] != recall[:-1])[0] + 1 + average_precision = np.sum( + (recall[indices] - recall[indices - 1]) * precision[indices]) + return average_precision + + +def compute_cor_loc(num_gt_imgs_per_class, + num_images_correctly_detected_per_class): + """Compute CorLoc according to the definition in the following paper. + + https://www.robots.ox.ac.uk/~vgg/rg/papers/deselaers-eccv10.pdf + + Returns nans if there are no ground truth images for a class. + + Args: + num_gt_imgs_per_class: 1D array, representing number of images + containing at least one object instance of a particular class + num_images_correctly_detected_per_class: 1D array, representing number + of images that are correctly detected at least one object instance + of a particular class + + Returns: + corloc_per_class: A float numpy array represents the corloc score of + each class + """ + # Divide by zero expected for classes with no gt examples. + with np.errstate(divide='ignore', invalid='ignore'): + return np.where( + num_gt_imgs_per_class == 0, np.nan, + num_images_correctly_detected_per_class / num_gt_imgs_per_class) diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/np_box_list.py b/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/np_box_list.py new file mode 100644 index 0000000..f9b101e --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/np_box_list.py @@ -0,0 +1,138 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= +"""Numpy BoxList classes and functions.""" + +import numpy as np + + +class BoxList: + """Box collection. 
+ + BoxList represents a list of bounding boxes as numpy array, where each + bounding box is represented as a row of 4 numbers, + [y_min, x_min, y_max, x_max]. It is assumed that all bounding boxes within + a given list correspond to a single image. + + Optionally, users can add additional related fields (such as + objectness/classification scores). + """ + + def __init__(self, data): + """Constructs box collection. + + Args: + data: a numpy array of shape [N, 4] representing box coordinates + + Raises: + ValueError: if bbox data is not a numpy array + ValueError: if invalid dimensions for bbox data + """ + if not isinstance(data, np.ndarray): + raise ValueError('data must be a numpy array.') + if len(data.shape) != 2 or data.shape[1] != 4: + raise ValueError('Invalid dimensions for box data.') + if data.dtype != np.float32 and data.dtype != np.float64: + raise ValueError( + 'Invalid data type for box data: float is required.') + if not self._is_valid_boxes(data): + raise ValueError('Invalid box data. data must be a numpy array of ' + 'N*[y_min, x_min, y_max, x_max]') + self.data = {'boxes': data} + + def num_boxes(self): + """Return number of boxes held in collections.""" + return self.data['boxes'].shape[0] + + def get_extra_fields(self): + """Return all non-box fields.""" + return [k for k in self.data if k != 'boxes'] + + def has_field(self, field): + return field in self.data + + def add_field(self, field, field_data): + """Add data to a specified field. + + Args: + field: a string parameter used to speficy a related field to be + accessed. + field_data: a numpy array of [N, ...] representing the data + associated with the field. + Raises: + ValueError: if the field is already exist or the dimension of the + field data does not matches the number of boxes. + """ + if self.has_field(field): + raise ValueError('Field ' + field + 'already exists') + if len(field_data.shape) < 1 or field_data.shape[0] != self.num_boxes( + ): + raise ValueError('Invalid dimensions for field data') + self.data[field] = field_data + + def get(self): + """Convenience function for accesssing box coordinates. + + Returns: + a numpy array of shape [N, 4] representing box corners + """ + return self.get_field('boxes') + + def get_field(self, field): + """Accesses data associated with the specified field in the box + collection. + + Args: + field: a string parameter used to speficy a related field to be + accessed. + + Returns: + a numpy 1-d array representing data of an associated field + + Raises: + ValueError: if invalid field + """ + if not self.has_field(field): + raise ValueError(f'field {field} does not exist') + return self.data[field] + + def get_coordinates(self): + """Get corner coordinates of boxes. + + Returns: + a list of 4 1-d numpy arrays [y_min, x_min, y_max, x_max] + """ + box_coordinates = self.get() + y_min = box_coordinates[:, 0] + x_min = box_coordinates[:, 1] + y_max = box_coordinates[:, 2] + x_max = box_coordinates[:, 3] + return [y_min, x_min, y_max, x_max] + + def _is_valid_boxes(self, data): + """Check whether data fullfills the format of N*[ymin, xmin, ymax, + xmin]. + + Args: + data: a numpy array of shape [N, 4] representing box coordinates + + Returns: + a boolean indicating whether all ymax of boxes are equal or greater + than ymin, and all xmax of boxes are equal or greater than xmin. 
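+
+        For example (illustrative), ``[[0.1, 0.2, 0.4, 0.5]]`` is valid,
+        while ``[[0.5, 0.2, 0.4, 0.5]]`` is not, since its ymin exceeds its
+        ymax.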
+ """ + if len(data): + for v in data: + if v[0] > v[2] or v[1] > v[3]: + return False + return True diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/np_box_ops.py b/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/np_box_ops.py new file mode 100644 index 0000000..94e7d30 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/np_box_ops.py @@ -0,0 +1,98 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Operations for [N, 4] numpy arrays representing bounding boxes. + +Example box operations that are supported: + * Areas: compute bounding box areas + * IOU: pairwise intersection-over-union scores +""" + +import numpy as np + + +def area(boxes): + """Computes area of boxes. + + Args: + boxes: Numpy array with shape [N, 4] holding N boxes + + Returns: + a numpy array with shape [N*1] representing box areas + """ + return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) + + +def intersection(boxes1, boxes2): + """Compute pairwise intersection areas between boxes. + + Args: + boxes1: a numpy array with shape [N, 4] holding N boxes + boxes2: a numpy array with shape [M, 4] holding M boxes + + Returns: + a numpy array with shape [N*M] representing pairwise intersection area + """ + [y_min1, x_min1, y_max1, x_max1] = np.split(boxes1, 4, axis=1) + [y_min2, x_min2, y_max2, x_max2] = np.split(boxes2, 4, axis=1) + + all_pairs_min_ymax = np.minimum(y_max1, np.transpose(y_max2)) + all_pairs_max_ymin = np.maximum(y_min1, np.transpose(y_min2)) + intersect_heights = np.maximum( + np.zeros(all_pairs_max_ymin.shape), + all_pairs_min_ymax - all_pairs_max_ymin) + all_pairs_min_xmax = np.minimum(x_max1, np.transpose(x_max2)) + all_pairs_max_xmin = np.maximum(x_min1, np.transpose(x_min2)) + intersect_widths = np.maximum( + np.zeros(all_pairs_max_xmin.shape), + all_pairs_min_xmax - all_pairs_max_xmin) + return intersect_heights * intersect_widths + + +def iou(boxes1, boxes2): + """Computes pairwise intersection-over-union between box collections. + + Args: + boxes1: a numpy array with shape [N, 4] holding N boxes. + boxes2: a numpy array with shape [M, 4] holding N boxes. + + Returns: + a numpy array with shape [N, M] representing pairwise iou scores. + """ + intersect = intersection(boxes1, boxes2) + area1 = area(boxes1) + area2 = area(boxes2) + union = ( + np.expand_dims(area1, axis=1) + np.expand_dims(area2, axis=0) - + intersect) + return intersect / union + + +def ioa(boxes1, boxes2): + """Computes pairwise intersection-over-area between box collections. + + Intersection-over-area (ioa) between two boxes box1 and box2 is defined as + their intersection area over box2's area. Note that ioa is not symmetric, + that is, IOA(box1, box2) != IOA(box2, box1). + + Args: + boxes1: a numpy array with shape [N, 4] holding N boxes. 
+ boxes2: a numpy array with shape [M, 4] holding N boxes. + + Returns: + a numpy array with shape [N, M] representing pairwise ioa scores. + """ + intersect = intersection(boxes1, boxes2) + areas = np.expand_dims(area(boxes2), axis=0) + return intersect / areas diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py b/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py new file mode 100644 index 0000000..c9f0054 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/object_detection_evaluation.py @@ -0,0 +1,658 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= +"""object_detection_evaluation module. + +ObjectDetectionEvaluation is a class which manages ground truth information of +a object detection dataset, and computes frequently used detection metrics such +as Precision, Recall, CorLoc of the provided detection results. +It supports the following operations: +1) Add ground truth information of images sequentially. +2) Add detection result of images sequentially. +3) Evaluate detection metrics on already inserted detection results. +4) Write evaluation result into a pickle file for future processing or + visualization. + +Note: This module operates on numpy boxes and box lists. +""" +import collections +import logging +from abc import ABCMeta, abstractmethod + +import numpy as np + +from . import metrics, per_image_evaluation, standard_fields + + +class DetectionEvaluator: + """Interface for object detection evalution classes. + + Example usage of the Evaluator: + ------------------------------ + evaluator = DetectionEvaluator(categories) + + # Detections and groundtruth for image 1. + evaluator.add_single_groundtruth_image_info(...) + evaluator.add_single_detected_image_info(...) + + # Detections and groundtruth for image 2. + evaluator.add_single_groundtruth_image_info(...) + evaluator.add_single_detected_image_info(...) + + metrics_dict = evaluator.evaluate() + """ + + __metaclass__ = ABCMeta + + def __init__(self, categories): + """Constructor. + + Args: + categories: A list of dicts, each of which has the following keys - + 'id': (required) an integer id uniquely identifying this + category. + 'name': (required) string representing category name e.g., + 'cat', 'dog'. + """ + self._categories = categories + + @abstractmethod + def add_single_ground_truth_image_info(self, image_id, groundtruth_dict): + """Adds groundtruth for a single image to be used for evaluation. + + Args: + image_id: A unique string/integer identifier for the image. + groundtruth_dict: A dictionary of groundtruth numpy arrays required + for evaluations. + """ + + @abstractmethod + def add_single_detected_image_info(self, image_id, detections_dict): + """Adds detections for a single image to be used for evaluation. 
+ + Args: + image_id: A unique string/integer identifier for the image. + detections_dict: A dictionary of detection numpy arrays required + for evaluation. + """ + + @abstractmethod + def evaluate(self): + """Evaluates detections and returns a dictionary of metrics.""" + + @abstractmethod + def clear(self): + """Clears the state to prepare for a fresh evaluation.""" + + +class ObjectDetectionEvaluator(DetectionEvaluator): + """A class to evaluate detections.""" + + def __init__( + self, + categories, + matching_iou_threshold=0.5, + evaluate_corlocs=False, + metric_prefix=None, + use_weighted_mean_ap=False, + evaluate_masks=False, + ): + """Constructor. + + Args: + categories: A list of dicts, each of which has the following keys - + 'id': (required) an integer id uniquely identifying this + category. + 'name': (required) string representing category name e.g., + 'cat', 'dog'. + matching_iou_threshold: IOU threshold to use for matching + groundtruth boxes to detection boxes. + evaluate_corlocs: (optional) boolean which determines if corloc + scores are to be returned or not. + metric_prefix: (optional) string prefix for metric name; if None, + no prefix is used. + use_weighted_mean_ap: (optional) boolean which determines if the + mean average precision is computed directly from the scores and + tp_fp_labels of all classes. + evaluate_masks: If False, evaluation will be performed based on + boxes. If True, mask evaluation will be performed instead. + + Raises: + ValueError: If the category ids are not 1-indexed. + """ + super(ObjectDetectionEvaluator, self).__init__(categories) + self._num_classes = max([cat['id'] for cat in categories]) + if min(cat['id'] for cat in categories) < 1: + raise ValueError('Classes should be 1-indexed.') + self._matching_iou_threshold = matching_iou_threshold + self._use_weighted_mean_ap = use_weighted_mean_ap + self._label_id_offset = 1 + self._evaluate_masks = evaluate_masks + self._evaluation = ObjectDetectionEvaluation( + num_groundtruth_classes=self._num_classes, + matching_iou_threshold=self._matching_iou_threshold, + use_weighted_mean_ap=self._use_weighted_mean_ap, + label_id_offset=self._label_id_offset, + ) + self._image_ids = set([]) + self._evaluate_corlocs = evaluate_corlocs + self._metric_prefix = (metric_prefix + '_') if metric_prefix else '' + + def add_single_ground_truth_image_info(self, image_id, groundtruth_dict): + """Adds groundtruth for a single image to be used for evaluation. + + Args: + image_id: A unique string/integer identifier for the image. + groundtruth_dict: A dictionary containing - + standard_fields.InputDataFields.groundtruth_boxes: float32 + numpy array of shape [num_boxes, 4] containing `num_boxes` + groundtruth boxes of the format [ymin, xmin, ymax, xmax] in + absolute image coordinates. + standard_fields.InputDataFields.groundtruth_classes: integer + numpy array of shape [num_boxes] containing 1-indexed + groundtruth classes for the boxes. + standard_fields.InputDataFields.groundtruth_difficult: Optional + length M numpy boolean array denoting whether a ground + truth box is a difficult instance or not. This field is + optional to support the case that no boxes are difficult. + standard_fields.InputDataFields.groundtruth_instance_masks: + Optional numpy array of shape [num_boxes, height, width] + with values in {0, 1}. + + Raises: + ValueError: On adding groundtruth for an image more than once. Will + also raise error if instance masks are not in groundtruth + dictionary. 
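+
+        Example (hypothetical values, shown only to illustrate the expected
+        dictionary layout; "evaluator" stands for any instance of this class
+        and is an assumption, not code from the original file):
+            >>> evaluator.add_single_ground_truth_image_info(
+            ...     image_id='img_0',
+            ...     groundtruth_dict={
+            ...         standard_fields.InputDataFields.groundtruth_boxes:
+            ...             np.array([[10., 10., 50., 50.]], dtype=np.float32),
+            ...         standard_fields.InputDataFields.groundtruth_classes:
+            ...             np.array([1], dtype=int),
+            ...     })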
+ """ + if image_id in self._image_ids: + raise ValueError( + 'Image with id {} already added.'.format(image_id)) + + groundtruth_classes = ( + groundtruth_dict[ + standard_fields.InputDataFields.groundtruth_classes] - + self._label_id_offset) + # If the key is not present in the groundtruth_dict or the array is + # empty (unless there are no annotations for the groundtruth on this + # image) use values from the dictionary or insert None otherwise. + if (standard_fields.InputDataFields.groundtruth_difficult + in groundtruth_dict.keys()) and (groundtruth_dict[ + standard_fields.InputDataFields.groundtruth_difficult].size + or + not groundtruth_classes.size): + groundtruth_difficult = groundtruth_dict[ + standard_fields.InputDataFields.groundtruth_difficult] + else: + groundtruth_difficult = None + if not len(self._image_ids) % 1000: + logging.warn(('image %s does not have groundtruth difficult ' + 'flag specified'), image_id) + groundtruth_masks = None + if self._evaluate_masks: + if (standard_fields.InputDataFields.groundtruth_instance_masks + not in groundtruth_dict): + raise ValueError( + 'Instance masks not in groundtruth dictionary.') + groundtruth_masks = groundtruth_dict[ + standard_fields.InputDataFields.groundtruth_instance_masks] + self._evaluation.add_single_ground_truth_image_info( + image_key=image_id, + groundtruth_boxes=groundtruth_dict[ + standard_fields.InputDataFields.groundtruth_boxes], + groundtruth_class_labels=groundtruth_classes, + groundtruth_is_difficult_list=groundtruth_difficult, + groundtruth_masks=groundtruth_masks, + ) + self._image_ids.update([image_id]) + + def add_single_detected_image_info(self, image_id, detections_dict): + """Adds detections for a single image to be used for evaluation. + + Args: + image_id: A unique string/integer identifier for the image. + detections_dict: A dictionary containing - + standard_fields.DetectionResultFields.detection_boxes: float32 + numpy array of shape [num_boxes, 4] containing `num_boxes` + detection boxes of the format [ymin, xmin, ymax, xmax] in + absolute image coordinates. + standard_fields.DetectionResultFields.detection_scores: float32 + numpy array of shape [num_boxes] containing detection + scores for the boxes. + standard_fields.DetectionResultFields.detection_classes: + integer numpy array of shape [num_boxes] containing + 1-indexed detection classes for the boxes. + standard_fields.DetectionResultFields.detection_masks: uint8 + numpy array of shape [num_boxes, height, width] containing + `num_boxes` masks of values ranging between 0 and 1. + + Raises: + ValueError: If detection masks are not in detections dictionary. + """ + detection_classes = ( + detections_dict[ + standard_fields.DetectionResultFields.detection_classes] - + self._label_id_offset) + detection_masks = None + if self._evaluate_masks: + if (standard_fields.DetectionResultFields.detection_masks + not in detections_dict): + raise ValueError( + 'Detection masks not in detections dictionary.') + detection_masks = detections_dict[ + standard_fields.DetectionResultFields.detection_masks] + self._evaluation.add_single_detected_image_info( + image_key=image_id, + detected_boxes=detections_dict[ + standard_fields.DetectionResultFields.detection_boxes], + detected_scores=detections_dict[ + standard_fields.DetectionResultFields.detection_scores], + detected_class_labels=detection_classes, + detected_masks=detection_masks, + ) + + def create_category_index(self, categories): + """Creates dictionary of COCO compatible categories keyed by category + id. 
+ + Args: + categories: a list of dicts, each of which has the following keys: + 'id': (required) an integer id uniquely identifying this + category. + 'name': (required) string representing category name + e.g., 'cat', 'dog', 'pizza'. + + Returns: + category_index: a dict containing the same entries as categories, + but keyed by the 'id' field of each category. + """ + category_index = {} + for cat in categories: + category_index[cat['id']] = cat + return category_index + + def evaluate(self): + """Compute evaluation result. + + Returns: + A dictionary of metrics with the following fields - + + 1. summary_metrics: + 'Precision/mAP@IOU': mean average + precision at the specified IOU threshold + + 2. per_category_ap: category specific results with keys of the form + 'PerformanceByCategory/mAP@IOU/category' + """ + ( + per_class_ap, + mean_ap, + _, + _, + per_class_corloc, + mean_corloc, + ) = self._evaluation.evaluate() + + metric = f'mAP@{self._matching_iou_threshold}IOU' + pascal_metrics = {self._metric_prefix + metric: mean_ap} + if self._evaluate_corlocs: + pascal_metrics[self._metric_prefix + + 'Precision/meanCorLoc@{}IOU'.format( + self._matching_iou_threshold)] = mean_corloc + category_index = self.create_category_index(self._categories) + for idx in range(per_class_ap.size): + if idx + self._label_id_offset in category_index: + display_name = ( + self._metric_prefix + + 'PerformanceByCategory/AP@{}IOU/{}'.format( + self._matching_iou_threshold, + category_index[idx + self._label_id_offset]['name'], + )) + pascal_metrics[display_name] = per_class_ap[idx] + + # Optionally add CorLoc metrics.classes + if self._evaluate_corlocs: #False + display_name = ( + self._metric_prefix + + 'PerformanceByCategory/CorLoc@{}IOU/{}'.format( + self._matching_iou_threshold, + category_index[idx + + self._label_id_offset]['name'], + )) + pascal_metrics[display_name] = per_class_corloc[idx] + + return pascal_metrics + + def clear(self): + """Clears the state to prepare for a fresh evaluation.""" + self._evaluation = ObjectDetectionEvaluation( + num_groundtruth_classes=self._num_classes, + matching_iou_threshold=self._matching_iou_threshold, + use_weighted_mean_ap=self._use_weighted_mean_ap, + label_id_offset=self._label_id_offset, + ) + self._image_ids.clear() + + +class PascalDetectionEvaluator(ObjectDetectionEvaluator): + """A class to evaluate detections using PASCAL metrics.""" + + def __init__(self, categories, matching_iou_threshold=0.5): + super(PascalDetectionEvaluator, self).__init__( + categories, + matching_iou_threshold=matching_iou_threshold, + evaluate_corlocs=False, + use_weighted_mean_ap=False, + ) + + +ObjectDetectionEvalMetrics = collections.namedtuple( + 'ObjectDetectionEvalMetrics', + [ + 'average_precisions', + 'mean_ap', + 'precisions', + 'recalls', + 'corlocs', + 'mean_corloc', + ], +) + + +class ObjectDetectionEvaluation: + """Internal implementation of Pascal object detection metrics.""" + + def __init__( + self, + num_groundtruth_classes, + matching_iou_threshold=0.5, + nms_iou_threshold=1.0, + nms_max_output_boxes=10000, + use_weighted_mean_ap=False, + label_id_offset=0, + ): + if num_groundtruth_classes < 1: + raise ValueError( + 'Need at least 1 groundtruth class for evaluation.') + + self.per_image_eval = per_image_evaluation.PerImageEvaluation( + num_groundtruth_classes=num_groundtruth_classes, + matching_iou_threshold=matching_iou_threshold, + ) + self.num_class = num_groundtruth_classes + self.use_weighted_mean_ap = use_weighted_mean_ap + self.label_id_offset = 
label_id_offset + + self.groundtruth_boxes = {} + self.groundtruth_class_labels = {} + self.groundtruth_masks = {} + self.groundtruth_is_difficult_list = {} + self.groundtruth_is_group_of_list = {} + self.num_gt_instances_per_class = np.zeros(self.num_class, dtype=int) + self.num_gt_imgs_per_class = np.zeros(self.num_class, dtype=int) + + self._initialize_detections() + + def _initialize_detections(self): + self.detection_keys = set() + self.scores_per_class = [[] for _ in range(self.num_class)] + self.tp_fp_labels_per_class = [[] for _ in range(self.num_class)] + self.num_images_correctly_detected_per_class = np.zeros(self.num_class) + self.average_precision_per_class = np.empty( + self.num_class, dtype=float) + self.average_precision_per_class.fill(np.nan) + self.precisions_per_class = [] + self.recalls_per_class = [] + self.corloc_per_class = np.ones(self.num_class, dtype=float) + + def clear_detections(self): + self._initialize_detections() + + def add_single_ground_truth_image_info( + self, + image_key, + groundtruth_boxes, + groundtruth_class_labels, + groundtruth_is_difficult_list=None, + groundtruth_is_group_of_list=None, + groundtruth_masks=None, + ): + """Adds groundtruth for a single image to be used for evaluation. + + Args: + image_key: A unique string/integer identifier for the image. + groundtruth_boxes: float32 numpy array of shape [num_boxes, 4] + containing `num_boxes` groundtruth boxes of the format + [ymin, xmin, ymax, xmax] in absolute image coordinates. + groundtruth_class_labels: integer numpy array of shape [num_boxes] + containing 0-indexed groundtruth classes for the boxes. + groundtruth_is_difficult_list: A length M numpy boolean array + denoting whether a ground truth box is a difficult instance or + not. To support the case that no boxes are difficult, it is by + default set as None. + groundtruth_is_group_of_list: A length M numpy boolean array + denoting whether a ground truth box is a group-of box or not. + To support the case that no boxes are groups-of, it is by + default set as None. + groundtruth_masks: uint8 numpy array of shape + [num_boxes, height, width] containing `num_boxes` groundtruth + masks. The mask values range from 0 to 1. + """ + if image_key in self.groundtruth_boxes: + logging.warn(('image %s has already been added to the ground ' + 'truth database.'), image_key) + return + + self.groundtruth_boxes[image_key] = groundtruth_boxes + self.groundtruth_class_labels[image_key] = groundtruth_class_labels + self.groundtruth_masks[image_key] = groundtruth_masks + if groundtruth_is_difficult_list is None: + num_boxes = groundtruth_boxes.shape[0] + groundtruth_is_difficult_list = np.zeros(num_boxes, dtype=bool) + self.groundtruth_is_difficult_list[ + image_key] = groundtruth_is_difficult_list.astype(dtype=bool) + if groundtruth_is_group_of_list is None: + num_boxes = groundtruth_boxes.shape[0] + groundtruth_is_group_of_list = np.zeros(num_boxes, dtype=bool) + self.groundtruth_is_group_of_list[ + image_key] = groundtruth_is_group_of_list.astype(dtype=bool) + + self._update_ground_truth_statistics( + groundtruth_class_labels, + groundtruth_is_difficult_list.astype(dtype=bool), + groundtruth_is_group_of_list.astype(dtype=bool), + ) + + def add_single_detected_image_info( + self, + image_key, + detected_boxes, + detected_scores, + detected_class_labels, + detected_masks=None, + ): + """Adds detections for a single image to be used for evaluation. + + Args: + image_key: A unique string/integer identifier for the image. 
+ detected_boxes: float32 numpy array of shape [num_boxes, 4] + containing `num_boxes` detection boxes of the format + [ymin, xmin, ymax, xmax] in absolute image coordinates. + detected_scores: float32 numpy array of shape [num_boxes] + containing detection scores for the boxes. + detected_class_labels: integer numpy array of shape [num_boxes] + containing 0-indexed detection classes for the boxes. + detected_masks: np.uint8 numpy array of shape + [num_boxes, height, width] containing `num_boxes` detection + masks with values ranging between 0 and 1. + + Raises: + ValueError: if the number of boxes, scores and class labels differ + in length. + """ + if len(detected_boxes) != len(detected_scores) or len( + detected_boxes) != len(detected_class_labels): + raise ValueError( + 'detected_boxes, detected_scores and ' + 'detected_class_labels should all have same lengths. Got' + '[%d, %d, %d]' % len(detected_boxes), + len(detected_scores), + len(detected_class_labels), + ) + + if image_key in self.detection_keys: + logging.warn(('image %s has already been added to the ground ' + 'truth database.'), image_key) + return + + self.detection_keys.add(image_key) + if image_key in self.groundtruth_boxes: + groundtruth_boxes = self.groundtruth_boxes[image_key] + groundtruth_class_labels = self.groundtruth_class_labels[image_key] + # Masks are popped instead of look up. The reason is that we do not + # want to keep all masks in memory which can cause memory overflow. + groundtruth_masks = self.groundtruth_masks.pop(image_key) + groundtruth_is_difficult_list = self.groundtruth_is_difficult_list[ + image_key] + groundtruth_is_group_of_list = self.groundtruth_is_group_of_list[ + image_key] + else: + groundtruth_boxes = np.empty(shape=[0, 4], dtype=float) + groundtruth_class_labels = np.array([], dtype=int) + if detected_masks is None: + groundtruth_masks = None + else: + groundtruth_masks = np.empty(shape=[0, 1, 1], dtype=float) + groundtruth_is_difficult_list = np.array([], dtype=bool) + groundtruth_is_group_of_list = np.array([], dtype=bool) + ( + scores, + tp_fp_labels, + ) = self.per_image_eval.compute_object_detection_metrics( + detected_boxes=detected_boxes, + detected_scores=detected_scores, + detected_class_labels=detected_class_labels, + groundtruth_boxes=groundtruth_boxes, + groundtruth_class_labels=groundtruth_class_labels, + groundtruth_is_difficult_list=groundtruth_is_difficult_list, + groundtruth_is_group_of_list=groundtruth_is_group_of_list, + detected_masks=detected_masks, + groundtruth_masks=groundtruth_masks, + ) + + for i in range(self.num_class): + if scores[i].shape[0] > 0: + self.scores_per_class[i].append(scores[i]) + self.tp_fp_labels_per_class[i].append(tp_fp_labels[i]) + + def _update_ground_truth_statistics( + self, + groundtruth_class_labels, + groundtruth_is_difficult_list, + groundtruth_is_group_of_list, + ): + """Update grouth truth statitistics. + + 1. Difficult boxes are ignored when counting the number of ground truth + instances as done in Pascal VOC devkit. + 2. Difficult boxes are treated as normal boxes when computing CorLoc + related statitistics. 
+ + Args: + groundtruth_class_labels: An integer numpy array of length M, + representing M class labels of object instances in ground truth + groundtruth_is_difficult_list: A boolean numpy array of length M + denoting whether a ground truth box is a difficult instance or + not + groundtruth_is_group_of_list: A boolean numpy array of length M + denoting whether a ground truth box is a group-of box or not + """ + for class_index in range(self.num_class): + num_gt_instances = np.sum(groundtruth_class_labels[ + ~groundtruth_is_difficult_list + & ~groundtruth_is_group_of_list] == class_index) + self.num_gt_instances_per_class[class_index] += num_gt_instances + if np.any(groundtruth_class_labels == class_index): + self.num_gt_imgs_per_class[class_index] += 1 + + def evaluate(self): + """Compute evaluation result. + + Returns: + A named tuple with the following fields - + average_precision: float numpy array of average precision for + each class. + mean_ap: mean average precision of all classes, float scalar + precisions: List of precisions, each precision is a float numpy + array + recalls: List of recalls, each recall is a float numpy array + corloc: numpy float array + mean_corloc: Mean CorLoc score for each class, float scalar + """ + if (self.num_gt_instances_per_class == 0).any(): + print( + 'The following classes have no ground truth examples: %s', + np.squeeze(np.argwhere(self.num_gt_instances_per_class == 0)) + + self.label_id_offset, "self.detection_keys:",self.detection_keys + ) + + if self.use_weighted_mean_ap: + all_scores = np.array([], dtype=float) + all_tp_fp_labels = np.array([], dtype=bool) + + for class_index in range(self.num_class): + if self.num_gt_instances_per_class[class_index] == 0: + continue + + if not self.scores_per_class[class_index]: + scores = np.array([], dtype=float) + tp_fp_labels = np.array([], dtype=bool) + else: + scores = np.concatenate(self.scores_per_class[class_index]) + tp_fp_labels = np.concatenate( + self.tp_fp_labels_per_class[class_index]) + if self.use_weighted_mean_ap: + all_scores = np.append(all_scores, scores) + all_tp_fp_labels = np.append(all_tp_fp_labels, tp_fp_labels) + precision, recall = metrics.compute_precision_recall( + scores, + tp_fp_labels, + self.num_gt_instances_per_class[class_index], + ) + self.precisions_per_class.append(precision) + self.recalls_per_class.append(recall) + average_precision = metrics.compute_average_precision( + precision, recall) + self.average_precision_per_class[class_index] = average_precision + + self.corloc_per_class = metrics.compute_cor_loc( + self.num_gt_imgs_per_class, + self.num_images_correctly_detected_per_class, + ) + + if self.use_weighted_mean_ap: + num_gt_instances = np.sum(self.num_gt_instances_per_class) + precision, recall = metrics.compute_precision_recall( + all_scores, all_tp_fp_labels, num_gt_instances) + mean_ap = metrics.compute_average_precision(precision, recall) + else: + mean_ap = np.nanmean(self.average_precision_per_class) + mean_corloc = np.nanmean(self.corloc_per_class) + return ObjectDetectionEvalMetrics( + self.average_precision_per_class, + mean_ap, + self.precisions_per_class, + self.recalls_per_class, + self.corloc_per_class, + mean_corloc, + ) diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py b/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py new file mode 100644 index 0000000..3013ae7 --- /dev/null +++ 
b/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/per_image_evaluation.py @@ -0,0 +1,452 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= +"""Evaluate Object Detection result on a single image. + +Annotate each detected result as true positives or false positive according to +a predefined IOU ratio. Non Maximum Supression is used by default. Multi class +detection is supported by default. Based on the settings, per image evaluation +is either performed on boxes or on object masks. +""" + +import numpy as np + +from . import np_box_list, np_box_ops + + +class PerImageEvaluation: + """Evaluate detection result of a single image.""" + + def __init__(self, num_groundtruth_classes, matching_iou_threshold=0.5): + """Initialized PerImageEvaluation by evaluation parameters. + + Args: + num_groundtruth_classes: Number of ground truth object classes + matching_iou_threshold: A ratio of area intersection to union, + which is the threshold to consider whether a detection is true + positive or not + """ + self.matching_iou_threshold = matching_iou_threshold + self.num_groundtruth_classes = num_groundtruth_classes + + def compute_object_detection_metrics( + self, + detected_boxes, + detected_scores, + detected_class_labels, + groundtruth_boxes, + groundtruth_class_labels, + groundtruth_is_difficult_list, + groundtruth_is_group_of_list, + detected_masks=None, + groundtruth_masks=None, + ): + """Evaluates detections as being tp, fp or ignored from a single image. + + The evaluation is done in two stages: + 1. All detections are matched to non group-of boxes; true positives + are determined and detections matched to difficult boxes are + ignored. + 2. Detections that are determined as false positives are matched + against group-of boxes and ignored if matched. + + Args: + detected_boxes: A float numpy array of shape [N, 4], representing N + regions of detected object regions. + Each row is of the format [y_min, x_min, y_max, x_max] + detected_scores: A float numpy array of shape [N, 1], representing + the confidence scores of the detected N object instances. + detected_class_labels: A integer numpy array of shape [N, 1], + repreneting the class labels of the detected N object + instances. + groundtruth_boxes: A float numpy array of shape [M, 4], + representing M regions of object instances in ground truth + groundtruth_class_labels: An integer numpy array of shape [M, 1], + representing M class labels of object instances in ground truth + groundtruth_is_difficult_list: A boolean numpy array of length M + denoting whether a ground truth box is a difficult instance or + not + groundtruth_is_group_of_list: A boolean numpy array of length M + denoting whether a ground truth box has group-of tag + detected_masks: (optional) A uint8 numpy array of shape + [N, height, width]. If not None, the metrics will be computed + based on masks. 
+ groundtruth_masks: (optional) A uint8 numpy array of shape + [M, height, width]. + + Returns: + scores: A list of C float numpy arrays. Each numpy array is of + shape [K, 1], representing K scores detected with object class + label c + tp_fp_labels: A list of C boolean numpy arrays. Each numpy array + is of shape [K, 1], representing K True/False positive label of + object instances detected with class label c + """ + ( + detected_boxes, + detected_scores, + detected_class_labels, + detected_masks, + ) = self._remove_invalid_boxes( + detected_boxes, + detected_scores, + detected_class_labels, + detected_masks, + ) + scores, tp_fp_labels = self._compute_tp_fp( + detected_boxes=detected_boxes, + detected_scores=detected_scores, + detected_class_labels=detected_class_labels, + groundtruth_boxes=groundtruth_boxes, + groundtruth_class_labels=groundtruth_class_labels, + groundtruth_is_difficult_list=groundtruth_is_difficult_list, + groundtruth_is_group_of_list=groundtruth_is_group_of_list, + detected_masks=detected_masks, + groundtruth_masks=groundtruth_masks, + ) + + return scores, tp_fp_labels + + def _compute_tp_fp( + self, + detected_boxes, + detected_scores, + detected_class_labels, + groundtruth_boxes, + groundtruth_class_labels, + groundtruth_is_difficult_list, + groundtruth_is_group_of_list, + detected_masks=None, + groundtruth_masks=None, + ): + """Labels true/false positives of detections of an image across all + classes. + + Args: + detected_boxes: A float numpy array of shape [N, 4], representing N + regions of detected object regions. + Each row is of the format [y_min, x_min, y_max, x_max] + detected_scores: A float numpy array of shape [N, 1], representing + the confidence scores of the detected N object instances. + detected_class_labels: A integer numpy array of shape [N, 1], + repreneting the class labels of the detected N object + instances. + groundtruth_boxes: A float numpy array of shape [M, 4], + representing M regions of object instances in ground truth + groundtruth_class_labels: An integer numpy array of shape [M, 1], + representing M class labels of object instances in ground truth + groundtruth_is_difficult_list: A boolean numpy array of length M + denoting whether a ground truth box is a difficult instance or + not + groundtruth_is_group_of_list: A boolean numpy array of length M + denoting whether a ground truth box has group-of tag + detected_masks: (optional) A np.uint8 numpy array of shape + [N, height, width]. If not None, the scores will be computed + based on masks. + groundtruth_masks: (optional) A np.uint8 numpy array of shape + [M, height, width]. + + Returns: + result_scores: A list of float numpy arrays. Each numpy array is of + shape [K, 1], representing K scores detected with object class + label c + result_tp_fp_labels: A list of boolean numpy array. Each numpy + array is of shape [K, 1], representing K True/False positive + label of object instances detected with class label c + + Raises: + ValueError: If detected masks is not None but groundtruth masks are + None, or the other way around. 
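+
+        Example (illustrative sketch with hypothetical boxes, added for
+        clarity; not taken from the original source):
+            >>> pie = PerImageEvaluation(num_groundtruth_classes=2)
+            >>> scores, tp_fp = pie._compute_tp_fp(
+            ...     detected_boxes=np.array([[0., 0., 10., 10.]]),
+            ...     detected_scores=np.array([0.9]),
+            ...     detected_class_labels=np.array([0]),
+            ...     groundtruth_boxes=np.array([[0., 0., 10., 10.]]),
+            ...     groundtruth_class_labels=np.array([0]),
+            ...     groundtruth_is_difficult_list=np.array([False]),
+            ...     groundtruth_is_group_of_list=np.array([False]))
+            >>> len(scores), len(tp_fp)  # one entry per class
+            (2, 2)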
+ """ + if detected_masks is not None and groundtruth_masks is None: + raise ValueError( + 'Detected masks is available but groundtruth masks is not.') + if detected_masks is None and groundtruth_masks is not None: + raise ValueError( + 'Groundtruth masks is available but detected masks is not.') + + result_scores = [] + result_tp_fp_labels = [] + for i in range(self.num_groundtruth_classes): + groundtruth_is_difficult_list_at_ith_class = ( + groundtruth_is_difficult_list[groundtruth_class_labels == i]) + groundtruth_is_group_of_list_at_ith_class = ( + groundtruth_is_group_of_list[groundtruth_class_labels == i]) + ( + gt_boxes_at_ith_class, + gt_masks_at_ith_class, + detected_boxes_at_ith_class, + detected_scores_at_ith_class, + detected_masks_at_ith_class, + ) = self._get_ith_class_arrays(detected_boxes, detected_scores, + detected_masks, + detected_class_labels, + groundtruth_boxes, + groundtruth_masks, + groundtruth_class_labels, i) + scores, tp_fp_labels = self._compute_tp_fp_for_single_class( + detected_boxes=detected_boxes_at_ith_class, + detected_scores=detected_scores_at_ith_class, + groundtruth_boxes=gt_boxes_at_ith_class, + groundtruth_is_difficult_list=( + groundtruth_is_difficult_list_at_ith_class), + groundtruth_is_group_of_list=( + groundtruth_is_group_of_list_at_ith_class), + detected_masks=detected_masks_at_ith_class, + groundtruth_masks=gt_masks_at_ith_class, + ) + result_scores.append(scores) + result_tp_fp_labels.append(tp_fp_labels) + return result_scores, result_tp_fp_labels + + def _get_overlaps_and_scores_box_mode( + self, + detected_boxes, + detected_scores, + groundtruth_boxes, + groundtruth_is_group_of_list, + ): + """Computes overlaps and scores between detected and groudntruth boxes. + + Args: + detected_boxes: A numpy array of shape [N, 4] representing detected + box coordinates + detected_scores: A 1-d numpy array of length N representing + classification score + groundtruth_boxes: A numpy array of shape [M, 4] representing + ground truth box coordinates + groundtruth_is_group_of_list: A boolean numpy array of length M + denoting whether a ground truth box has group-of tag. If a + groundtruth box is group-of box, every detection matching this + box is ignored. + + Returns: + iou: A float numpy array of size [num_detected_boxes, + num_gt_boxes]. If gt_non_group_of_boxlist.num_boxes() == 0 it + will be None. + ioa: A float numpy array of size [num_detected_boxes, + num_gt_boxes]. If gt_group_of_boxlist.num_boxes() == 0 it will + be None. + scores: The score of the detected boxlist. + num_boxes: Number of non-maximum suppressed detected boxes. + """ + detected_boxlist = np_box_list.BoxList(detected_boxes) + detected_boxlist.add_field('scores', detected_scores) + gt_non_group_of_boxlist = np_box_list.BoxList( + groundtruth_boxes[~groundtruth_is_group_of_list]) + + iou = np_box_ops.iou(detected_boxlist.get(), + gt_non_group_of_boxlist.get()) + scores = detected_boxlist.get_field('scores') + num_boxes = detected_boxlist.num_boxes() + return iou, None, scores, num_boxes + + def _compute_tp_fp_for_single_class( + self, + detected_boxes, + detected_scores, + groundtruth_boxes, + groundtruth_is_difficult_list, + groundtruth_is_group_of_list, + detected_masks=None, + groundtruth_masks=None, + ): + """Labels boxes detected with the same class from the same image as + tp/fp. 
+ + Args: + detected_boxes: A numpy array of shape [N, 4] representing detected + box coordinates + detected_scores: A 1-d numpy array of length N representing + classification score + groundtruth_boxes: A numpy array of shape [M, 4] representing + groundtruth box coordinates + groundtruth_is_difficult_list: A boolean numpy array of length M + denoting whether a ground truth box is a difficult instance or + not. If a groundtruth box is difficult, every detection + matching this box is ignored. + groundtruth_is_group_of_list: A boolean numpy array of length M + denoting whether a ground truth box has group-of tag. If a + groundtruth box is group-of box, every detection matching this + box is ignored. + detected_masks: (optional) A uint8 numpy array of shape + [N, height, width]. If not None, the scores will be computed + based on masks. + groundtruth_masks: (optional) A uint8 numpy array of shape + [M, height, width]. + + Returns: + Two arrays of the same size, containing all boxes that were + evaluated as being true positives or false positives; if a box + matched to a difficult box or to a group-of box, it is ignored. + + scores: A numpy array representing the detection scores. + tp_fp_labels: a boolean numpy array indicating whether a detection + is a true positive. + """ + if detected_boxes.size == 0: + return np.array([], dtype=float), np.array([], dtype=bool) + + ( + iou, + _, + scores, + num_detected_boxes, + ) = self._get_overlaps_and_scores_box_mode( + detected_boxes=detected_boxes, + detected_scores=detected_scores, + groundtruth_boxes=groundtruth_boxes, + groundtruth_is_group_of_list=groundtruth_is_group_of_list, + ) + + if groundtruth_boxes.size == 0: + return scores, np.zeros(num_detected_boxes, dtype=bool) + + tp_fp_labels = np.zeros(num_detected_boxes, dtype=bool) + is_matched_to_difficult_box = np.zeros(num_detected_boxes, dtype=bool) + is_matched_to_group_of_box = np.zeros(num_detected_boxes, dtype=bool) + + # The evaluation is done in two stages: + # 1. All detections are matched to non group-of boxes; true positives + # are determined and detections matched to difficult boxes are + # ignored. + # 2. Detections that are determined as false positives are matched + # against group-of boxes and ignored if matched. + + # Tp-fp evaluation for non-group of boxes (if any). + if iou.shape[1] > 0: + groundtruth_nongroup_of_is_difficult_list = ( + groundtruth_is_difficult_list[~groundtruth_is_group_of_list]) + max_overlap_gt_ids = np.argmax(iou, axis=1) + is_gt_box_detected = np.zeros(iou.shape[1], dtype=bool) + for i in range(num_detected_boxes): + gt_id = max_overlap_gt_ids[i] + if iou[i, gt_id] >= self.matching_iou_threshold: + if not groundtruth_nongroup_of_is_difficult_list[gt_id]: + if not is_gt_box_detected[gt_id]: + tp_fp_labels[i] = True + is_gt_box_detected[gt_id] = True + else: + is_matched_to_difficult_box[i] = True + + return ( + scores[~is_matched_to_difficult_box & ~is_matched_to_group_of_box], + tp_fp_labels[~is_matched_to_difficult_box + & ~is_matched_to_group_of_box], + ) + + def _get_ith_class_arrays( + self, + detected_boxes, + detected_scores, + detected_masks, + detected_class_labels, + groundtruth_boxes, + groundtruth_masks, + groundtruth_class_labels, + class_index, + ): + """Returns numpy arrays belonging to class with index `class_index`. + + Args: + detected_boxes: A numpy array containing detected boxes. + detected_scores: A numpy array containing detected scores. + detected_masks: A numpy array containing detected masks. 
+ detected_class_labels: A numpy array containing detected class + labels. + groundtruth_boxes: A numpy array containing groundtruth boxes. + groundtruth_masks: A numpy array containing groundtruth masks. + groundtruth_class_labels: A numpy array containing groundtruth + class labels. + class_index: An integer index. + + Returns: + gt_boxes_at_ith_class: A numpy array containing groundtruth boxes + labeled as ith class. + gt_masks_at_ith_class: A numpy array containing groundtruth masks + labeled as ith class. + detected_boxes_at_ith_class: A numpy array containing detected + boxes corresponding to the ith class. + detected_scores_at_ith_class: A numpy array containing detected + scores corresponding to the ith class. + detected_masks_at_ith_class: A numpy array containing detected + masks corresponding to the ith class. + """ + selected_groundtruth = groundtruth_class_labels == class_index + gt_boxes_at_ith_class = groundtruth_boxes[selected_groundtruth] + if groundtruth_masks is not None: + gt_masks_at_ith_class = groundtruth_masks[selected_groundtruth] + else: + gt_masks_at_ith_class = None + selected_detections = detected_class_labels == class_index + detected_boxes_at_ith_class = detected_boxes[selected_detections] + detected_scores_at_ith_class = detected_scores[selected_detections] + if detected_masks is not None: + detected_masks_at_ith_class = detected_masks[selected_detections] + else: + detected_masks_at_ith_class = None + return ( + gt_boxes_at_ith_class, + gt_masks_at_ith_class, + detected_boxes_at_ith_class, + detected_scores_at_ith_class, + detected_masks_at_ith_class, + ) + + def _remove_invalid_boxes( + self, + detected_boxes, + detected_scores, + detected_class_labels, + detected_masks=None, + ): + """Removes entries with invalid boxes. + + A box is invalid if either its xmax is smaller than its xmin, or its + ymax is smaller than its ymin. + + Args: + detected_boxes: A float numpy array of size [num_boxes, 4] + containing box coordinates in [ymin, xmin, ymax, xmax] format. + detected_scores: A float numpy array of size [num_boxes]. + detected_class_labels: A int32 numpy array of size [num_boxes]. + detected_masks: A uint8 numpy array of size + [num_boxes, height, width]. + + Returns: + valid_detected_boxes: A float numpy array of size + [num_valid_boxes, 4] containing box coordinates in + [ymin, xmin, ymax, xmax] format. + valid_detected_scores: A float numpy array of size + [num_valid_boxes]. + valid_detected_class_labels: A int32 numpy array of size + [num_valid_boxes]. + valid_detected_masks: A uint8 numpy array of size + [num_valid_boxes, height, width]. + """ + valid_indices = np.logical_and( + detected_boxes[:, 0] < detected_boxes[:, 2], + detected_boxes[:, 1] < detected_boxes[:, 3], + ) + detected_boxes = detected_boxes[valid_indices] + detected_scores = detected_scores[valid_indices] + detected_class_labels = detected_class_labels[valid_indices] + if detected_masks is not None: + detected_masks = detected_masks[valid_indices] + return [ + detected_boxes, + detected_scores, + detected_class_labels, + detected_masks, + ] diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/standard_fields.py b/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/standard_fields.py new file mode 100644 index 0000000..8edf46d --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/metrics/ava_evaluation/standard_fields.py @@ -0,0 +1,115 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= +"""Contains classes specifying naming conventions used for object detection. + +Specifies: + InputDataFields: standard fields used by reader/preprocessor/batcher. + DetectionResultFields: standard fields returned by object detector. +""" + + +class InputDataFields: + """Names for the input tensors. + + Holds the standard data field names to use for identifying input tensors. + This should be used by the decoder to identify keys for the returned + tensor_dict containing input tensors. And it should be used by the model to + identify the tensors it needs. + + Attributes: + image: image. + original_image: image in the original input size. + key: unique key corresponding to image. + source_id: source of the original image. + filename: original filename of the dataset (without common path). + groundtruth_image_classes: image-level class labels. + groundtruth_boxes: coordinates of the ground truth boxes in the image. + groundtruth_classes: box-level class labels. + groundtruth_label_types: box-level label types (e.g. explicit + negative). + groundtruth_is_crowd: [DEPRECATED, use groundtruth_group_of instead] + is the groundtruth a single object or a crowd. + groundtruth_area: area of a groundtruth segment. + groundtruth_difficult: is a `difficult` object + groundtruth_group_of: is a `group_of` objects, e.g. multiple objects of + the same class, forming a connected group, where instances are + heavily occluding each other. + proposal_boxes: coordinates of object proposal boxes. + proposal_objectness: objectness score of each proposal. + groundtruth_instance_masks: ground truth instance masks. + groundtruth_instance_boundaries: ground truth instance boundaries. + groundtruth_instance_classes: instance mask-level class labels. + groundtruth_keypoints: ground truth keypoints. + groundtruth_keypoint_visibilities: ground truth keypoint visibilities. + groundtruth_label_scores: groundtruth label scores. + groundtruth_weights: groundtruth weight factor for bounding boxes. + num_groundtruth_boxes: number of groundtruth boxes. + true_image_shapes: true shapes of images in the resized images, as + resized images can be padded with zeros. 
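+
+    Example (added for illustration; the attributes are defined just below
+    and are plain string constants used as dictionary keys):
+        >>> InputDataFields.groundtruth_boxes
+        'groundtruth_boxes'
+        >>> InputDataFields.groundtruth_classes
+        'groundtruth_classes'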
+ """ + + image = 'image' + original_image = 'original_image' + key = 'key' + source_id = 'source_id' + filename = 'filename' + groundtruth_image_classes = 'groundtruth_image_classes' + groundtruth_boxes = 'groundtruth_boxes' + groundtruth_classes = 'groundtruth_classes' + groundtruth_label_types = 'groundtruth_label_types' + groundtruth_is_crowd = 'groundtruth_is_crowd' + groundtruth_area = 'groundtruth_area' + groundtruth_difficult = 'groundtruth_difficult' + groundtruth_group_of = 'groundtruth_group_of' + proposal_boxes = 'proposal_boxes' + proposal_objectness = 'proposal_objectness' + groundtruth_instance_masks = 'groundtruth_instance_masks' + groundtruth_instance_boundaries = 'groundtruth_instance_boundaries' + groundtruth_instance_classes = 'groundtruth_instance_classes' + groundtruth_keypoints = 'groundtruth_keypoints' + groundtruth_keypoint_visibilities = 'groundtruth_keypoint_visibilities' + groundtruth_label_scores = 'groundtruth_label_scores' + groundtruth_weights = 'groundtruth_weights' + num_groundtruth_boxes = 'num_groundtruth_boxes' + true_image_shape = 'true_image_shape' + + +class DetectionResultFields: + """Naming conventions for storing the output of the detector. + + Attributes: + source_id: source of the original image. + key: unique key corresponding to image. + detection_boxes: coordinates of the detection boxes in the image. + detection_scores: detection scores for the detection boxes in the + image. + detection_classes: detection-level class labels. + detection_masks: contains a segmentation mask for each detection box. + detection_boundaries: contains an object boundary for each detection + box. + detection_keypoints: contains detection keypoints for each detection + box. + num_detections: number of detections in the batch. + """ + + source_id = 'source_id' + key = 'key' + detection_boxes = 'detection_boxes' + detection_scores = 'detection_scores' + detection_classes = 'detection_classes' + detection_masks = 'detection_masks' + detection_boundaries = 'detection_boundaries' + detection_keypoints = 'detection_keypoints' + num_detections = 'num_detections' diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/ava_metric.py b/Bank_second_part/detect_process/paddlevideo/metrics/ava_metric.py new file mode 100644 index 0000000..b17c8c8 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/metrics/ava_metric.py @@ -0,0 +1,92 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +import numpy as np +import paddle +from collections import OrderedDict +from paddlevideo.utils import get_logger, load, log_batch, AverageMeter +from .registry import METRIC +from .base import BaseMetric +import time +from datetime import datetime +from .ava_utils import ava_evaluate_results + +logger = get_logger("paddlevideo") +""" An example for metrics class. + MultiCropMetric for slowfast. 
+""" + + +@METRIC.register +class AVAMetric(BaseMetric): + def __init__(self, + data_size, + batch_size, + file_path, + exclude_file, + label_file, + custom_classes, + log_interval=1): + """prepare for metrics + """ + super().__init__(data_size, batch_size, log_interval) + + self.file_path = file_path + self.exclude_file = exclude_file + self.label_file = label_file + self.custom_classes = custom_classes + + self.results = [] + + record_list = [ + ("loss", AverageMeter('loss', '7.5f')), + ("recall@thr=0.5", AverageMeter("recall@thr=0.5", '.5f')), + ("prec@thr=0.5", AverageMeter("prec@thr=0.5", '.5f')), + ("recall@top3", AverageMeter("recall@top3", '.5f')), + ("prec@top3", AverageMeter("prec@top3", '.5f')), + ("recall@top5", AverageMeter("recall@top5", '.5f')), + ("prec@top5", AverageMeter("prec@top5", '.5f')), + ("mAP@0.5IOU", AverageMeter("mAP@0.5IOU", '.5f')), + ("batch_time", AverageMeter('batch_cost', '.5f')), + ("reader_time", AverageMeter('reader_cost', '.5f')), + ] + + self.record_list = OrderedDict(record_list) + + self.tic = time.time() + + def update(self, batch_id, data, outputs): + """update metrics during each iter + """ + + self.results.extend(outputs) + self.record_list['batch_time'].update(time.time() - self.tic) + tic = time.time() + ips = "ips: {:.5f} instance/sec.".format( + self.batch_size / self.record_list["batch_time"].val) + log_batch(self.record_list, batch_id, 0, 0, "test", ips) + + def set_dataset_info(self, info, dataset_len): + self.info = info + self.dataset_len = dataset_len + + def accumulate(self): + """accumulate metrics when finished all iters. + """ + test_res = ava_evaluate_results(self.info, self.dataset_len, + self.results, None, self.label_file, + self.file_path, self.exclude_file) + + for name, value in test_res.items(): + self.record_list[name].update(value, self.batch_size) + + return self.record_list diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/ava_utils.py b/Bank_second_part/detect_process/paddlevideo/metrics/ava_utils.py new file mode 100644 index 0000000..b127267 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/metrics/ava_utils.py @@ -0,0 +1,394 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import csv +import heapq +import logging +import time +from collections import defaultdict +from .ava_evaluation import object_detection_evaluation as det_eval +from .ava_evaluation import standard_fields +from .recall import eval_recalls +import shutil +import pickle +import time +import os +import os.path as osp +from paddlevideo.utils import get_logger, get_dist_info +import paddle.distributed as dist +import sys +import numpy as np +from pathlib import Path +from datetime import datetime +import paddle + + +def det2csv(info, dataset_len, results, custom_classes): + csv_results = [] + for idx in range(dataset_len): + video_id = info[idx]['video_id'] + timestamp = info[idx]['timestamp'] + + result = results[idx] + for label, _ in enumerate(result): + for bbox in result[label]: + if type(bbox) == paddle.Tensor: + bbox = bbox.numpy() + + bbox_ = tuple(bbox.tolist()) + if custom_classes is not None: + actual_label = custom_classes[label + 1] + else: + actual_label = label + 1 + csv_results.append(( + video_id, + timestamp, + ) + bbox_[:4] + (actual_label, ) + bbox_[4:]) + return csv_results + + +# results is organized by class +def results2csv(info, dataset_len, results, out_file, custom_classes=None): + if isinstance(results[0], list): + csv_results = det2csv(info, dataset_len, results, custom_classes) + + # save space for float + def tostr(item): + if isinstance(item, float): + return f'{item:.3f}' + return str(item) + + with open(out_file, 'w') as f: + for csv_result in csv_results: + f.write(','.join(map(lambda x: tostr(x), csv_result))) + f.write('\n') + + +def print_time(message, start): + print('==> %g seconds to %s' % (time.time() - start, message)) + + +def make_image_key(video_id, timestamp): + """Returns a unique identifier for a video id & timestamp.""" + return f'{video_id},{int(timestamp):04d}' + + +def read_csv(csv_file, class_whitelist=None, capacity=0): + """Loads boxes and class labels from a CSV file in the AVA format. + + CSV file format described at https://research.google.com/ava/download.html. + + Args: + csv_file: A file object. + class_whitelist: If provided, boxes corresponding to (integer) class + labels not in this set are skipped. + capacity: Maximum number of labeled boxes allowed for each example. + Default is 0 where there is no limit. + + Returns: + boxes: A dictionary mapping each unique image key (string) to a list of + boxes, given as coordinates [y1, x1, y2, x2]. + labels: A dictionary mapping each unique image key (string) to a list + of integer class lables, matching the corresponding box in `boxes`. + scores: A dictionary mapping each unique image key (string) to a list + of score values lables, matching the corresponding label in `labels`. + If scores are not provided in the csv, then they will default to 1.0. 
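+
+    Example (hypothetical row, included only to illustrate the column layout
+    this parser expects):
+        the CSV line
+            'vid_001,0902,0.07,0.14,0.52,0.98,12,0.95'
+        is keyed as 'vid_001,0902' and parsed into the box
+        [0.14, 0.07, 0.98, 0.52] (stored as [y1, x1, y2, x2]), the integer
+        label 12 and the score 0.95.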
+ """ + start = time.time() + entries = defaultdict(list) + boxes = defaultdict(list) + labels = defaultdict(list) + scores = defaultdict(list) + reader = csv.reader(csv_file) + for row in reader: + assert len(row) in [7, 8], 'Wrong number of columns: ' + row + image_key = make_image_key(row[0], row[1]) + x1, y1, x2, y2 = [float(n) for n in row[2:6]] + action_id = int(row[6]) + if class_whitelist and action_id not in class_whitelist: + continue + + score = 1.0 + if len(row) == 8: + score = float(row[7]) + if capacity < 1 or len(entries[image_key]) < capacity: + heapq.heappush(entries[image_key], + (score, action_id, y1, x1, y2, x2)) + elif score > entries[image_key][0][0]: + heapq.heapreplace(entries[image_key], + (score, action_id, y1, x1, y2, x2)) + for image_key in entries: + # Evaluation API assumes boxes with descending scores + entry = sorted(entries[image_key], key=lambda tup: -tup[0]) + for item in entry: + score, action_id, y1, x1, y2, x2 = item + boxes[image_key].append([y1, x1, y2, x2]) + labels[image_key].append(action_id) + scores[image_key].append(score) + print_time('read file ' + csv_file.name, start) + return boxes, labels, scores + + +def read_exclusions(exclusions_file): + """Reads a CSV file of excluded timestamps. + + Args: + exclusions_file: A file object containing a csv of video-id,timestamp. + + Returns: + A set of strings containing excluded image keys, e.g. + "aaaaaaaaaaa,0904", + or an empty set if exclusions file is None. + """ + excluded = set() + if exclusions_file: + reader = csv.reader(exclusions_file) + for row in reader: + assert len(row) == 2, 'Expected only 2 columns, got: ' + row + excluded.add(make_image_key(row[0], row[1])) + return excluded + + +def read_labelmap(labelmap_file): + """Reads a labelmap without the dependency on protocol buffers. + + Args: + labelmap_file: A file object containing a label map protocol buffer. + + Returns: + labelmap: The label map in the form used by the + object_detection_evaluation + module - a list of {"id": integer, "name": classname } dicts. + class_ids: A set containing all of the valid class id integers. 
+ """ + labelmap = [] + class_ids = set() + name = '' + class_id = '' + for line in labelmap_file: + if line.startswith(' name:'): + name = line.split('"')[1] + elif line.startswith(' id:') or line.startswith(' label_id:'): + class_id = int(line.strip().split(' ')[-1]) + labelmap.append({'id': class_id, 'name': name}) + class_ids.add(class_id) + return labelmap, class_ids + + +# Seems there is at most 100 detections for each image +def ava_eval(result_file, + result_type, + label_file, + ann_file, + exclude_file, + max_dets=(100, ), + verbose=True, + custom_classes=None): + + assert result_type in ['mAP'] + start = time.time() + categories, class_whitelist = read_labelmap(open(label_file)) + + if custom_classes is not None: + custom_classes = custom_classes[1:] + assert set(custom_classes).issubset(set(class_whitelist)) + class_whitelist = custom_classes + categories = [cat for cat in categories if cat['id'] in custom_classes] + + # loading gt, do not need gt score + gt_boxes, gt_labels, _ = read_csv(open(ann_file), class_whitelist, 0) + if verbose: + print_time('Reading detection results', start) + + if exclude_file is not None: + excluded_keys = read_exclusions(open(exclude_file)) + else: + excluded_keys = list() + + start = time.time() + boxes, labels, scores = read_csv(open(result_file), class_whitelist, 0) + if verbose: + print_time('Reading detection results', start) + + if result_type == 'proposal': + gts = [ + np.array(gt_boxes[image_key], dtype=float) for image_key in gt_boxes + ] + proposals = [] + for image_key in gt_boxes: + if image_key in boxes: + proposals.append( + np.concatenate( + (np.array(boxes[image_key], dtype=float), + np.array(scores[image_key], dtype=float)[:, None]), + axis=1)) + else: + # if no corresponding proposal, add a fake one + proposals.append(np.array([0, 0, 1, 1, 1])) + + # Proposals used here are with scores + recalls = eval_recalls(gts, proposals, np.array(max_dets), + np.arange(0.5, 0.96, 0.05)) + ar = recalls.mean(axis=1) + ret = {} + for i, num in enumerate(max_dets): + print(f'Recall@0.5@{num}\t={recalls[i, 0]:.4f}') + print(f'AR@{num}\t={ar[i]:.4f}') + ret[f'Recall@0.5@{num}'] = recalls[i, 0] + ret[f'AR@{num}'] = ar[i] + return ret + + if result_type == 'mAP': + pascal_evaluator = det_eval.PascalDetectionEvaluator(categories) + + start = time.time() + for image_key in gt_boxes: + if verbose and image_key in excluded_keys: + logging.info( + 'Found excluded timestamp in detections: %s.' + 'It will be ignored.', image_key) + continue + pascal_evaluator.add_single_ground_truth_image_info( + image_key, { + standard_fields.InputDataFields.groundtruth_boxes: + np.array(gt_boxes[image_key], dtype=float), + standard_fields.InputDataFields.groundtruth_classes: + np.array(gt_labels[image_key], dtype=int), + standard_fields.InputDataFields.groundtruth_difficult: + np.zeros(len(gt_boxes[image_key]), dtype=bool) + }) + if verbose: + print_time('Convert groundtruth', start) + + start = time.time() + for image_key in boxes: + if verbose and image_key in excluded_keys: + logging.info( + 'Found excluded timestamp in detections: %s.' 
+ 'It will be ignored.', image_key) + continue + pascal_evaluator.add_single_detected_image_info( + image_key, { + standard_fields.DetectionResultFields.detection_boxes: + np.array(boxes[image_key], dtype=float), + standard_fields.DetectionResultFields.detection_classes: + np.array(labels[image_key], dtype=int), + standard_fields.DetectionResultFields.detection_scores: + np.array(scores[image_key], dtype=float) + }) + if verbose: + print_time('convert detections', start) + + start = time.time() + metrics = pascal_evaluator.evaluate() + if verbose: + print_time('run_evaluator', start) + for display_name in metrics: + print(f'{display_name}=\t{metrics[display_name]}') + ret = { + display_name: metrics[display_name] + for display_name in metrics if 'ByCategory' not in display_name + } + return ret + + +def mkdir_or_exist(dir_name, mode=0o777): + if dir_name == '': + return + dir_name = osp.expanduser(dir_name) + os.makedirs(dir_name, mode=mode, exist_ok=True) + + +def dump_to_fileobj(obj, file, **kwargs): + kwargs.setdefault('protocol', 2) + pickle.dump(obj, file, **kwargs) + + +def dump_to_path(obj, filepath, mode='wb'): + with open(filepath, mode) as f: + dump_to_fileobj(obj, f) + + +def load_from_fileobj(file, **kwargs): + return pickle.load(file, **kwargs) + + +def load_from_path(filepath, mode='rb'): + with open(filepath, mode) as f: + return load_from_fileobj(f) + + +def collect_results_cpu(result_part, size): + """Collect results in cpu mode. + It saves the results on different gpus to 'tmpdir' and collects + them by the rank 0 worker. + """ + tmpdir = osp.join('./', 'collect_results_cpu') + #1. load results of all parts from tmp dir + mkdir_or_exist(tmpdir) + rank, world_size = get_dist_info() + dump_to_path(result_part, osp.join(tmpdir, f'part_{rank}.pkl')) + dist.barrier() + if rank != 0: + return None + #2. collect all parts + while 1: + all_exist = True + for i in range(world_size): + part_file = osp.join(tmpdir, f'part_{i}.pkl') + if not Path(part_file).exists(): + all_exist = False + if all_exist: + break + else: + time.sleep(60) + time.sleep(120) + #3. load results of all parts from tmp dir + part_list = [] + for i in range(world_size): + part_file = osp.join(tmpdir, f'part_{i}.pkl') + part_list.append(load_from_path(part_file)) + #4. sort the results + ordered_results = [] + for res in zip(*part_list): + ordered_results.extend(list(res)) + ordered_results = ordered_results[: + size] #the dataloader may pad some samples + #5. remove results of all parts from tmp dir, avoid dump_file fail to tmp dir when dir not exists. + for i in range(world_size): + part_file = osp.join(tmpdir, f'part_{i}.pkl') + os.remove(part_file) + + return ordered_results + + +def ava_evaluate_results(info, dataset_len, results, custom_classes, label_file, + file_path, exclude_file): + # need to create a temp result file + time_now = datetime.now().strftime('%Y%m%d_%H%M%S') + temp_file = f'AVA_{time_now}_result.csv' + results2csv(info, dataset_len, results, temp_file) + ret = {} + eval_result = ava_eval( + temp_file, + 'mAP', + label_file, + file_path, #ann_file, + exclude_file, + custom_classes=custom_classes) + ret.update(eval_result) + + os.remove(temp_file) + + return ret diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/base.py b/Bank_second_part/detect_process/paddlevideo/metrics/base.py new file mode 100644 index 0000000..9842232 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/metrics/base.py @@ -0,0 +1,52 @@ +# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +from abc import abstractmethod + +import paddle +from paddlevideo.utils import get_dist_info + +from .registry import METRIC + + +class BaseMetric(object): + def __init__(self, data_size, batch_size, log_interval=1, **kwargs): + self.data_size = data_size + self.batch_size = batch_size + _, self.world_size = get_dist_info() + self.log_interval = log_interval + + def gather_from_gpu(self, + gather_object: paddle.Tensor, + concat_axis=0) -> paddle.Tensor: + """gather Tensor from all gpus into a list and concatenate them on `concat_axis`. + + Args: + gather_object (paddle.Tensor): gather object Tensor + concat_axis (int, optional): axis for concatenation. Defaults to 0. + + Returns: + paddle.Tensor: gatherd & concatenated Tensor + """ + gather_object_list = [] + paddle.distributed.all_gather(gather_object_list, gather_object.cuda()) + return paddle.concat(gather_object_list, axis=concat_axis) + + @abstractmethod + def update(self): + raise NotImplementedError( + "'update' method must be implemented in subclass") + + @abstractmethod + def accumulate(self): + raise NotImplementedError( + "'accumulate' method must be implemented in subclass") diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/bmn_metric.py b/Bank_second_part/detect_process/paddlevideo/metrics/bmn_metric.py new file mode 100644 index 0000000..cc36283 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/metrics/bmn_metric.py @@ -0,0 +1,304 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +import os +import json +import numpy as np +import pandas as pd +import multiprocessing as mp + +from .registry import METRIC +from .base import BaseMetric +from .ActivityNet import ANETproposal +from paddlevideo.utils import get_logger + +logger = get_logger("paddlevideo") + + +def iou_with_anchors(anchors_min, anchors_max, box_min, box_max): + """Compute jaccard score between a box and the anchors. + """ + len_anchors = anchors_max - anchors_min + int_xmin = np.maximum(anchors_min, box_min) + int_xmax = np.minimum(anchors_max, box_max) + inter_len = np.maximum(int_xmax - int_xmin, 0.) + union_len = len_anchors - inter_len + box_max - box_min + jaccard = np.divide(inter_len, union_len) + return jaccard + + +def boundary_choose(score_list): + """Choose start and end boundary from score. 
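+
+    A position is kept if its score is above 0.5 * max(score_list) or if it
+    is a local peak (strictly greater than both neighbours); the returned
+    mask is the union of these two conditions.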
+ """ + max_score = max(score_list) + mask_high = (score_list > max_score * 0.5) + score_list = list(score_list) + score_middle = np.array([0.0] + score_list + [0.0]) + score_front = np.array([0.0, 0.0] + score_list) + score_back = np.array(score_list + [0.0, 0.0]) + mask_peak = ((score_middle > score_front) & (score_middle > score_back)) + mask_peak = mask_peak[1:-1] + mask = (mask_high | mask_peak).astype('float32') + return mask + + +def soft_nms(df, alpha, t1, t2): + ''' + df: proposals generated by network; + alpha: alpha value of Gaussian decaying function; + t1, t2: threshold for soft nms. + ''' + df = df.sort_values(by="score", ascending=False) + tstart = list(df.xmin.values[:]) + tend = list(df.xmax.values[:]) + tscore = list(df.score.values[:]) + + rstart = [] + rend = [] + rscore = [] + + while len(tscore) > 1 and len(rscore) < 101: + max_index = tscore.index(max(tscore)) + tmp_iou_list = iou_with_anchors(np.array(tstart), np.array(tend), + tstart[max_index], tend[max_index]) + for idx in range(0, len(tscore)): + if idx != max_index: + tmp_iou = tmp_iou_list[idx] + tmp_width = tend[max_index] - tstart[max_index] + if tmp_iou > t1 + (t2 - t1) * tmp_width: + tscore[idx] = tscore[idx] * np.exp( + -np.square(tmp_iou) / alpha) + + rstart.append(tstart[max_index]) + rend.append(tend[max_index]) + rscore.append(tscore[max_index]) + tstart.pop(max_index) + tend.pop(max_index) + tscore.pop(max_index) + + newDf = pd.DataFrame() + newDf['score'] = rscore + newDf['xmin'] = rstart + newDf['xmax'] = rend + return newDf + + +@METRIC.register +class BMNMetric(BaseMetric): + """ + Metrics for BMN. Two Stages in this metric: + (1) Get test results using trained model, results will be saved in BMNMetric.result_path; + (2) Calculate metrics using results file from stage (1). + """ + + def __init__(self, + data_size, + batch_size, + tscale, + dscale, + file_path, + ground_truth_filename, + subset, + output_path, + result_path, + get_metrics=True, + log_interval=1): + """ + Init for BMN metrics. + Params: + get_metrics: whether to calculate AR@N and AUC metrics or not, default True. 
+ """ + super().__init__(data_size, batch_size, log_interval) + assert self.batch_size == 1, " Now we just support batch_size==1 test" + assert self.world_size == 1, " Now we just support single-card test" + + self.tscale = tscale + self.dscale = dscale + self.file_path = file_path + self.ground_truth_filename = ground_truth_filename + self.subset = subset + self.output_path = output_path + self.result_path = result_path + self.get_metrics = get_metrics + + if not os.path.isdir(self.output_path): + os.makedirs(self.output_path) + if not os.path.isdir(self.result_path): + os.makedirs(self.result_path) + + self.video_dict, self.video_list = self.get_dataset_dict( + self.file_path, self.subset) + + def get_dataset_dict(self, file_path, subset): + annos = json.load(open(file_path)) + video_dict = {} + for video_name in annos.keys(): + video_subset = annos[video_name]["subset"] + if subset in video_subset: + video_dict[video_name] = annos[video_name] + video_list = list(video_dict.keys()) + video_list.sort() + return video_dict, video_list + + def update(self, batch_id, data, outputs): + """update metrics during each iter + """ + fid = data[4].numpy() + pred_bm, pred_start, pred_end = outputs + pred_bm = pred_bm.numpy() + pred_start = pred_start[0].numpy() + pred_end = pred_end[0].numpy() + + snippet_xmins = [1.0 / self.tscale * i for i in range(self.tscale)] + snippet_xmaxs = [ + 1.0 / self.tscale * i for i in range(1, self.tscale + 1) + ] + cols = ["xmin", "xmax", "score"] + + video_name = self.video_list[fid[0]] + pred_bm = pred_bm[0, 0, :, :] * pred_bm[0, 1, :, :] + start_mask = boundary_choose(pred_start) + start_mask[0] = 1. + end_mask = boundary_choose(pred_end) + end_mask[-1] = 1. + score_vector_list = [] + for idx in range(self.dscale): + for jdx in range(self.tscale): + start_index = jdx + end_index = start_index + idx + if end_index < self.tscale and start_mask[ + start_index] == 1 and end_mask[end_index] == 1: + xmin = snippet_xmins[start_index] + xmax = snippet_xmaxs[end_index] + xmin_score = pred_start[start_index] + xmax_score = pred_end[end_index] + bm_score = pred_bm[idx, jdx] + conf_score = xmin_score * xmax_score * bm_score + score_vector_list.append([xmin, xmax, conf_score]) + + score_vector_list = np.stack(score_vector_list) + video_df = pd.DataFrame(score_vector_list, columns=cols) + video_df.to_csv(os.path.join(self.output_path, "%s.csv" % video_name), + index=False) + + if batch_id % self.log_interval == 0: + logger.info("Processing................ batch {}".format(batch_id)) + + def accumulate(self): + """accumulate metrics when finished all iters. 
+ """ + # check clip index of each video + #Stage1 + self.bmn_post_processing(self.video_dict, self.subset, self.output_path, + self.result_path) + if self.get_metrics: + logger.info("[TEST] calculate metrics...") + #Stage2 + uniform_average_nr_proposals_valid, uniform_average_recall_valid, uniform_recall_valid = self.cal_metrics( + self.ground_truth_filename, + os.path.join(self.result_path, "bmn_results_validation.json"), + max_avg_nr_proposals=100, + tiou_thresholds=np.linspace(0.5, 0.95, 10), + subset='validation') + logger.info("AR@1; AR@5; AR@10; AR@100") + logger.info("%.02f %.02f %.02f %.02f" % + (100 * np.mean(uniform_recall_valid[:, 0]), + 100 * np.mean(uniform_recall_valid[:, 4]), + 100 * np.mean(uniform_recall_valid[:, 9]), + 100 * np.mean(uniform_recall_valid[:, -1]))) + + def bmn_post_processing(self, video_dict, subset, output_path, result_path): + video_list = list(video_dict.keys()) + global result_dict + result_dict = mp.Manager().dict() + pp_num = 12 + + num_videos = len(video_list) + num_videos_per_thread = int(num_videos / pp_num) + processes = [] + for tid in range(pp_num - 1): + tmp_video_list = video_list[tid * num_videos_per_thread:(tid + 1) * + num_videos_per_thread] + p = mp.Process(target=self.video_process, + args=(tmp_video_list, video_dict, output_path, + result_dict)) + p.start() + processes.append(p) + tmp_video_list = video_list[(pp_num - 1) * num_videos_per_thread:] + p = mp.Process(target=self.video_process, + args=(tmp_video_list, video_dict, output_path, + result_dict)) + p.start() + processes.append(p) + for p in processes: + p.join() + + result_dict = dict(result_dict) + output_dict = { + "version": "VERSION 1.3", + "results": result_dict, + "external_data": {} + } + outfile = open( + os.path.join(result_path, "bmn_results_%s.json" % subset), "w") + + # json.dump(output_dict, outfile) + # in case of file name in chinese + json.dump(output_dict, outfile, ensure_ascii=False) + outfile.close() + + def video_process(self, + video_list, + video_dict, + output_path, + result_dict, + snms_alpha=0.4, + snms_t1=0.55, + snms_t2=0.9): + + for video_name in video_list: + logger.info("Processing video........" 
+ video_name) + df = pd.read_csv(os.path.join(output_path, video_name + ".csv")) + if len(df) > 1: + df = soft_nms(df, snms_alpha, snms_t1, snms_t2) + + video_duration = video_dict[video_name]["duration_second"] + proposal_list = [] + for idx in range(min(100, len(df))): + tmp_prop={"score":df.score.values[idx], \ + "segment":[max(0,df.xmin.values[idx])*video_duration, \ + min(1,df.xmax.values[idx])*video_duration]} + proposal_list.append(tmp_prop) + + video_name = video_name[2:] if video_name[:2] == 'v_' else video_name + result_dict[video_name] = proposal_list + + def cal_metrics(self, + ground_truth_filename, + proposal_filename, + max_avg_nr_proposals=100, + tiou_thresholds=np.linspace(0.5, 0.95, 10), + subset='validation'): + + anet_proposal = ANETproposal(ground_truth_filename, + proposal_filename, + tiou_thresholds=tiou_thresholds, + max_avg_nr_proposals=max_avg_nr_proposals, + subset=subset, + verbose=True, + check_status=False) + anet_proposal.evaluate() + recall = anet_proposal.recall + average_recall = anet_proposal.avg_recall + average_nr_proposals = anet_proposal.proposals_per_video + + return (average_nr_proposals, average_recall, recall) diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/build.py b/Bank_second_part/detect_process/paddlevideo/metrics/build.py new file mode 100644 index 0000000..82e4b50 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/metrics/build.py @@ -0,0 +1,20 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .registry import METRIC +from ..utils import build + + +def build_metric(cfg): + return build(cfg, METRIC) diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/center_crop_metric.py b/Bank_second_part/detect_process/paddlevideo/metrics/center_crop_metric.py new file mode 100644 index 0000000..0ca6112 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/metrics/center_crop_metric.py @@ -0,0 +1,79 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
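+
+# Rough usage sketch (illustrative only, based on the API defined below):
+#
+#   metric = CenterCropMetric(data_size=len(dataset), batch_size=batch_size)
+#   for batch_id, data in enumerate(test_loader):   # data = [inputs, labels, ...]
+#       outputs = model(data[0])
+#       metric.update(batch_id, data, outputs)
+#   metric.accumulate()   # logs avg_acc1 / avg_acc5 over the whole test set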
+ +from typing import List + +import paddle +from paddlevideo.utils import get_logger + +from .base import BaseMetric +from .registry import METRIC + +logger = get_logger("paddlevideo") + + +@METRIC.register +class CenterCropMetric(BaseMetric): + def __init__(self, data_size, batch_size, log_interval=1, **kwargs): + """prepare for metrics + """ + super().__init__(data_size, batch_size, log_interval, **kwargs) + self.rest_data_size = data_size # Number of samples remaining to be tested + self.all_outputs = [] + self.all_labels = [] + self.topk = kwargs.get("topk", [1, 5]) + + def update(self, batch_id: int, data: List, outputs: paddle.Tensor) -> None: + """update metrics during each iter + + Args: + batch_id (int): iter id of current batch. + data (List): list of batched data, such as [inputs, labels] + outputs (paddle.Tensor): batched outputs from model + """ + labels = data[1] + if self.world_size > 1: + labels_gathered = self.gather_from_gpu(labels, concat_axis=0) + outpus_gathered = self.gather_from_gpu(outputs, concat_axis=0) + else: + labels_gathered = labels + outpus_gathered = outputs + + # Avoid resampling effects when testing with multiple cards + labels_gathered = labels_gathered[0:min(len(labels_gathered), self. + rest_data_size)] + outpus_gathered = outpus_gathered[0:min(len(outpus_gathered), self. + rest_data_size)] + self.all_labels.append(labels_gathered) + self.all_outputs.append(outpus_gathered) + self.rest_data_size -= outpus_gathered.shape[0] + + # preds ensemble + if batch_id % self.log_interval == 0: + logger.info("[TEST] Processing batch {}/{} ...".format( + batch_id, + self.data_size // (self.batch_size * self.world_size))) + + def accumulate(self): + """accumulate, compute, and show metrics when finished all iters. + """ + self.all_outputs = paddle.concat(self.all_outputs, axis=0) + self.all_labels = paddle.concat(self.all_labels, axis=0) + + result_str = [] + for _k in self.topk: + topk_val = paddle.metric.accuracy(input=self.all_outputs, + label=self.all_labels, + k=_k).item() + result_str.append(f"avg_acc{_k}={topk_val}") + result_str = ", ".join(result_str) + logger.info(f"[TEST] finished, {result_str}") diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/center_crop_metric_MRI.py b/Bank_second_part/detect_process/paddlevideo/metrics/center_crop_metric_MRI.py new file mode 100644 index 0000000..b6d231a --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/metrics/center_crop_metric_MRI.py @@ -0,0 +1,61 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ +import numpy as np +import paddle + +from .registry import METRIC +from .base import BaseMetric +from paddlevideo.utils import get_logger + +logger = get_logger("paddlevideo") + + +@METRIC.register +class CenterCropMetric_MRI(BaseMetric): + def __init__(self, data_size, batch_size, log_interval=1, if_slowfast=0): + """prepare for metrics + """ + super().__init__(data_size, batch_size, log_interval) + self.top1 = [] + self.if_slowfast = if_slowfast + + def update(self, batch_id, data, outputs): + """update metrics during each iter + """ + labels = data[1] + + if self.if_slowfast: + labels = data[2] + + top1 = paddle.metric.accuracy(input=outputs, label=labels, k=1) + #top5 = paddle.metric.accuracy(input=outputs, label=labels, k=5) + #NOTE(shipping): deal with multi cards validate + if self.world_size > 1: + top1 = paddle.distributed.all_reduce( + top1, op=paddle.distributed.ReduceOp.SUM) / self.world_size + # top5 = paddle.distributed.all_reduce( + # top5, op=paddle.distributed.ReduceOp.SUM) / self.world_size + + self.top1.append(top1.numpy()) + #self.top5.append(top5.numpy()) + # preds ensemble + if batch_id % self.log_interval == 0: + logger.info("[TEST] Processing batch {}/{} ...".format( + batch_id, + self.data_size // (self.batch_size * self.world_size))) + + def accumulate(self): + """accumulate metrics when finished all iters. + """ + logger.info('[TEST] finished, avg_acc1= {}'.format( + np.mean(np.array(self.top1)))) diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/depth_metric.py b/Bank_second_part/detect_process/paddlevideo/metrics/depth_metric.py new file mode 100644 index 0000000..c160e16 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/metrics/depth_metric.py @@ -0,0 +1,77 @@ +import numpy as np +import paddle +from paddlevideo.utils import get_logger + +from .base import BaseMetric +from .registry import METRIC + +logger = get_logger("paddlevideo") + + +@METRIC.register +class DepthMetric(BaseMetric): + def __init__(self, data_size, batch_size, log_interval=1): + """prepare for metrics + """ + super().__init__(data_size, batch_size, log_interval) + self.abs_rel = [] + self.sq_rel = [] + self.rmse = [] + self.rmse_log = [] + self.a1 = [] + self.a2 = [] + self.a3 = [] + + def update(self, batch_id, data, outputs): + """update metrics during each iter + """ + abs_rel, sq_rel, rmse, rmse_log, a1, a2, a3 = outputs['abs_rel'], outputs['sq_rel'], outputs['rmse'], \ + outputs['rmse_log'], outputs['a1'], outputs['a2'],outputs['a3'] + # preds ensemble + if self.world_size > 1: + abs_rel = paddle.distributed.all_reduce( + outputs['abs_rel'], + op=paddle.distributed.ReduceOp.SUM) / self.world_size + sq_rel = paddle.distributed.all_reduce( + outputs['sq_rel'], + op=paddle.distributed.ReduceOp.SUM) / self.world_size + rmse = paddle.distributed.all_reduce( + outputs['rmse'], + op=paddle.distributed.ReduceOp.SUM) / self.world_size + rmse_log = paddle.distributed.all_reduce( + outputs['rmse_log'], + op=paddle.distributed.ReduceOp.SUM) / self.world_size + a1 = paddle.distributed.all_reduce( + outputs['a1'], + op=paddle.distributed.ReduceOp.SUM) / self.world_size + a2 = paddle.distributed.all_reduce( + outputs['a2'], + op=paddle.distributed.ReduceOp.SUM) / self.world_size + a3 = paddle.distributed.all_reduce( + outputs['a3'], + op=paddle.distributed.ReduceOp.SUM) / self.world_size + + self.abs_rel.append(abs_rel) + self.sq_rel.append(sq_rel) + self.rmse.append(rmse) + self.rmse_log.append(rmse_log) + self.a1.append(a1) + self.a2.append(a2) + self.a3.append(a3) + 
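+        # the per-batch values buffered above are averaged over the whole
+        # test set in accumulate()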
if batch_id % self.log_interval == 0: + logger.info("[TEST] Processing batch {}/{} ...".format( + batch_id, + self.data_size // (self.batch_size * self.world_size))) + + def accumulate(self): + """accumulate metrics when finished all iters. + """ + logger.info( + '[TEST] finished, abs_rel= {}, sq_rel= {} , rmse= {}, rmse_log= {},' + 'a1= {}, a2= {}, a3= {}'.format(np.mean(np.array(self.abs_rel)), + np.mean(np.array(self.sq_rel)), + np.mean(np.array(self.rmse)), + np.mean(np.array(self.rmse_log)), + np.mean(np.array(self.a1)), + np.mean(np.array(self.a2)), + np.mean(np.array(self.a3)))) diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/msrvtt_metric.py b/Bank_second_part/detect_process/paddlevideo/metrics/msrvtt_metric.py new file mode 100644 index 0000000..99e7334 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/metrics/msrvtt_metric.py @@ -0,0 +1,62 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +import numpy as np +import paddle +import paddle.nn.functional as F + +from .registry import METRIC +from .base import BaseMetric +from paddlevideo.utils import get_logger + +logger = get_logger("paddlevideo") + + +@METRIC.register +class MSRVTTMetric(BaseMetric): + def __init__(self, data_size, batch_size, log_interval=1): + """prepare for metrics + """ + super().__init__(data_size, batch_size, log_interval) + self.score_matrix = np.zeros((data_size, data_size)) + self.target_matrix = np.zeros((data_size, data_size)) + self.rank_matrix = np.ones((data_size)) * data_size + + def update(self, batch_id, data, outputs): + """update metrics during each iter + """ + target = data[-1] + cm_logit = outputs[-1] + + self.score_matrix[batch_id, :] = F.softmax( + cm_logit, axis=1)[:, 0].reshape([-1]).numpy() + self.target_matrix[batch_id, :] = target.reshape([-1]).numpy() + + rank = np.where((np.argsort(-self.score_matrix[batch_id]) == np.where( + self.target_matrix[batch_id] == 1)[0][0]) == 1)[0][0] + self.rank_matrix[batch_id] = rank + + rank_matrix_tmp = self.rank_matrix[:batch_id + 1] + r1 = 100.0 * np.sum(rank_matrix_tmp < 1) / len(rank_matrix_tmp) + r5 = 100.0 * np.sum(rank_matrix_tmp < 5) / len(rank_matrix_tmp) + r10 = 100.0 * np.sum(rank_matrix_tmp < 10) / len(rank_matrix_tmp) + + medr = np.floor(np.median(rank_matrix_tmp) + 1) + meanr = np.mean(rank_matrix_tmp) + 1 + logger.info( + "[{}] Final r1:{:.3f}, r5:{:.3f}, r10:{:.3f}, mder:{:.3f}, meanr:{:.3f}" + .format(batch_id, r1, r5, r10, medr, meanr)) + + def accumulate(self): + """accumulate metrics when finished all iters. + """ + logger.info("Eval Finished!") diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/multi_crop_metric.py b/Bank_second_part/detect_process/paddlevideo/metrics/multi_crop_metric.py new file mode 100644 index 0000000..5f20ced --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/metrics/multi_crop_metric.py @@ -0,0 +1,108 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +import numpy as np +import paddle +from paddle.hapi.model import _all_gather + +from paddlevideo.utils import get_logger +from .registry import METRIC +from .base import BaseMetric + +logger = get_logger("paddlevideo") +""" An example for metrics class. + MultiCropMetric for slowfast. +""" + + +@METRIC.register +class MultiCropMetric(BaseMetric): + def __init__(self, + data_size, + batch_size, + num_ensemble_views, + num_spatial_crops, + num_classes, + log_interval=1): + """prepare for metrics + """ + super().__init__(data_size, batch_size, log_interval) + self.num_ensemble_views = num_ensemble_views + self.num_spatial_crops = num_spatial_crops + self.num_classes = num_classes + + self.num_clips = self.num_ensemble_views * self.num_spatial_crops + num_videos = self.data_size // self.num_clips + self.video_preds = np.zeros((num_videos, self.num_classes)) + self.video_labels = np.zeros((num_videos, 1), dtype="int64") + self.clip_count = {} + + def update(self, batch_id, data, outputs): + """update metrics during each iter + """ + labels = data[2] + clip_ids = data[3] + + # gather mulit card, results of following process in each card is the same. + if self.world_size > 1: + outputs = _all_gather(outputs, self.world_size) + labels = _all_gather(labels.cuda(), self.world_size) + clip_ids = _all_gather(clip_ids.cuda(), self.world_size) + + # to numpy + preds = outputs.numpy() + labels = labels.numpy().astype("int64") + clip_ids = clip_ids.numpy() + + # preds ensemble + for ind in range(preds.shape[0]): + vid_id = int(clip_ids[ind]) // self.num_clips + ts_idx = int(clip_ids[ind]) % self.num_clips + if vid_id not in self.clip_count: + self.clip_count[vid_id] = [] + if ts_idx in self.clip_count[vid_id]: + logger.info( + "[TEST] Passed!! read video {} clip index {} / {} repeatedly." + .format(vid_id, ts_idx, clip_ids[ind])) + else: + self.clip_count[vid_id].append(ts_idx) + self.video_preds[vid_id] += preds[ind] # ensemble method: sum + if self.video_labels[vid_id].sum() > 0: + assert self.video_labels[vid_id] == labels[ind] + self.video_labels[vid_id] = labels[ind] + if batch_id % self.log_interval == 0: + logger.info("[TEST] Processing batch {}/{} ...".format( + batch_id, + self.data_size // (self.batch_size * self.world_size))) + + def accumulate(self): + """accumulate metrics when finished all iters. + """ + # check clip index of each video + for key in self.clip_count.keys(): + if len(self.clip_count[key]) != self.num_clips or sum( + self.clip_count[key]) != self.num_clips * (self.num_clips - + 1) / 2: + logger.info( + "[TEST] Count Error!! 
video [{}] clip count [{}] not match number clips {}" + .format(key, self.clip_count[key], self.num_clips)) + + video_preds = paddle.to_tensor(self.video_preds) + video_labels = paddle.to_tensor(self.video_labels) + acc_top1 = paddle.metric.accuracy(input=video_preds, + label=video_labels, + k=1) + acc_top5 = paddle.metric.accuracy(input=video_preds, + label=video_labels, + k=5) + logger.info('[TEST] finished, avg_acc1= {}, avg_acc5= {} '.format( + acc_top1.numpy(), acc_top5.numpy())) diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/recall.py b/Bank_second_part/detect_process/paddlevideo/metrics/recall.py new file mode 100644 index 0000000..3612e22 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/metrics/recall.py @@ -0,0 +1,84 @@ +import numpy as np +import paddle + +def _recalls(all_ious, proposal_nums, thrs): + + img_num = all_ious.shape[0] + total_gt_num = sum([ious.shape[0] for ious in all_ious]) + + ious_ = np.zeros((proposal_nums.size, total_gt_num), dtype=np.float32) + for k, proposal_num in enumerate(proposal_nums): + tmp_ious = np.zeros(0) + for i in range(img_num): + ious = all_ious[i][:, :proposal_num].copy() + gt_ious = np.zeros(ious.shape[0]) + if ious.size == 0: + tmp_ious = np.hstack((tmp_ious, gt_ious)) + continue + for j in range(ious.shape[0]): + gt_max_overlaps = ious.argmax(axis=1) + max_ious = ious[np.arange(0, ious.shape[0]), gt_max_overlaps] + gt_idx = max_ious.argmax() + gt_ious[j] = max_ious[gt_idx] + box_idx = gt_max_overlaps[gt_idx] + ious[gt_idx, :] = -1 + ious[:, box_idx] = -1 + tmp_ious = np.hstack((tmp_ious, gt_ious)) + ious_[k, :] = tmp_ious + + ious_ = np.fliplr(np.sort(ious_, axis=1)) + recalls = np.zeros((proposal_nums.size, thrs.size)) + for i, thr in enumerate(thrs): + recalls[:, i] = (ious_ >= thr).sum(axis=1) / float(total_gt_num) + + return recalls + + +def set_recall_param(proposal_nums, iou_thrs): + if isinstance(proposal_nums, list): + proposal_nums_ = np.array(proposal_nums) + elif isinstance(proposal_nums, int): + proposal_nums_ = np.array([proposal_nums]) + else: + proposal_nums_ = proposal_nums + + if iou_thrs is None: + _iou_thrs = np.array([0.5]) + elif isinstance(iou_thrs, list): + _iou_thrs = np.array(iou_thrs) + elif isinstance(iou_thrs, float): + _iou_thrs = np.array([iou_thrs]) + else: + _iou_thrs = iou_thrs + + return proposal_nums_, _iou_thrs + + +def eval_recalls(gts, proposals, proposal_nums=None, iou_thrs=None): + """Calculate recalls. 
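+
+    Args:
+        gts (list[np.ndarray]): per-image ground-truth boxes, each of shape (n, 4).
+        proposals (list[np.ndarray]): per-image proposals, shape (m, 4) or (m, 5)
+            with a score in the last column (used for sorting).
+        proposal_nums (int | list[int] | np.ndarray): proposal budgets to evaluate.
+        iou_thrs (float | list[float] | np.ndarray | None): IoU thresholds, default 0.5.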
""" + img_num = len(gts) + assert img_num == len(proposals) + + proposal_nums, iou_thrs = set_recall_param(proposal_nums, iou_thrs) + + all_ious = [] + for i in range(img_num): + if proposals[i].ndim == 2 and proposals[i].shape[1] == 5: + scores = proposals[i][:, 4] + sort_idx = np.argsort(scores)[::-1] + img_proposal = proposals[i][sort_idx, :] + else: + img_proposal = proposals[i] + + prop_num = min(img_proposal.shape[0], proposal_nums[-1]) + if gts[i] is None or gts[i].shape[0] == 0: + ious = np.zeros((0, img_proposal.shape[0]), dtype=np.float32) + else: + ious = bbox_overlaps( + torch.tensor(gts[i]), + torch.tensor(img_proposal[:prop_num, :4])) + ious = ious.data.numpy() + all_ious.append(ious) + all_ious = np.array(all_ious) + recalls = _recalls(all_ious, proposal_nums, iou_thrs) + return recalls diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/registry.py b/Bank_second_part/detect_process/paddlevideo/metrics/registry.py new file mode 100644 index 0000000..2214440 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/metrics/registry.py @@ -0,0 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..utils import Registry + +METRIC = Registry('metric') diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/segmentation_metric.py b/Bank_second_part/detect_process/paddlevideo/metrics/segmentation_metric.py new file mode 100644 index 0000000..3719450 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/metrics/segmentation_metric.py @@ -0,0 +1,389 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ +import numpy as np +import argparse +import pandas as pd + +from .registry import METRIC +from .base import BaseMetric +from paddlevideo.utils import get_logger + +logger = get_logger("paddlevideo") + + +def get_labels_scores_start_end_time(input_np, + frame_wise_labels, + actions_dict, + bg_class=["background", "None"]): + labels = [] + starts = [] + ends = [] + scores = [] + + boundary_score_ptr = 0 + + last_label = frame_wise_labels[0] + if frame_wise_labels[0] not in bg_class: + labels.append(frame_wise_labels[0]) + starts.append(0) + for i in range(len(frame_wise_labels)): + if frame_wise_labels[i] != last_label: + if frame_wise_labels[i] not in bg_class: + labels.append(frame_wise_labels[i]) + starts.append(i) + if last_label not in bg_class: + ends.append(i) + score = np.mean( + input_np[actions_dict[labels[boundary_score_ptr]], \ + starts[boundary_score_ptr]:(ends[boundary_score_ptr] + 1)] + ) + scores.append(score) + boundary_score_ptr = boundary_score_ptr + 1 + last_label = frame_wise_labels[i] + if last_label not in bg_class: + ends.append(i + 1) + score = np.mean( + input_np[actions_dict[labels[boundary_score_ptr]], \ + starts[boundary_score_ptr]:(ends[boundary_score_ptr] + 1)] + ) + scores.append(score) + boundary_score_ptr = boundary_score_ptr + 1 + + return labels, starts, ends, scores + + +def get_labels_start_end_time(frame_wise_labels, + bg_class=["background", "None"]): + labels = [] + starts = [] + ends = [] + last_label = frame_wise_labels[0] + if frame_wise_labels[0] not in bg_class: + labels.append(frame_wise_labels[0]) + starts.append(0) + for i in range(len(frame_wise_labels)): + if frame_wise_labels[i] != last_label: + if frame_wise_labels[i] not in bg_class: + labels.append(frame_wise_labels[i]) + starts.append(i) + if last_label not in bg_class: + ends.append(i) + last_label = frame_wise_labels[i] + if last_label not in bg_class: + ends.append(i + 1) + return labels, starts, ends + + +def levenstein(p, y, norm=False): + m_row = len(p) + n_col = len(y) + D = np.zeros([m_row + 1, n_col + 1], np.float) + for i in range(m_row + 1): + D[i, 0] = i + for i in range(n_col + 1): + D[0, i] = i + + for j in range(1, n_col + 1): + for i in range(1, m_row + 1): + if y[j - 1] == p[i - 1]: + D[i, j] = D[i - 1, j - 1] + else: + D[i, j] = min(D[i - 1, j] + 1, D[i, j - 1] + 1, + D[i - 1, j - 1] + 1) + + if norm: + score = (1 - D[-1, -1] / max(m_row, n_col)) * 100 + else: + score = D[-1, -1] + + return score + + +def edit_score(recognized, + ground_truth, + norm=True, + bg_class=["background", "None"]): + P, _, _ = get_labels_start_end_time(recognized, bg_class) + Y, _, _ = get_labels_start_end_time(ground_truth, bg_class) + return levenstein(P, Y, norm) + + +def f_score(recognized, ground_truth, overlap, bg_class=["background", "None"]): + p_label, p_start, p_end = get_labels_start_end_time(recognized, bg_class) + y_label, y_start, y_end = get_labels_start_end_time(ground_truth, bg_class) + + tp = 0 + fp = 0 + + hits = np.zeros(len(y_label)) + + for j in range(len(p_label)): + intersection = np.minimum(p_end[j], y_end) - np.maximum( + p_start[j], y_start) + union = np.maximum(p_end[j], y_end) - np.minimum(p_start[j], y_start) + IoU = (1.0 * intersection / union) * ( + [p_label[j] == y_label[x] for x in range(len(y_label))]) + # Get the best scoring segment + idx = np.array(IoU).argmax() + + if IoU[idx] >= overlap and not hits[idx]: + tp += 1 + hits[idx] = 1 + else: + fp += 1 + fn = len(y_label) - sum(hits) + return float(tp), float(fp), float(fn) + + +def 
boundary_AR(pred_boundary, gt_boundary, overlap_list, max_proposal): + + p_label, p_start, p_end, p_scores = pred_boundary + y_label, y_start, y_end, _ = gt_boundary + + # sort proposal + pred_dict = { + "label": p_label, + "start": p_start, + "end": p_end, + "scores": p_scores + } + pdf = pd.DataFrame(pred_dict) + pdf = pdf.sort_values(by="scores", ascending=False) + p_label = list(pdf["label"]) + p_start = list(pdf["start"]) + p_end = list(pdf["end"]) + p_scores = list(pdf["scores"]) + + # refine AN + if len(p_label) < max_proposal and len(p_label) > 0: + p_label = p_label + [p_label[-1]] * (max_proposal - len(p_label)) + p_start = p_start + [p_start[-1]] * (max_proposal - len(p_start)) + p_start = p_start + p_start[len(p_start) - + (max_proposal - len(p_start)):] + p_end = p_end + [p_end[-1]] * (max_proposal - len(p_end)) + p_scores = p_scores + [p_scores[-1]] * (max_proposal - len(p_scores)) + elif len(p_label) > max_proposal: + p_label[max_proposal:] = [] + p_start[max_proposal:] = [] + p_end[max_proposal:] = [] + p_scores[max_proposal:] = [] + + t_AR = np.zeros(len(overlap_list)) + + for i in range(len(overlap_list)): + overlap = overlap_list[i] + + tp = 0 + fp = 0 + hits = np.zeros(len(y_label)) + + for j in range(len(p_label)): + intersection = np.minimum(p_end[j], y_end) - np.maximum( + p_start[j], y_start) + union = np.maximum(p_end[j], y_end) - np.minimum( + p_start[j], y_start) + IoU = (1.0 * intersection / union) + # Get the best scoring segment + idx = np.array(IoU).argmax() + + if IoU[idx] >= overlap and not hits[idx]: + tp += 1 + hits[idx] = 1 + else: + fp += 1 + fn = len(y_label) - sum(hits) + + recall = float(tp) / (float(tp) + float(fn)) + t_AR[i] = recall + + AR = np.mean(t_AR) + return AR + + +@METRIC.register +class SegmentationMetric(BaseMetric): + """ + Test for Video Segmentation based model. 
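+    Reports frame-level accuracy (Acc), segmental edit score (Edit),
+    segmental F1 at the configured overlap thresholds, and boundary
+    AR@AN / AUC computed from the predicted boundaries.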
+ """ + + def __init__(self, + data_size, + batch_size, + overlap, + actions_map_file_path, + log_interval=1, + tolerance=5, + boundary_threshold=0.7, + max_proposal=100): + """prepare for metrics + """ + super().__init__(data_size, batch_size, log_interval) + # actions dict generate + file_ptr = open(actions_map_file_path, 'r') + actions = file_ptr.read().split('\n')[:-1] + file_ptr.close() + self.actions_dict = dict() + for a in actions: + self.actions_dict[a.split()[1]] = int(a.split()[0]) + + # cls score + self.overlap = overlap + self.overlap_len = len(overlap) + + self.cls_tp = np.zeros(self.overlap_len) + self.cls_fp = np.zeros(self.overlap_len) + self.cls_fn = np.zeros(self.overlap_len) + self.total_correct = 0 + self.total_edit = 0 + self.total_frame = 0 + self.total_video = 0 + + # boundary score + self.max_proposal = max_proposal + self.AR_at_AN = [[] for _ in range(max_proposal)] + + def update(self, batch_id, data, outputs): + """update metrics during each iter + """ + groundTruth = data[1] + + predicted = outputs['predict'] + output_np = outputs['output_np'] + + outputs_np = predicted.numpy() + outputs_arr = output_np.numpy()[0, :] + gt_np = groundTruth.numpy()[0, :] + + recognition = [] + for i in range(outputs_np.shape[0]): + recognition = np.concatenate((recognition, [ + list(self.actions_dict.keys())[list( + self.actions_dict.values()).index(outputs_np[i])] + ])) + recog_content = list(recognition) + + gt_content = [] + for i in range(gt_np.shape[0]): + gt_content = np.concatenate((gt_content, [ + list(self.actions_dict.keys())[list( + self.actions_dict.values()).index(gt_np[i])] + ])) + gt_content = list(gt_content) + + pred_boundary = get_labels_scores_start_end_time( + outputs_arr, recog_content, self.actions_dict) + gt_boundary = get_labels_scores_start_end_time( + np.ones(outputs_arr.shape), gt_content, self.actions_dict) + + # cls score + correct = 0 + total = 0 + edit = 0 + + for i in range(len(gt_content)): + total += 1 + #accumulate + self.total_frame += 1 + + if gt_content[i] == recog_content[i]: + correct += 1 + #accumulate + self.total_correct += 1 + + edit_num = edit_score(recog_content, gt_content) + edit += edit_num + self.total_edit += edit_num + + for s in range(self.overlap_len): + tp1, fp1, fn1 = f_score(recog_content, gt_content, self.overlap[s]) + + # accumulate + self.cls_tp[s] += tp1 + self.cls_fp[s] += fp1 + self.cls_fn[s] += fn1 + + # accumulate + self.total_video += 1 + + # proposal score + for AN in range(self.max_proposal): + AR = boundary_AR(pred_boundary, + gt_boundary, + self.overlap, + max_proposal=(AN + 1)) + self.AR_at_AN[AN].append(AR) + + def accumulate(self): + """accumulate metrics when finished all iters. 
+ """ + # cls metric + Acc = 100 * float(self.total_correct) / self.total_frame + Edit = (1.0 * self.total_edit) / self.total_video + Fscore = dict() + for s in range(self.overlap_len): + precision = self.cls_tp[s] / float(self.cls_tp[s] + self.cls_fp[s]) + recall = self.cls_tp[s] / float(self.cls_tp[s] + self.cls_fn[s]) + + f1 = 2.0 * (precision * recall) / (precision + recall) + + f1 = np.nan_to_num(f1) * 100 + Fscore[self.overlap[s]] = f1 + + # proposal metric + proposal_AUC = np.array(self.AR_at_AN) * 100 + AUC = np.mean(proposal_AUC) + AR_at_AN1 = np.mean(proposal_AUC[0, :]) + AR_at_AN5 = np.mean(proposal_AUC[4, :]) + AR_at_AN15 = np.mean(proposal_AUC[14, :]) + + # log metric + log_mertic_info = "dataset model performence: " + # preds ensemble + log_mertic_info += "Acc: {:.4f}, ".format(Acc) + log_mertic_info += 'Edit: {:.4f}, '.format(Edit) + for s in range(len(self.overlap)): + log_mertic_info += 'F1@{:0.2f}: {:.4f}, '.format( + self.overlap[s], Fscore[self.overlap[s]]) + + # boundary metric + log_mertic_info += "Auc: {:.4f}, ".format(AUC) + log_mertic_info += "AR@AN1: {:.4f}, ".format(AR_at_AN1) + log_mertic_info += "AR@AN5: {:.4f}, ".format(AR_at_AN5) + log_mertic_info += "AR@AN15: {:.4f}, ".format(AR_at_AN15) + logger.info(log_mertic_info) + + # log metric + metric_dict = dict() + metric_dict['Acc'] = Acc + metric_dict['Edit'] = Edit + for s in range(len(self.overlap)): + metric_dict['F1@{:0.2f}'.format( + self.overlap[s])] = Fscore[self.overlap[s]] + metric_dict['Auc'] = AUC + metric_dict['AR@AN1'] = AR_at_AN1 + metric_dict['AR@AN5'] = AR_at_AN5 + metric_dict['AR@AN15'] = AR_at_AN15 + + # clear for next epoch + # cls + self.cls_tp = np.zeros(self.overlap_len) + self.cls_fp = np.zeros(self.overlap_len) + self.cls_fn = np.zeros(self.overlap_len) + self.total_correct = 0 + self.total_edit = 0 + self.total_frame = 0 + self.total_video = 0 + # proposal + self.AR_at_AN = [[] for _ in range(self.max_proposal)] + + return metric_dict diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/skeleton_metric.py b/Bank_second_part/detect_process/paddlevideo/metrics/skeleton_metric.py new file mode 100644 index 0000000..7978478 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/metrics/skeleton_metric.py @@ -0,0 +1,96 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +import numpy as np +import paddle +import csv +import paddle.nn.functional as F + +from .registry import METRIC +from .base import BaseMetric +from paddlevideo.utils import get_logger + +logger = get_logger("paddlevideo") + + +@METRIC.register +class SkeletonMetric(BaseMetric): + """ + Test for Skeleton based model. + note: only support batch size = 1, single card test. + + Args: + out_file: str, file to save test results. 
+ """ + + def __init__(self, + data_size, + batch_size, + out_file='submission.csv', + log_interval=1, + top_k=5): + """prepare for metrics + """ + super().__init__(data_size, batch_size, log_interval) + self.top1 = [] + self.top5 = [] + self.values = [] + self.out_file = out_file + self.k = top_k + + def update(self, batch_id, data, outputs): + """update metrics during each iter + """ + if data[0].shape[0] != outputs.shape[0]: + num_segs = data[0].shape[1] + batch_size = outputs.shape[0] + outputs = outputs.reshape( + [batch_size // num_segs, num_segs, outputs.shape[-1]]) + outputs = outputs.mean(axis=1) + if len(data) == 2: # data with label + labels = data[1] + top1 = paddle.metric.accuracy(input=outputs, label=labels, k=1) + top5 = paddle.metric.accuracy(input=outputs, label=labels, k=self.k) + if self.world_size > 1: + top1 = paddle.distributed.all_reduce( + top1, op=paddle.distributed.ReduceOp.SUM) / self.world_size + top5 = paddle.distributed.all_reduce( + top5, op=paddle.distributed.ReduceOp.SUM) / self.world_size + self.top1.append(top1.numpy()) + self.top5.append(top5.numpy()) + else: # data without label, only support batch_size=1. Used for fsd-10. + prob = F.softmax(outputs) + clas = paddle.argmax(prob, axis=1).numpy()[0] + self.values.append((batch_id, clas)) + + # preds ensemble + if batch_id % self.log_interval == 0: + logger.info("[TEST] Processing batch {}/{} ...".format( + batch_id, + self.data_size // (self.batch_size * self.world_size))) + + def accumulate(self): + """accumulate metrics when finished all iters. + """ + if self.top1: # data with label + logger.info('[TEST] finished, avg_acc1= {}, avg_acc5= {}'.format( + np.mean(np.array(self.top1)), np.mean(np.array(self.top5)))) + else: + headers = ['sample_index', 'predict_category'] + with open( + self.out_file, + 'w', + ) as fp: + writer = csv.writer(fp) + writer.writerow(headers) + writer.writerows(self.values) + logger.info("Results saved in {} !".format(self.out_file)) diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/transnetv2_metric.py b/Bank_second_part/detect_process/paddlevideo/metrics/transnetv2_metric.py new file mode 100644 index 0000000..3370881 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/metrics/transnetv2_metric.py @@ -0,0 +1,174 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ +import numpy as np + +from .registry import METRIC +from .base import BaseMetric +from paddlevideo.utils import get_logger + +logger = get_logger("paddlevideo") + + +def predictions_to_scenes(predictions): + scenes = [] + t, t_prev, start = -1, 0, 0 + for i, t in enumerate(predictions): + if t_prev == 1 and t == 0: + start = i + if t_prev == 0 and t == 1 and i != 0: + scenes.append([start, i]) + t_prev = t + if t == 0: + scenes.append([start, i]) + + # just fix if all predictions are 1 + if len(scenes) == 0: + return np.array([[0, len(predictions) - 1]], dtype=np.int32) + + return np.array(scenes, dtype=np.int32) + + +def evaluate_scenes(gt_scenes, pred_scenes, n_frames_miss_tolerance=2): + """ + Adapted from: https://github.com/gyglim/shot-detection-evaluation + The original based on: http://imagelab.ing.unimore.it/imagelab/researchActivity.asp?idActivity=19 + + n_frames_miss_tolerance: + Number of frames it is possible to miss ground truth by, and still being counted as a correct detection. + + Examples of computation with different tolerance margin: + n_frames_miss_tolerance = 0 + pred_scenes: [[0, 5], [6, 9]] -> pred_trans: [[5.5, 5.5]] + gt_scenes: [[0, 5], [6, 9]] -> gt_trans: [[5.5, 5.5]] -> HIT + gt_scenes: [[0, 4], [5, 9]] -> gt_trans: [[4.5, 4.5]] -> MISS + n_frames_miss_tolerance = 1 + pred_scenes: [[0, 5], [6, 9]] -> pred_trans: [[5.0, 6.0]] + gt_scenes: [[0, 5], [6, 9]] -> gt_trans: [[5.0, 6.0]] -> HIT + gt_scenes: [[0, 4], [5, 9]] -> gt_trans: [[4.0, 5.0]] -> HIT + gt_scenes: [[0, 3], [4, 9]] -> gt_trans: [[3.0, 4.0]] -> MISS + n_frames_miss_tolerance = 2 + pred_scenes: [[0, 5], [6, 9]] -> pred_trans: [[4.5, 6.5]] + gt_scenes: [[0, 5], [6, 9]] -> gt_trans: [[4.5, 6.5]] -> HIT + gt_scenes: [[0, 4], [5, 9]] -> gt_trans: [[3.5, 5.5]] -> HIT + gt_scenes: [[0, 3], [4, 9]] -> gt_trans: [[2.5, 4.5]] -> HIT + gt_scenes: [[0, 2], [3, 9]] -> gt_trans: [[1.5, 3.5]] -> MISS + + Users should be careful about adopting these functions in any commercial matters. 
+ """ + + shift = n_frames_miss_tolerance / 2 + gt_scenes = gt_scenes.astype(np.float32) + np.array([[-0.5 + shift, 0.5 - shift]]) + pred_scenes = pred_scenes.astype(np.float32) + np.array([[-0.5 + shift, 0.5 - shift]]) + + gt_trans = np.stack([gt_scenes[:-1, 1], gt_scenes[1:, 0]], 1) + pred_trans = np.stack([pred_scenes[:-1, 1], pred_scenes[1:, 0]], 1) + + i, j = 0, 0 + tp, fp, fn = 0, 0, 0 + + while i < len(gt_trans) or j < len(pred_trans): + if j == len(pred_trans) or pred_trans[j, 0] > gt_trans[i, 1]: + fn += 1 + i += 1 + elif i == len(gt_trans) or pred_trans[j, 1] < gt_trans[i, 0]: + fp += 1 + j += 1 + else: + i += 1 + j += 1 + tp += 1 + + if tp + fp != 0: + p = tp / (tp + fp) + else: + p = 0 + + if tp + fn != 0: + r = tp / (tp + fn) + else: + r = 0 + + if p + r != 0: + f1 = (p * r * 2) / (p + r) + else: + f1 = 0 + + assert tp + fn == len(gt_trans) + assert tp + fp == len(pred_trans) + + return p, r, f1, (tp, fp, fn) + + +def create_scene_based_summaries(one_hot_pred, one_hot_gt): + thresholds = np.array([ + 0.02, 0.06, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9 + ]) + precision, recall, f1, tp, fp, fn = np.zeros_like(thresholds), np.zeros_like(thresholds),\ + np.zeros_like(thresholds), np.zeros_like(thresholds),\ + np.zeros_like(thresholds), np.zeros_like(thresholds) + + gt_scenes = predictions_to_scenes(one_hot_gt) + for i in range(len(thresholds)): + pred_scenes = predictions_to_scenes( + (one_hot_pred > thresholds[i]).astype(np.uint8) + ) + precision[i], recall[i], f1[i], (tp[i], fp[i], fn[i]) = evaluate_scenes(gt_scenes, pred_scenes) + + best_idx = np.argmax(f1) + + return f1[best_idx] + + +@METRIC.register +class TransNetV2Metric(BaseMetric): + def __init__(self, data_size, batch_size, log_interval=1): + """prepare for metrics + """ + super().__init__(data_size, batch_size, log_interval) + self.predictions = [] + self.total_stats = {"tp": 0, "fp": 0, "fn": 0} + + def update(self, batch_id, data, one_hot): + """update metrics during each iter + """ + if isinstance(one_hot, tuple): + one_hot = one_hot[0] + one_hot = paddle.nn.functional.sigmoid(one_hot)[0] + self.predictions.append(one_hot.numpy()[25:75]) + gt_scenes = data[1] + is_new_file = data[2] + if is_new_file: + self.compute(gt_scenes) + # preds ensemble + if batch_id % self.log_interval == 0: + logger.info("[TEST] Processing batch {}/{} ...".format( + batch_id, + self.data_size // (self.batch_size * self.world_size))) + + def compute(self, gt_scenes): + predictions = np.concatenate(self.predictions, 0)[:len(frames)] + _, _, _, (tp, fp, fn), fp_mistakes, fn_mistakes = evaluate_scenes( + gt_scenes, predictions_to_scenes((predictions >= args.thr).astype(np.uint8))) + + self.total_stats["tp"] += tp + self.total_stats["fp"] += fp + self.total_stats["fn"] += fn + + def accumulate(self): + """accumulate metrics when finished all iters. + """ + p = self.total_stats["tp"] / (self.total_stats["tp"] + self.total_stats["fp"]) + r = self.total_stats["tp"] / (self.total_stats["tp"] + self.total_stats["fn"]) + f1 = (p * r * 2) / (p + r) + logger.info('[TEST] finished, Precision= {:5.2f}, Recall= {:5.2f} , F1 Score= {:5.2f} '.format( + p * 100, r * 100, f1 * 100)) \ No newline at end of file diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/ucf24_utils.py b/Bank_second_part/detect_process/paddlevideo/metrics/ucf24_utils.py new file mode 100644 index 0000000..6552645 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/metrics/ucf24_utils.py @@ -0,0 +1,783 @@ +# copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Forked from: https://github.com/rafaelpadilla/Object-Detection-Metrics +# Developed by: Rafael Padilla (rafael.padilla@smt.ufrj.br) + +import glob +import os +import shutil +import sys +from collections import Counter +import numpy as np +from enum import Enum +import cv2 + + +class MethodAveragePrecision(Enum): + """ + Class representing if the coordinates are relative to the + image size or are absolute values. + + Developed by: Rafael Padilla + Last modification: Apr 28 2018 + """ + EveryPointInterpolation = 1 + ElevenPointInterpolation = 2 + + +class CoordinatesType(Enum): + """ + Class representing if the coordinates are relative to the + image size or are absolute values. + + Developed by: Rafael Padilla + Last modification: Apr 28 2018 + """ + Relative = 1 + Absolute = 2 + + +class BBType(Enum): + """ + Class representing if the bounding box is groundtruth or not. + + Developed by: Rafael Padilla + Last modification: May 24 2018 + """ + GroundTruth = 1 + Detected = 2 + + +class BBFormat(Enum): + """ + Class representing the format of a bounding box. + It can be (X,Y,width,height) => XYWH + or (X1,Y1,X2,Y2) => XYX2Y2 + Developed by: Rafael Padilla + Last modification: May 24 2018 + """ + XYWH = 1 + XYX2Y2 = 2 + + +def convertToRelativeValues(size, box): + dw = 1. / (size[0]) + dh = 1. 
/ (size[1]) + cx = (box[1] + box[0]) / 2.0 + cy = (box[3] + box[2]) / 2.0 + w = box[1] - box[0] + h = box[3] - box[2] + x = cx * dw + y = cy * dh + w = w * dw + h = h * dh + return x, y, w, h + + +def convertToAbsoluteValues(size, box): + xIn = round(((2 * float(box[0]) - float(box[2])) * size[0] / 2)) + yIn = round(((2 * float(box[1]) - float(box[3])) * size[1] / 2)) + xEnd = xIn + round(float(box[2]) * size[0]) + yEnd = yIn + round(float(box[3]) * size[1]) + if xIn < 0: + xIn = 0 + if yIn < 0: + yIn = 0 + if xEnd >= size[0]: + xEnd = size[0] - 1 + if yEnd >= size[1]: + yEnd = size[1] - 1 + return xIn, yIn, xEnd, yEnd + + +def add_bb_into_image(image, bb, color=(255, 0, 0), thickness=2, label=None): + r = int(color[0]) + g = int(color[1]) + b = int(color[2]) + + font = cv2.FONT_HERSHEY_SIMPLEX + fontScale = 0.5 + fontThickness = 1 + + x1, y1, x2, y2 = bb.getAbsoluteBoundingBox(BBFormat.XYX2Y2) + x1 = int(x1) + y1 = int(y1) + x2 = int(x2) + y2 = int(y2) + cv2.rectangle(image, (x1, y1), (x2, y2), (b, g, r), thickness) + # Add label + if label is not None: + # Get size of the text box + (tw, th) = cv2.getTextSize(label, font, fontScale, fontThickness)[0] + # Top-left coord of the textbox + (xin_bb, yin_bb) = (x1 + thickness, y1 - th + int(12.5 * fontScale)) + # Checking position of the text top-left (outside or inside the bb) + if yin_bb - th <= 0: # if outside the image + yin_bb = y1 + th # put it inside the bb + r_Xin = x1 - int(thickness / 2) + r_Yin = y1 - th - int(thickness / 2) + # Draw filled rectangle to put the text in it + cv2.rectangle(image, (r_Xin, r_Yin - thickness), + (r_Xin + tw + thickness * 3, r_Yin + th + int(12.5 * fontScale)), (b, g, r), + -1) + cv2.putText(image, label, (xin_bb, yin_bb), font, fontScale, (0, 0, 0), fontThickness, + cv2.LINE_AA) + return image + + +class BoundingBox: + def __init__(self, + imageName, + classId, + x, + y, + w, + h, + typeCoordinates=None, + imgSize=None, + bbType=None, + classConfidence=None, + format=None): + """Constructor. + Args: + imageName: String representing the image name. + classId: String value representing class id. + x: Float value representing the X upper-left coordinate of the bounding box. + y: Float value representing the Y upper-left coordinate of the bounding box. + w: Float value representing the width bounding box. + h: Float value representing the height bounding box. + typeCoordinates: (optional) Enum (Relative or Absolute) represents if the bounding box + coordinates (x,y,w,h) are absolute or relative to size of the image. Default:'Absolute'. + imgSize: (optional) 2D vector (width, height)=>(int, int) represents the size of the + image of the bounding box. If typeCoordinates is 'Relative', imgSize is required. + bbType: (optional) Enum (Groundtruth or Detection) identifies if the bounding box + represents a ground truth or a detection. If it is a detection, the classConfidence has + to be informed. + classConfidence: (optional) Float value representing the confidence of the detected + class. If detectionType is Detection, classConfidence needs to be informed. + format: (optional) Enum (BBFormat.XYWH or BBFormat.XYX2Y2) indicating the format of the + coordinates of the bounding boxes. BBFormat.XYWH: + BBFormat.XYX2Y2: . + """ + self._imageName = imageName + self._typeCoordinates = typeCoordinates + if typeCoordinates == CoordinatesType.Relative and imgSize is None: + raise IOError( + 'Parameter \'imgSize\' is required. 
It is necessary to inform the image size.') + if bbType == BBType.Detected and classConfidence is None: + raise IOError( + 'For bbType=\'Detection\', it is necessary to inform the classConfidence value.') + + self._classConfidence = classConfidence + self._bbType = bbType + self._classId = classId + self._format = format + + # If relative coordinates, convert to absolute values + # For relative coords: (x,y,w,h)=(X_center/img_width , Y_center/img_height) + if typeCoordinates == CoordinatesType.Relative: + (self._x, self._y, self._w, self._h) = convertToAbsoluteValues(imgSize, (x, y, w, h)) + self._width_img = imgSize[0] + self._height_img = imgSize[1] + if format == BBFormat.XYWH: + self._x2 = self._w + self._y2 = self._h + self._w = self._x2 - self._x + self._h = self._y2 - self._y + else: + raise IOError( + 'For relative coordinates, the format must be XYWH (x,y,width,height)') + # For absolute coords: (x,y,w,h)=real bb coords + else: + self._x = x + self._y = y + if format == BBFormat.XYWH: + self._w = w + self._h = h + self._x2 = self._x + self._w + self._y2 = self._y + self._h + else: # format == BBFormat.XYX2Y2: . + self._x2 = w + self._y2 = h + self._w = self._x2 - self._x + self._h = self._y2 - self._y + if imgSize is None: + self._width_img = None + self._height_img = None + else: + self._width_img = imgSize[0] + self._height_img = imgSize[1] + + def getAbsoluteBoundingBox(self, format=None): + if format == BBFormat.XYWH: + return self._x, self._y, self._w, self._h + elif format == BBFormat.XYX2Y2: + return self._x, self._y, self._x2, self._y2 + + def getRelativeBoundingBox(self, imgSize=None): + if imgSize is None and self._width_img is None and self._height_img is None: + raise IOError( + 'Parameter \'imgSize\' is required. It is necessary to inform the image size.') + if imgSize is None: + return convertToRelativeValues((imgSize[0], imgSize[1]), + (self._x, self._y, self._w, self._h)) + else: + return convertToRelativeValues((self._width_img, self._height_img), + (self._x, self._y, self._w, self._h)) + + def getImageName(self): + return self._imageName + + def getConfidence(self): + return self._classConfidence + + def getFormat(self): + return self._format + + def getClassId(self): + return self._classId + + def getImageSize(self): + return self._width_img, self._height_img + + def getCoordinatesType(self): + return self._typeCoordinates + + def getBBType(self): + return self._bbType + + @staticmethod + def compare(det1, det2): + det1BB = det1.getAbsoluteBoundingBox(format=BBFormat.XYWH) + det1ImgSize = det1.getImageSize() + det2BB = det2.getAbsoluteBoundingBox(format=BBFormat.XYWH) + det2ImgSize = det2.getImageSize() + + if det1.getClassId() == det2.getClassId() and \ + det1.classConfidence == det2.classConfidenc() and \ + det1BB[0] == det2BB[0] and \ + det1BB[1] == det2BB[1] and \ + det1BB[2] == det2BB[2] and \ + det1BB[3] == det2BB[3] and \ + det1ImgSize[0] == det1ImgSize[0] and \ + det2ImgSize[1] == det2ImgSize[1]: + return True + return False + + @staticmethod + def clone(boundingBox): + absBB = boundingBox.getAbsoluteBoundingBox(format=BBFormat.XYWH) + newBoundingBox = BoundingBox( + boundingBox.getImageName(), + boundingBox.getClassId(), + absBB[0], + absBB[1], + absBB[2], + absBB[3], + typeCoordinates=boundingBox.getCoordinatesType(), + imgSize=boundingBox.getImageSize(), + bbType=boundingBox.getBBType(), + classConfidence=boundingBox.getConfidence(), + format=BBFormat.XYWH) + return newBoundingBox + + +class BoundingBoxes: + def __init__(self): + self._boundingBoxes = 
[] + + def addBoundingBox(self, bb): + self._boundingBoxes.append(bb) + + def removeBoundingBox(self, _boundingBox): + for d in self._boundingBoxes: + if BoundingBox.compare(d, _boundingBox): + del self._boundingBoxes[d] + return + + def removeAllBoundingBoxes(self): + self._boundingBoxes = [] + + def getBoundingBoxes(self): + return self._boundingBoxes + + def getBoundingBoxByClass(self, classId): + boundingBoxes = [] + for d in self._boundingBoxes: + if d.getClassId() == classId: # get only specified bounding box type + boundingBoxes.append(d) + return boundingBoxes + + def getClasses(self): + classes = [] + for d in self._boundingBoxes: + c = d.getClassId() + if c not in classes: + classes.append(c) + return classes + + def getBoundingBoxesByType(self, bbType): + # get only specified bb type + return [d for d in self._boundingBoxes if d.getBBType() == bbType] + + def getBoundingBoxesByImageName(self, imageName): + # get only specified bb type + return [d for d in self._boundingBoxes if d.getImageName() == imageName] + + def count(self, bbType=None): + if bbType is None: # Return all bounding boxes + return len(self._boundingBoxes) + count = 0 + for d in self._boundingBoxes: + if d.getBBType() == bbType: # get only specified bb type + count += 1 + return count + + def clone(self): + newBoundingBoxes = BoundingBoxes() + for d in self._boundingBoxes: + det = BoundingBox.clone(d) + newBoundingBoxes.addBoundingBox(det) + return newBoundingBoxes + + def drawAllBoundingBoxes(self, image, imageName): + bbxes = self.getBoundingBoxesByImageName(imageName) + for bb in bbxes: + if bb.getBBType() == BBType.GroundTruth: # if ground truth + image = add_bb_into_image(image, bb, color=(0, 255, 0)) # green + else: # if detection + image = add_bb_into_image(image, bb, color=(255, 0, 0)) # red + return image + + +class Evaluator: + def GetPascalVOCMetrics(self, + boundingboxes, + IOUThreshold=0.5, + method=None): + """Get the metrics used by the VOC Pascal 2012 challenge. + Get + Args: + boundingboxes: Object of the class BoundingBoxes representing ground truth and detected + bounding boxes; + IOUThreshold: IOU threshold indicating which detections will be considered TP or FP + (default value = 0.5); + method (default = EveryPointInterpolation): It can be calculated as the implementation + in the official PASCAL VOC toolkit (EveryPointInterpolation), or applying the 11-point + interpolatio as described in the paper "The PASCAL Visual Object Classes(VOC) Challenge" + or EveryPointInterpolation" (ElevenPointInterpolation); + Returns: + A list of dictionaries. Each dictionary contains information and metrics of each class. 
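# --- Illustrative usage sketch (not part of the original Object-Detection-Metrics
# fork): how the BoundingBox / BoundingBoxes API above is typically driven.
# Image name, class label, coordinates and the 0.91 confidence are made-up values.
gt = BoundingBox("frame_0001", "hand", 10, 20, 50, 80,
                 typeCoordinates=CoordinatesType.Absolute,
                 bbType=BBType.GroundTruth, format=BBFormat.XYWH)
det = BoundingBox("frame_0001", "hand", 12, 22, 60, 98,
                  typeCoordinates=CoordinatesType.Absolute,
                  bbType=BBType.Detected, classConfidence=0.91,
                  format=BBFormat.XYX2Y2)
boxes = BoundingBoxes()
boxes.addBoundingBox(gt)
boxes.addBoundingBox(det)
print(gt.getAbsoluteBoundingBox(BBFormat.XYX2Y2))  # (10, 20, 60, 100)
print(boxes.count(BBType.Detected))                # 1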
+ The keys of each dictionary are: + dict['class']: class representing the current dictionary; + dict['precision']: array with the precision values; + dict['recall']: array with the recall values; + dict['AP']: average precision; + dict['interpolated precision']: interpolated precision values; + dict['interpolated recall']: interpolated recall values; + dict['total positives']: total number of ground truth positives; + dict['total TP']: total number of True Positive detections; + dict['total FP']: total number of False Negative detections; + """ + ret = [] # list containing metrics (precision, recall, average precision) of each class + # List with all ground truths (Ex: [imageName,class,confidence=1, (bb coordinates XYX2Y2)]) + groundTruths = [] + # List with all detections (Ex: [imageName,class,confidence,(bb coordinates XYX2Y2)]) + detections = [] + # Get all classes + classes = [] + # Loop through all bounding boxes and separate them into GTs and detections + for bb in boundingboxes.getBoundingBoxes(): + # [imageName, class, confidence, (bb coordinates XYX2Y2)] + if bb.getBBType() == BBType.GroundTruth: + groundTruths.append([ + bb.getImageName(), + bb.getClassId(), 1, + bb.getAbsoluteBoundingBox(BBFormat.XYX2Y2) + ]) + else: + detections.append([ + bb.getImageName(), + bb.getClassId(), + bb.getConfidence(), + bb.getAbsoluteBoundingBox(BBFormat.XYX2Y2) + ]) + # get class + if bb.getClassId() not in classes: + classes.append(bb.getClassId()) + classes = sorted(classes) + # Precision x Recall is obtained individually by each class + # Loop through by classes + for c in classes: + # Get only detection of class c + dects = [] + [dects.append(d) for d in detections if d[1] == c] + # Get only ground truths of class c + gts = [] + [gts.append(g) for g in groundTruths if g[1] == c] + npos = len(gts) + # sort detections by decreasing confidence + dects = sorted(dects, key=lambda conf: conf[2], reverse=True) + TP = np.zeros(len(dects)) + FP = np.zeros(len(dects)) + # create dictionary with amount of gts for each image + det = Counter([cc[0] for cc in gts]) + for key, val in det.items(): + det[key] = np.zeros(val) + # Loop through detections + for d in range(len(dects)): + # Find ground truth image + gt = [gt for gt in gts if gt[0] == dects[d][0]] + iouMax = sys.float_info.min + for j in range(len(gt)): + iou = Evaluator.iou(dects[d][3], gt[j][3]) + if iou > iouMax: + iouMax = iou + jmax = j + # Assign detection as true positive/don't care/false positive + if iouMax >= IOUThreshold: + if det[dects[d][0]][jmax] == 0: + TP[d] = 1 # count as true positive + det[dects[d][0]][jmax] = 1 # flag as already 'seen' + else: + FP[d] = 1 # count as false positive + # - A detected "cat" is overlaped with a GT "cat" with IOU >= IOUThreshold. 
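# Worked toy numbers (illustration only) for the precision/recall bookkeeping
# computed just below: one class, npos = 3 ground-truth boxes, three detections
# already sorted by confidence and matched with the greedy IoU rule above.
#   TP      = [1, 0, 1]          FP      = [0, 1, 0]
#   acc_TP  = [1, 1, 2]          acc_FP  = [0, 1, 1]
#   recall    = acc_TP / npos              -> [0.33, 0.33, 0.67]
#   precision = acc_TP / (acc_TP + acc_FP) -> [1.00, 0.50, 0.67]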
+ else: + FP[d] = 1 # count as false positive + # compute precision, recall and average precision + acc_FP = np.cumsum(FP) + acc_TP = np.cumsum(TP) + rec = acc_TP / npos + prec = np.divide(acc_TP, (acc_FP + acc_TP)) + # Depending on the method, call the right implementation + if method == MethodAveragePrecision.EveryPointInterpolation: + [ap, mpre, mrec, ii] = Evaluator.CalculateAveragePrecision(rec, prec) + else: + [ap, mpre, mrec, _] = Evaluator.ElevenPointInterpolatedAP(rec, prec) + # add class result in the dictionary to be returned + r = { + 'class': c, + 'precision': prec, + 'recall': rec, + 'AP': ap, + 'interpolated precision': mpre, + 'interpolated recall': mrec, + 'total positives': npos, + 'total TP': np.sum(TP), + 'total FP': np.sum(FP) + } + ret.append(r) + return ret + + @staticmethod + def CalculateAveragePrecision(rec, prec): + mrec = [0] + [mrec.append(e) for e in rec] + mrec.append(1) + mpre = [0] + [mpre.append(e) for e in prec] + mpre.append(0) + for i in range(len(mpre) - 1, 0, -1): + mpre[i - 1] = max(mpre[i - 1], mpre[i]) + ii = [] + for i in range(len(mrec) - 1): + if mrec[1:][i] != mrec[0:-1][i]: + ii.append(i + 1) + ap = 0 + for i in ii: + ap = ap + np.sum((mrec[i] - mrec[i - 1]) * mpre[i]) + return [ap, mpre[0:len(mpre) - 1], mrec[0:len(mpre) - 1], ii] + + @staticmethod + # 11-point interpolated average precision + def ElevenPointInterpolatedAP(rec, prec): + mrec = [] + [mrec.append(e) for e in rec] + mpre = [] + [mpre.append(e) for e in prec] + recallValues = np.linspace(0, 1, 11) + recallValues = list(recallValues[::-1]) + rhoInterp = [] + recallValid = [] + for r in recallValues: + # Obtain all recall values higher or equal than r + argGreaterRecalls = np.argwhere(mrec[:] >= r) + pmax = 0 + # If there are recalls above r + if argGreaterRecalls.size != 0: + pmax = max(mpre[argGreaterRecalls.min():]) + recallValid.append(r) + rhoInterp.append(pmax) + # By definition AP = sum(max(precision whose recall is above r))/11 + ap = sum(rhoInterp) / 11 + # Generating values for the plot + rvals = [recallValid[0]] + [rvals.append(e) for e in recallValid] + rvals.append(0) + pvals = [0] + [pvals.append(e) for e in rhoInterp] + pvals.append(0) + # rhoInterp = rhoInterp[::-1] + cc = [] + for i in range(len(rvals)): + p = (rvals[i], pvals[i - 1]) + if p not in cc: + cc.append(p) + p = (rvals[i], pvals[i]) + if p not in cc: + cc.append(p) + recallValues = [i[0] for i in cc] + rhoInterp = [i[1] for i in cc] + return [ap, rhoInterp, recallValues, None] + + # For each detections, calculate IOU with reference + @staticmethod + def _getAllIOUs(reference, detections): + ret = [] + bbReference = reference.getAbsoluteBoundingBox(BBFormat.XYX2Y2) + # img = np.zeros((200,200,3), np.uint8) + for d in detections: + bb = d.getAbsoluteBoundingBox(BBFormat.XYX2Y2) + iou = Evaluator.iou(bbReference, bb) + ret.append((iou, reference, d)) # iou, reference, detection + return sorted(ret, key=lambda i: i[0], reverse=True) # sort by iou (from highest to lowest) + + @staticmethod + def iou(boxA, boxB): + # if boxes dont intersect + if Evaluator._boxesIntersect(boxA, boxB) is False: + return 0 + interArea = Evaluator._getIntersectionArea(boxA, boxB) + union = Evaluator._getUnionAreas(boxA, boxB, interArea=interArea) + # intersection over union + iou = interArea / union + assert iou >= 0 + return iou + + @staticmethod + def _boxesIntersect(boxA, boxB): + if boxA[0] > boxB[2]: + return False # boxA is right of boxB + if boxB[0] > boxA[2]: + return False # boxA is left of boxB + if boxA[3] < boxB[1]: + 
return False # boxA is above boxB + if boxA[1] > boxB[3]: + return False # boxA is below boxB + return True + + @staticmethod + def _getIntersectionArea(boxA, boxB): + xA = max(boxA[0], boxB[0]) + yA = max(boxA[1], boxB[1]) + xB = min(boxA[2], boxB[2]) + yB = min(boxA[3], boxB[3]) + # intersection area + return (xB - xA + 1) * (yB - yA + 1) + + @staticmethod + def _getUnionAreas(boxA, boxB, interArea=None): + area_A = Evaluator._getArea(boxA) + area_B = Evaluator._getArea(boxB) + if interArea is None: + interArea = Evaluator._getIntersectionArea(boxA, boxB) + return float(area_A + area_B - interArea) + + @staticmethod + def _getArea(box): + return (box[2] - box[0] + 1) * (box[3] - box[1] + 1) + + +# Validate formats +def ValidateFormats(argFormat, argName, errors): + if argFormat == 'xywh': + return BBFormat.XYWH + elif argFormat == 'xyrb': + return BBFormat.XYX2Y2 + elif argFormat is None: + return BBFormat.XYWH # default when nothing is passed + else: + errors.append( + 'argument %s: invalid value. It must be either \'xywh\' or \'xyrb\'' % argName) + + +# Validate mandatory args +def ValidateMandatoryArgs(arg, argName, errors): + if arg is None: + errors.append('argument %s: required argument' % argName) + else: + return True + + +def ValidateImageSize(arg, argName, argInformed, errors): + errorMsg = 'argument %s: required argument if %s is relative' % (argName, argInformed) + ret = None + if arg is None: + errors.append(errorMsg) + else: + arg = arg.replace('(', '').replace(')', '') + args = arg.split(',') + if len(args) != 2: + errors.append( + '%s. It must be in the format \'width,height\' (e.g. \'600,400\')' % errorMsg) + else: + if not args[0].isdigit() or not args[1].isdigit(): + errors.append( + '%s. It must be in INdiaTEGER the format \'width,height\' (e.g. \'600,400\')' % + errorMsg) + else: + ret = (int(args[0]), int(args[1])) + return ret + + +# Validate coordinate types +def ValidateCoordinatesTypes(arg, argName, errors): + if arg == 'abs': + return CoordinatesType.Absolute + elif arg == 'rel': + return CoordinatesType.Relative + elif arg is None: + return CoordinatesType.Absolute # default when nothing is passed + errors.append('argument %s: invalid value. 
It must be either \'rel\' or \'abs\'' % argName) + + +def getBoundingBoxes(directory, + isGT, + bbFormat, + coordType, + allBoundingBoxes=None, + allClasses=None, + imgSize=(0, 0)): + """Read txt files containing bounding boxes (ground truth and detections).""" + print(directory) + if allBoundingBoxes is None: + allBoundingBoxes = BoundingBoxes() + if allClasses is None: + allClasses = [] + # Read ground truths + os.chdir(directory) + files = glob.glob("*.txt") + files.sort() + + for f in files: + nameOfImage = f.replace(".txt", "") + fh1 = open(f, "r") + for line in fh1: + line = line.replace("\n", "") + if line.replace(' ', '') == '': + continue + splitLine = line.split(" ") + if isGT: + idClass = (splitLine[0]) # class + x = float(splitLine[1]) + y = float(splitLine[2]) + w = float(splitLine[3]) + h = float(splitLine[4]) + bb = BoundingBox( + nameOfImage, + idClass, + x, + y, + w, + h, + coordType, + imgSize, + BBType.GroundTruth, + format=bbFormat) + else: + idClass = (splitLine[0]) # class + confidence = float(splitLine[1]) + x = float(splitLine[2]) + y = float(splitLine[3]) + w = float(splitLine[4]) + h = float(splitLine[5]) + bb = BoundingBox( + nameOfImage, + idClass, + x, + y, + w, + h, + coordType, + imgSize, + BBType.Detected, + confidence, + format=bbFormat) + allBoundingBoxes.addBoundingBox(bb) + if idClass not in allClasses: + allClasses.append(idClass) + fh1.close() + return allBoundingBoxes, allClasses + + +def get_mAP(gtFolder, detFolder, threshold=0.5, savePath=None): + gtFormat = 'xyrb' + detFormat = 'xyrb' + gtCoordinates = 'abs' + detCoordinates = 'abs' + gtFolder = os.path.join(os.path.abspath('.'), gtFolder) + detFolder = os.path.join(os.path.abspath('.'), detFolder) + + iouThreshold = threshold + + # Arguments validation + errors = [] + # Validate formats + gtFormat = ValidateFormats(gtFormat, 'gtFormat', errors) + detFormat = ValidateFormats(detFormat, '-detformat', errors) + + # Coordinates types + gtCoordType = ValidateCoordinatesTypes(gtCoordinates, '-gtCoordinates', errors) + detCoordType = ValidateCoordinatesTypes(detCoordinates, '-detCoordinates', errors) + imgSize = (0, 0) + + # Create directory to save results + shutil.rmtree(savePath, ignore_errors=True) # Clear folder + if savePath is not None: + os.makedirs(savePath) + + # Get groundtruth boxes + allBoundingBoxes, allClasses = getBoundingBoxes( + gtFolder, True, gtFormat, gtCoordType, imgSize=imgSize) + # Get detected boxes + allBoundingBoxes, allClasses = getBoundingBoxes( + detFolder, False, detFormat, detCoordType, allBoundingBoxes, allClasses, imgSize=imgSize) + allClasses.sort() + + evaluator = Evaluator() + acc_AP = 0 + validClasses = 0 + + # Plot Precision x Recall curve + detections = evaluator.GetPascalVOCMetrics(allBoundingBoxes, iouThreshold, + method=MethodAveragePrecision.EveryPointInterpolation) + + # each detection is a class and store AP and mAP results in AP_res list + AP_res = [] + for metricsPerClass in detections: + # Get metric values per each class + cl = metricsPerClass['class'] + ap = metricsPerClass['AP'] + totalPositives = metricsPerClass['total positives'] + + if totalPositives > 0: + validClasses = validClasses + 1 + acc_AP = acc_AP + ap + ap_str = "{0:.2f}%".format(ap * 100) + AP_res.append('AP: %s (%s)' % (ap_str, cl)) + mAP = acc_AP / validClasses + mAP_str = "{0:.2f}%".format(mAP * 100) + AP_res.append('mAP: %s' % mAP_str) + return AP_res \ No newline at end of file diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/vos_metric.py 
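# Hedged usage sketch for get_mAP above; the folder names are hypothetical.
# Both folders hold one .txt file per frame: ground-truth lines are
# "<class> <x1> <y1> <x2> <y2>", detection lines are
# "<class> <confidence> <x1> <y1> <x2> <y2>", matching the 'xyrb'/absolute
# settings hard-coded in get_mAP.
ap_lines = get_mAP("groundtruths_ucf", "detections_ucf",
                   threshold=0.5, savePath="mAP_out")
for line in ap_lines:   # one "AP: ...% (<class>)" entry per class, then "mAP: ...%"
    print(line)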
b/Bank_second_part/detect_process/paddlevideo/metrics/vos_metric.py new file mode 100644 index 0000000..54eadb8 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/metrics/vos_metric.py @@ -0,0 +1,276 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +import os +import paddle +import zipfile +import time +from PIL import Image + +from paddle.io import DataLoader + +from .registry import METRIC +from .base import BaseMetric +from paddlevideo.utils import get_logger + +logger = get_logger("paddlevideo") + + +@METRIC.register +class VOSMetric(BaseMetric): + def __init__(self, + data_size, + batch_size, + result_root, + zip_dir, + log_interval=1): + """prepare for metrics + """ + super().__init__(data_size, batch_size, log_interval) + self.video_num = 0 + self.total_time = 0 + self.total_frame = 0 + self.total_sfps = 0 + self.total_video_num = data_size + self.count = 0 + self.result_root = result_root + self.zip_dir = zip_dir + + def update(self, batch_id, data, model): + """update metrics during each iter + """ + self.video_num += 1 + seq_dataset = data + seq_name = seq_dataset.seq_name + + logger.info('Prcessing Seq {} [{}/{}]:'.format(seq_name, self.video_num, + self.total_video_num)) + seq_dataloader = DataLoader(seq_dataset, + return_list=True, + batch_size=1, + shuffle=False, + num_workers=0) + seq_total_time = 0 + seq_total_frame = 0 + ref_embeddings = [] + ref_masks = [] + prev_embedding = [] + prev_mask = [] + with paddle.no_grad(): + for frame_idx, samples in enumerate(seq_dataloader): + time_start = time.time() + all_preds = [] + join_label = None + for aug_idx in range(len(samples)): + if len(ref_embeddings) <= aug_idx: + ref_embeddings.append([]) + ref_masks.append([]) + prev_embedding.append(None) + prev_mask.append(None) + + sample = samples[aug_idx] + ref_emb = ref_embeddings[aug_idx] + ref_m = ref_masks[aug_idx] + prev_emb = prev_embedding[aug_idx] + prev_m = prev_mask[aug_idx] + + current_img = sample['current_img'] + if 'current_label' in sample.keys(): + current_label = sample['current_label'] + current_label = paddle.to_tensor(current_label) + else: + current_label = None + + obj_num = sample['meta']['obj_num'] + imgname = sample['meta']['current_name'] + ori_height = sample['meta']['height'] + ori_width = sample['meta']['width'] + current_img = current_img + obj_num = obj_num + bs, _, h, w = current_img.shape + data_batch = [ + ref_emb, ref_m, prev_emb, prev_m, current_img, + [ori_height, ori_width], obj_num + ] + + all_pred, current_embedding = model(data_batch, mode='test') + + if frame_idx == 0: + if current_label is None: + logger.info( + "No first frame label in Seq {}.".format( + seq_name)) + ref_embeddings[aug_idx].append(current_embedding) + ref_masks[aug_idx].append(current_label) + + prev_embedding[aug_idx] = current_embedding + prev_mask[aug_idx] = current_label + else: + if sample['meta']['flip']: #False + all_pred = self.flip_tensor(all_pred, 3) + # In YouTube-VOS, not all the objects appear in the first frame for the first time. 
Thus, we + # have to introduce new labels for new objects, if necessary. + if not sample['meta']['flip'] and not ( + current_label is None) and join_label is None: + join_label = paddle.cast(current_label, + dtype='int64') + all_preds.append(all_pred) + if current_label is not None: + ref_embeddings[aug_idx].append(current_embedding) + prev_embedding[aug_idx] = current_embedding + + if frame_idx > 0: + all_preds = paddle.concat(all_preds, axis=0) + all_preds = paddle.mean( + all_preds, axis=0) #average results if augmentation + pred_label = paddle.argmax(all_preds, axis=0) + if join_label is not None: + join_label = paddle.squeeze(paddle.squeeze(join_label, + axis=0), + axis=0) + keep = paddle.cast((join_label == 0), dtype="int64") + pred_label = pred_label * keep + join_label * (1 - keep) + pred_label = pred_label + current_label = paddle.reshape( + pred_label, shape=[1, 1, ori_height, ori_width]) + flip_pred_label = self.flip_tensor(pred_label, 1) + flip_current_label = paddle.reshape( + flip_pred_label, shape=[1, 1, ori_height, ori_width]) + + for aug_idx in range(len(samples)): + if join_label is not None: + if samples[aug_idx]['meta']['flip']: + ref_masks[aug_idx].append(flip_current_label) + else: + ref_masks[aug_idx].append(current_label) + if samples[aug_idx]['meta']['flip']: + prev_mask[aug_idx] = flip_current_label + else: + prev_mask[ + aug_idx] = current_label #update prev_mask + + one_frametime = time.time() - time_start + seq_total_time += one_frametime + seq_total_frame += 1 + obj_num = float(obj_num) + logger.info('Frame: {}, Obj Num: {}, Time: {}'.format( + imgname[0], obj_num, one_frametime)) + self.save_mask( + pred_label, + os.path.join(self.result_root, seq_name, + imgname[0].split('.')[0] + '.png')) + else: + one_frametime = time.time() - time_start + seq_total_time += one_frametime + logger.info('Ref Frame: {}, Time: {}'.format( + imgname[0], one_frametime)) + + del (ref_embeddings) + del (ref_masks) + del (prev_embedding) + del (prev_mask) + del (seq_dataset) + del (seq_dataloader) + + seq_avg_time_per_frame = seq_total_time / seq_total_frame + self.total_time += seq_total_time + self.total_frame += seq_total_frame + total_avg_time_per_frame = self.total_time / self.total_frame + self.total_sfps += seq_avg_time_per_frame + avg_sfps = self.total_sfps / (batch_id + 1) + logger.info("Seq {} FPS: {}, Total FPS: {}, FPS per Seq: {}".format( + seq_name, 1. / seq_avg_time_per_frame, + 1. / total_avg_time_per_frame, 1. 
/ avg_sfps)) + + def flip_tensor(self, tensor, dim=0): + inv_idx = paddle.cast(paddle.arange(tensor.shape[dim] - 1, -1, -1), + dtype="int64") + tensor = paddle.index_select(x=tensor, index=inv_idx, axis=dim) + return tensor + + def save_mask(self, mask_tensor, path): + _palette = [ + 0, 0, 0, 128, 0, 0, 0, 128, 0, 128, 128, 0, 0, 0, 128, 128, 0, 128, + 0, 128, 128, 128, 128, 128, 64, 0, 0, 191, 0, 0, 64, 128, 0, 191, + 128, 0, 64, 0, 128, 191, 0, 128, 64, 128, 128, 191, 128, 128, 0, 64, + 0, 128, 64, 0, 0, 191, 0, 128, 191, 0, 0, 64, 128, 128, 64, 128, 22, + 22, 22, 23, 23, 23, 24, 24, 24, 25, 25, 25, 26, 26, 26, 27, 27, 27, + 28, 28, 28, 29, 29, 29, 30, 30, 30, 31, 31, 31, 32, 32, 32, 33, 33, + 33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 37, 37, 37, 38, 38, 38, 39, + 39, 39, 40, 40, 40, 41, 41, 41, 42, 42, 42, 43, 43, 43, 44, 44, 44, + 45, 45, 45, 46, 46, 46, 47, 47, 47, 48, 48, 48, 49, 49, 49, 50, 50, + 50, 51, 51, 51, 52, 52, 52, 53, 53, 53, 54, 54, 54, 55, 55, 55, 56, + 56, 56, 57, 57, 57, 58, 58, 58, 59, 59, 59, 60, 60, 60, 61, 61, 61, + 62, 62, 62, 63, 63, 63, 64, 64, 64, 65, 65, 65, 66, 66, 66, 67, 67, + 67, 68, 68, 68, 69, 69, 69, 70, 70, 70, 71, 71, 71, 72, 72, 72, 73, + 73, 73, 74, 74, 74, 75, 75, 75, 76, 76, 76, 77, 77, 77, 78, 78, 78, + 79, 79, 79, 80, 80, 80, 81, 81, 81, 82, 82, 82, 83, 83, 83, 84, 84, + 84, 85, 85, 85, 86, 86, 86, 87, 87, 87, 88, 88, 88, 89, 89, 89, 90, + 90, 90, 91, 91, 91, 92, 92, 92, 93, 93, 93, 94, 94, 94, 95, 95, 95, + 96, 96, 96, 97, 97, 97, 98, 98, 98, 99, 99, 99, 100, 100, 100, 101, + 101, 101, 102, 102, 102, 103, 103, 103, 104, 104, 104, 105, 105, + 105, 106, 106, 106, 107, 107, 107, 108, 108, 108, 109, 109, 109, + 110, 110, 110, 111, 111, 111, 112, 112, 112, 113, 113, 113, 114, + 114, 114, 115, 115, 115, 116, 116, 116, 117, 117, 117, 118, 118, + 118, 119, 119, 119, 120, 120, 120, 121, 121, 121, 122, 122, 122, + 123, 123, 123, 124, 124, 124, 125, 125, 125, 126, 126, 126, 127, + 127, 127, 128, 128, 128, 129, 129, 129, 130, 130, 130, 131, 131, + 131, 132, 132, 132, 133, 133, 133, 134, 134, 134, 135, 135, 135, + 136, 136, 136, 137, 137, 137, 138, 138, 138, 139, 139, 139, 140, + 140, 140, 141, 141, 141, 142, 142, 142, 143, 143, 143, 144, 144, + 144, 145, 145, 145, 146, 146, 146, 147, 147, 147, 148, 148, 148, + 149, 149, 149, 150, 150, 150, 151, 151, 151, 152, 152, 152, 153, + 153, 153, 154, 154, 154, 155, 155, 155, 156, 156, 156, 157, 157, + 157, 158, 158, 158, 159, 159, 159, 160, 160, 160, 161, 161, 161, + 162, 162, 162, 163, 163, 163, 164, 164, 164, 165, 165, 165, 166, + 166, 166, 167, 167, 167, 168, 168, 168, 169, 169, 169, 170, 170, + 170, 171, 171, 171, 172, 172, 172, 173, 173, 173, 174, 174, 174, + 175, 175, 175, 176, 176, 176, 177, 177, 177, 178, 178, 178, 179, + 179, 179, 180, 180, 180, 181, 181, 181, 182, 182, 182, 183, 183, + 183, 184, 184, 184, 185, 185, 185, 186, 186, 186, 187, 187, 187, + 188, 188, 188, 189, 189, 189, 190, 190, 190, 191, 191, 191, 192, + 192, 192, 193, 193, 193, 194, 194, 194, 195, 195, 195, 196, 196, + 196, 197, 197, 197, 198, 198, 198, 199, 199, 199, 200, 200, 200, + 201, 201, 201, 202, 202, 202, 203, 203, 203, 204, 204, 204, 205, + 205, 205, 206, 206, 206, 207, 207, 207, 208, 208, 208, 209, 209, + 209, 210, 210, 210, 211, 211, 211, 212, 212, 212, 213, 213, 213, + 214, 214, 214, 215, 215, 215, 216, 216, 216, 217, 217, 217, 218, + 218, 218, 219, 219, 219, 220, 220, 220, 221, 221, 221, 222, 222, + 222, 223, 223, 223, 224, 224, 224, 225, 225, 225, 226, 226, 226, + 227, 227, 227, 228, 228, 228, 229, 229, 229, 230, 230, 230, 231, + 
231, 231, 232, 232, 232, 233, 233, 233, 234, 234, 234, 235, 235, + 235, 236, 236, 236, 237, 237, 237, 238, 238, 238, 239, 239, 239, + 240, 240, 240, 241, 241, 241, 242, 242, 242, 243, 243, 243, 244, + 244, 244, 245, 245, 245, 246, 246, 246, 247, 247, 247, 248, 248, + 248, 249, 249, 249, 250, 250, 250, 251, 251, 251, 252, 252, 252, + 253, 253, 253, 254, 254, 254, 255, 255, 255 + ] + mask = mask_tensor.cpu().numpy().astype('uint8') + mask = Image.fromarray(mask).convert('P') + mask.putpalette(_palette) + mask.save(path) + + def zip_folder(self, source_folder, zip_dir): + f = zipfile.ZipFile(zip_dir, 'w', zipfile.ZIP_DEFLATED) + pre_len = len(os.path.dirname(source_folder)) + for dirpath, dirnames, filenames in os.walk(source_folder): + for filename in filenames: + pathfile = os.path.join(dirpath, filename) + arcname = pathfile[pre_len:].strip(os.path.sep) + f.write(pathfile, arcname) + f.close() + + def accumulate(self): + """accumulate metrics when finished all iters. + """ + self.zip_folder(self.result_root, self.zip_dir) + logger.info('Save result to {}.'.format(self.zip_dir)) diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/youtube8m/__init__.py b/Bank_second_part/detect_process/paddlevideo/metrics/youtube8m/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/youtube8m/__pycache__/__init__.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/youtube8m/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..81d734c Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/youtube8m/__pycache__/__init__.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/youtube8m/__pycache__/average_precision_calculator.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/youtube8m/__pycache__/average_precision_calculator.cpython-310.pyc new file mode 100644 index 0000000..a1817cc Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/youtube8m/__pycache__/average_precision_calculator.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/youtube8m/__pycache__/eval_util.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/youtube8m/__pycache__/eval_util.cpython-310.pyc new file mode 100644 index 0000000..c6404cf Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/youtube8m/__pycache__/eval_util.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/youtube8m/__pycache__/mean_average_precision_calculator.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/metrics/youtube8m/__pycache__/mean_average_precision_calculator.cpython-310.pyc new file mode 100644 index 0000000..1907e7a Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/metrics/youtube8m/__pycache__/mean_average_precision_calculator.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/youtube8m/average_precision_calculator.py b/Bank_second_part/detect_process/paddlevideo/metrics/youtube8m/average_precision_calculator.py new file mode 100644 index 0000000..bdbd6e0 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/metrics/youtube8m/average_precision_calculator.py @@ -0,0 +1,274 @@ +# Copyright 2020 Google Inc. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS-IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Calculate or keep track of the interpolated average precision. + +It provides an interface for calculating interpolated average precision for an +entire list or the top-n ranked items. For the definition of the +(non-)interpolated average precision: +http://trec.nist.gov/pubs/trec15/appendices/CE.MEASURES06.pdf + +Example usages: +1) Use it as a static function call to directly calculate average precision for +a short ranked list in the memory. + +``` +import random + +p = np.array([random.random() for _ in xrange(10)]) +a = np.array([random.choice([0, 1]) for _ in xrange(10)]) + +ap = average_precision_calculator.AveragePrecisionCalculator.ap(p, a) +``` + +2) Use it as an object for long ranked list that cannot be stored in memory or +the case where partial predictions can be observed at a time (Tensorflow +predictions). In this case, we first call the function accumulate many times +to process parts of the ranked list. After processing all the parts, we call +peek_interpolated_ap_at_n. +``` +p1 = np.array([random.random() for _ in xrange(5)]) +a1 = np.array([random.choice([0, 1]) for _ in xrange(5)]) +p2 = np.array([random.random() for _ in xrange(5)]) +a2 = np.array([random.choice([0, 1]) for _ in xrange(5)]) + +# interpolated average precision at 10 using 1000 break points +calculator = average_precision_calculator.AveragePrecisionCalculator(10) +calculator.accumulate(p1, a1) +calculator.accumulate(p2, a2) +ap3 = calculator.peek_ap_at_n() +``` +""" + +import heapq +import random +import numbers + +import numpy + + +class AveragePrecisionCalculator(object): + """Calculate the average precision and average precision at n.""" + def __init__(self, top_n=None): + """Construct an AveragePrecisionCalculator to calculate average precision. + + This class is used to calculate the average precision for a single label. + + Args: + top_n: A positive Integer specifying the average precision at n, or + None to use all provided data points. + + Raises: + ValueError: An error occurred when the top_n is not a positive integer. + """ + if not ((isinstance(top_n, int) and top_n >= 0) or top_n is None): + raise ValueError("top_n must be a positive integer or None.") + + self._top_n = top_n # average precision at n + self._total_positives = 0 # total number of positives have seen + self._heap = [] # max heap of (prediction, actual) + + @property + def heap_size(self): + """Gets the heap size maintained in the class.""" + return len(self._heap) + + @property + def num_accumulated_positives(self): + """Gets the number of positive samples that have been accumulated.""" + return self._total_positives + + def accumulate(self, predictions, actuals, num_positives=None): + """Accumulate the predictions and their ground truth labels. + + After the function call, we may call peek_ap_at_n to actually calculate + the average precision. + Note predictions and actuals must have the same shape. + + Args: + predictions: a list storing the prediction scores. 
+ actuals: a list storing the ground truth labels. Any value + larger than 0 will be treated as positives, otherwise as negatives. + num_positives = If the 'predictions' and 'actuals' inputs aren't complete, + then it's possible some true positives were missed in them. In that case, + you can provide 'num_positives' in order to accurately track recall. + + Raises: + ValueError: An error occurred when the format of the input is not the + numpy 1-D array or the shape of predictions and actuals does not match. + """ + if len(predictions) != len(actuals): + raise ValueError( + "the shape of predictions and actuals does not match.") + + if not num_positives is None: + if not isinstance(num_positives, + numbers.Number) or num_positives < 0: + raise ValueError( + "'num_positives' was provided but it wan't a nonzero number." + ) + + if not num_positives is None: + self._total_positives += num_positives + else: + self._total_positives += numpy.size(numpy.where(actuals > 0)) + topk = self._top_n + heap = self._heap + + for i in range(numpy.size(predictions)): + if topk is None or len(heap) < topk: + heapq.heappush(heap, (predictions[i], actuals[i])) + else: + if predictions[i] > heap[0][0]: # heap[0] is the smallest + heapq.heappop(heap) + heapq.heappush(heap, (predictions[i], actuals[i])) + + def clear(self): + """Clear the accumulated predictions.""" + self._heap = [] + self._total_positives = 0 + + def peek_ap_at_n(self): + """Peek the non-interpolated average precision at n. + + Returns: + The non-interpolated average precision at n (default 0). + If n is larger than the length of the ranked list, + the average precision will be returned. + """ + if self.heap_size <= 0: + return 0 + predlists = numpy.array(list(zip(*self._heap))) + + ap = self.ap_at_n(predlists[0], + predlists[1], + n=self._top_n, + total_num_positives=self._total_positives) + return ap + + @staticmethod + def ap(predictions, actuals): + """Calculate the non-interpolated average precision. + + Args: + predictions: a numpy 1-D array storing the sparse prediction scores. + actuals: a numpy 1-D array storing the ground truth labels. Any value + larger than 0 will be treated as positives, otherwise as negatives. + + Returns: + The non-interpolated average precision at n. + If n is larger than the length of the ranked list, + the average precision will be returned. + + Raises: + ValueError: An error occurred when the format of the input is not the + numpy 1-D array or the shape of predictions and actuals does not match. + """ + return AveragePrecisionCalculator.ap_at_n(predictions, actuals, n=None) + + @staticmethod + def ap_at_n(predictions, actuals, n=20, total_num_positives=None): + """Calculate the non-interpolated average precision. + + Args: + predictions: a numpy 1-D array storing the sparse prediction scores. + actuals: a numpy 1-D array storing the ground truth labels. Any value + larger than 0 will be treated as positives, otherwise as negatives. + n: the top n items to be considered in ap@n. + total_num_positives : (optionally) you can specify the number of total + positive + in the list. If specified, it will be used in calculation. + + Returns: + The non-interpolated average precision at n. + If n is larger than the length of the ranked list, + the average precision will be returned. + + Raises: + ValueError: An error occurred when + 1) the format of the input is not the numpy 1-D array; + 2) the shape of predictions and actuals does not match; + 3) the input n is not a positive integer. 
+ """ + if len(predictions) != len(actuals): + raise ValueError( + "the shape of predictions and actuals does not match.") + + if n is not None: + if not isinstance(n, int) or n <= 0: + raise ValueError("n must be 'None' or a positive integer." + " It was '%s'." % n) + + ap = 0.0 + + predictions = numpy.array(predictions) + actuals = numpy.array(actuals) + + # add a shuffler to avoid overestimating the ap + predictions, actuals = AveragePrecisionCalculator._shuffle( + predictions, actuals) + sortidx = sorted(range(len(predictions)), + key=lambda k: predictions[k], + reverse=True) + + if total_num_positives is None: + numpos = numpy.size(numpy.where(actuals > 0)) + else: + numpos = total_num_positives + + if numpos == 0: + return 0 + + if n is not None: + numpos = min(numpos, n) + delta_recall = 1.0 / numpos + poscount = 0.0 + + # calculate the ap + r = len(sortidx) + if n is not None: + r = min(r, n) + for i in range(r): + if actuals[sortidx[i]] > 0: + poscount += 1 + ap += poscount / (i + 1) * delta_recall + return ap + + @staticmethod + def _shuffle(predictions, actuals): + random.seed(0) + suffidx = random.sample(range(len(predictions)), len(predictions)) + predictions = predictions[suffidx] + actuals = actuals[suffidx] + return predictions, actuals + + @staticmethod + def _zero_one_normalize(predictions, epsilon=1e-7): + """Normalize the predictions to the range between 0.0 and 1.0. + + For some predictions like SVM predictions, we need to normalize them before + calculate the interpolated average precision. The normalization will not + change the rank in the original list and thus won't change the average + precision. + + Args: + predictions: a numpy 1-D array storing the sparse prediction scores. + epsilon: a small constant to avoid denominator being zero. + + Returns: + The normalized prediction. + """ + denominator = numpy.max(predictions) - numpy.min(predictions) + ret = (predictions - numpy.min(predictions)) / numpy.max( + denominator, epsilon) + return ret diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/youtube8m/eval_util.py b/Bank_second_part/detect_process/paddlevideo/metrics/youtube8m/eval_util.py new file mode 100644 index 0000000..724c72f --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/metrics/youtube8m/eval_util.py @@ -0,0 +1,205 @@ +# Copyright 2016 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS-IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Provides functions to help with evaluating models.""" +import numpy as np +import paddle +from paddlevideo.utils import get_logger + +from ..base import BaseMetric +from ..registry import METRIC +from . import average_precision_calculator as ap_calculator +from . import mean_average_precision_calculator as map_calculator + +logger = get_logger("paddlevideo") + + +def flatten(l): + """ Merges a list of lists into a single list. 
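# Minimal, hedged sketch of the streaming AveragePrecisionCalculator defined in
# the module above (toy scores/labels; the class is assumed to be in scope here):
# accumulate chunks of (score, label) pairs, then read AP@n off the internal heap.
import numpy as np

calc = AveragePrecisionCalculator(top_n=10)
calc.accumulate(np.array([0.9, 0.3, 0.7]), np.array([1, 0, 1]))
calc.accumulate(np.array([0.6, 0.2]), np.array([0, 1]))
print(calc.peek_ap_at_n())   # non-interpolated AP over the pooled top-10, ~0.87 here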
""" + return [item for sublist in l for item in sublist] + + +def calculate_hit_at_one(predictions, actuals): + """ + Hit@k: indicates the fraction of test samples that contain at least + one of the ground truth labels in the top k predictions, + i.e topk. + + Args: + predictions: Matrix containing the outputs of the model. + Dimensions are 'batch' x 'num_classes'. + actuals: Matrix containing the ground truth labels. + Dimensions are 'batch' x 'num_classes'. + + Returns: + float: The average hit at one across the entire batch. + """ + top_prediction = np.argmax(predictions, 1) + hits = actuals[np.arange(actuals.shape[0]), top_prediction] + return np.mean(hits) + + +def calculate_precision_at_equal_recall_rate(predictions, actuals): + """ + PERR: measures the video-level annotation precision when we retrieve the same number + of entities per video as there are in the ground-truth. + More details please refer to: https://arxiv.org/abs/1609.08675 + + Args: + predictions: Matrix containing the outputs of the model. + Dimensions are 'batch' x 'num_classes'. + actuals: Matrix containing the ground truth labels. + Dimensions are 'batch' x 'num_classes'. + + Returns: + float: The average precision at equal recall rate across the entire batch. + """ + aggregated_precision = 0.0 + num_videos = actuals.shape[0] + for row in np.arange(num_videos): + num_labels = int(np.sum(actuals[row])) + top_indices = np.argpartition(predictions[row], + -num_labels)[-num_labels:] + item_precision = 0.0 + for label_index in top_indices: + if predictions[row][label_index] > 0: + item_precision += actuals[row][label_index] + item_precision /= top_indices.size + aggregated_precision += item_precision + aggregated_precision /= num_videos + return aggregated_precision + + +def calculate_gap(predictions, actuals, top_k=20): + """ + GAP: the global average precision. + + Only the top_k predictions are taken for each of the videos. + + Args: + predictions: Matrix containing the outputs of the model. + Dimensions are 'batch' x 'num_classes'. + actuals: Matrix containing the ground truth labels. + Dimensions are 'batch' x 'num_classes'. + top_k: How many predictions to use per video. + + Returns: + float: The global average precision. + """ + gap_calculator = ap_calculator.AveragePrecisionCalculator() + sparse_predictions, sparse_labels, num_positives = top_k_by_class( + predictions, actuals, top_k) + gap_calculator.accumulate(flatten(sparse_predictions), + flatten(sparse_labels), sum(num_positives)) + return gap_calculator.peek_ap_at_n() + + +def top_k_by_class(predictions, labels, k=20): + """Extracts the top k predictions for each video, sorted by class. + + Args: + predictions: A numpy matrix containing the outputs of the model. + Dimensions are 'batch' x 'num_classes'. + k: the top k non-zero entries to preserve in each prediction. + + Returns: + A tuple (predictions,labels, true_positives). 'predictions' and 'labels' + are lists of lists of floats. 'true_positives' is a list of scalars. The + length of the lists are equal to the number of classes. The entries in the + predictions variable are probability predictions, and + the corresponding entries in the labels variable are the ground truth for + those predictions. The entries in 'true_positives' are the number of true + positives for each class in the ground truth. + + Raises: + ValueError: An error occurred when the k is not a positive integer. 
+ """ + if k <= 0: + raise ValueError("k must be a positive integer.") + k = min(k, predictions.shape[1]) + num_classes = predictions.shape[1] + prediction_triplets = [] + for video_index in range(predictions.shape[0]): + prediction_triplets.extend( + top_k_triplets(predictions[video_index], labels[video_index], k)) + out_predictions = [[] for v in range(num_classes)] + out_labels = [[] for v in range(num_classes)] + for triplet in prediction_triplets: + out_predictions[triplet[0]].append(triplet[1]) + out_labels[triplet[0]].append(triplet[2]) + out_true_positives = [np.sum(labels[:, i]) for i in range(num_classes)] + + return out_predictions, out_labels, out_true_positives + + +def top_k_triplets(predictions, labels, k=20): + """Get the top_k for a 1-d numpy array. Returns a sparse list of tuples in + (prediction, class) format""" + m = len(predictions) + k = min(k, m) + indices = np.argpartition(predictions, -k)[-k:] + return [(index, predictions[index], labels[index]) for index in indices] + + +@METRIC.register +class HitOneMetric(BaseMetric): + """A class to store the evaluation metrics.""" + def __init__(self, + num_class, + top_k, + data_size, + batch_size, + log_interval=20): + """Construct an HitOneMetric object to store the evaluation metrics.""" + self.hit_at_one = [] + self.perr = [] + self.gap = [] + super().__init__(data_size, batch_size, log_interval) + + def accumulate(self): + logger.info( + '[TEST] finished, hit_at_one = {:.5f}, perr = {:.5f}, gap = {:.5f}'. + format(np.mean(np.array(self.hit_at_one)), + np.mean(np.array(self.perr)), np.mean(np.array(self.gap)))) + + def clear(self): + """Clear the evaluation metrics and reset the HitOneMetric object.""" + self.hit_at_one = [] + self.perr = [] + self.gap = [] + + def update(self, batch_id, data, outputs): + """update metrics during each iter + """ + hit_at_one = paddle.to_tensor(outputs['hit_at_one']) + perr = paddle.to_tensor(outputs['perr']) + gap = paddle.to_tensor(outputs['gap']) + # NOTE(shipping): deal with multi cards validate + if self.world_size > 1: + hit_at_one = paddle.distributed.all_reduce( + hit_at_one, + op=paddle.distributed.ReduceOp.SUM) / self.world_size + perr = paddle.distributed.all_reduce( + perr, op=paddle.distributed.ReduceOp.SUM) / self.world_size + gap = paddle.distributed.all_reduce( + gap, op=paddle.distributed.ReduceOp.SUM) / self.world_size + + self.hit_at_one.append(hit_at_one.numpy()) + self.perr.append(perr.numpy()) + self.gap.append(gap.numpy()) + # preds ensemble + if batch_id % self.log_interval == 0: + logger.info("[TEST] Processing batch {}/{}...".format( + batch_id, + self.data_size // (self.batch_size * self.world_size), + )) diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/youtube8m/mean_average_precision_calculator.py b/Bank_second_part/detect_process/paddlevideo/metrics/youtube8m/mean_average_precision_calculator.py new file mode 100644 index 0000000..0ae8b0e --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/metrics/youtube8m/mean_average_precision_calculator.py @@ -0,0 +1,114 @@ +# Copyright 2016 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
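# Hedged end-to-end sketch of the video-level metrics above (toy batch of two
# videos, three classes; assumes calculate_hit_at_one,
# calculate_precision_at_equal_recall_rate and calculate_gap from eval_util are
# in scope). predictions and actuals are dense batch x num_classes arrays.
import numpy as np

preds = np.array([[0.9, 0.2, 0.4],
                  [0.1, 0.8, 0.3]])
labels = np.array([[1, 0, 0],
                   [0, 1, 1]])
print(calculate_hit_at_one(preds, labels))                      # 1.0
print(calculate_precision_at_equal_recall_rate(preds, labels))  # 1.0
print(calculate_gap(preds, labels, top_k=2))                    # ~0.92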
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS-IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Calculate the mean average precision. + +It provides an interface for calculating mean average precision +for an entire list or the top-n ranked items. + +Example usages: +We first call the function accumulate many times to process parts of the ranked +list. After processing all the parts, we call peek_map_at_n +to calculate the mean average precision. + +``` +import random + +p = np.array([[random.random() for _ in xrange(50)] for _ in xrange(1000)]) +a = np.array([[random.choice([0, 1]) for _ in xrange(50)] + for _ in xrange(1000)]) + +# mean average precision for 50 classes. +calculator = mean_average_precision_calculator.MeanAveragePrecisionCalculator( + num_class=50) +calculator.accumulate(p, a) +aps = calculator.peek_map_at_n() +``` +""" + +import numpy +from . import average_precision_calculator + + +class MeanAveragePrecisionCalculator(object): + """This class is to calculate mean average precision. + """ + + def __init__(self, num_class): + """Construct a calculator to calculate the (macro) average precision. + + Args: + num_class: A positive Integer specifying the number of classes. + top_n_array: A list of positive integers specifying the top n for each + class. The top n in each class will be used to calculate its average + precision at n. + The size of the array must be num_class. + + Raises: + ValueError: An error occurred when num_class is not a positive integer; + or the top_n_array is not a list of positive integers. + """ + if not isinstance(num_class, int) or num_class <= 1: + raise ValueError("num_class must be a positive integer.") + + self._ap_calculators = [] # member of AveragePrecisionCalculator + self._num_class = num_class # total number of classes + for i in range(num_class): + self._ap_calculators.append( + average_precision_calculator.AveragePrecisionCalculator()) + + def accumulate(self, predictions, actuals, num_positives=None): + """Accumulate the predictions and their ground truth labels. + + Args: + predictions: A list of lists storing the prediction scores. The outer + dimension corresponds to classes. + actuals: A list of lists storing the ground truth labels. The dimensions + should correspond to the predictions input. Any value + larger than 0 will be treated as positives, otherwise as negatives. + num_positives: If provided, it is a list of numbers representing the + number of true positives for each class. If not provided, the number of + true positives will be inferred from the 'actuals' array. + + Raises: + ValueError: An error occurred when the shape of predictions and actuals + does not match. + """ + if not num_positives: + num_positives = [None for i in predictions.shape[1]] + + calculators = self._ap_calculators + for i in range(len(predictions)): + calculators[i].accumulate(predictions[i], actuals[i], + num_positives[i]) + + def clear(self): + for calculator in self._ap_calculators: + calculator.clear() + + def is_empty(self): + return ([calculator.heap_size for calculator in self._ap_calculators] == + [0 for _ in range(self._num_class)]) + + def peek_map_at_n(self): + """Peek the non-interpolated mean average precision at n. 
+ + Returns: + An array of non-interpolated average precision at n (default 0) for each + class. + """ + aps = [ + self._ap_calculators[i].peek_ap_at_n() + for i in range(self._num_class) + ] + return aps diff --git a/Bank_second_part/detect_process/paddlevideo/metrics/yowo_metric.py b/Bank_second_part/detect_process/paddlevideo/metrics/yowo_metric.py new file mode 100644 index 0000000..032df0c --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/metrics/yowo_metric.py @@ -0,0 +1,82 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os.path as osp +import os +from paddlevideo.utils import get_logger +from .registry import METRIC +from .base import BaseMetric +from .ucf24_utils import get_mAP + +logger = get_logger("paddlevideo") + + +@METRIC.register +class YOWOMetric(BaseMetric): + """ + Metrics for YOWO. Two Stages in this metric: + (1) Get test results using trained model, results will be saved in YOWOMetric.result_path; + (2) Calculate metrics using results file from stage (1). + """ + + def __init__(self, + data_size, + batch_size, + gt_folder, + result_path, + threshold=0.5, + save_path=None, + log_interval=1): + """ + Init for BMN metrics. + Params: + gtfolder:groundtruth folder path for ucf24 + """ + super().__init__(data_size, batch_size, log_interval) + self.result_path = result_path + self.gt_folder = gt_folder + self.threshold = threshold + self.save_path = save_path + + if not osp.isdir(self.result_path): + os.makedirs(self.result_path) + + def update(self, batch_id, data, outputs): + frame_idx = outputs['frame_idx'] + boxes = outputs["boxes"] + for j in range(len(frame_idx)): + detection_path = osp.join(self.result_path, frame_idx[j]) + with open(detection_path, 'w+') as f_detect: + for box in boxes[j]: + x1 = round(float(box[0] - box[2] / 2.0) * 320.0) + y1 = round(float(box[1] - box[3] / 2.0) * 240.0) + x2 = round(float(box[0] + box[2] / 2.0) * 320.0) + y2 = round(float(box[1] + box[3] / 2.0) * 240.0) + + det_conf = float(box[4]) + for j in range((len(box) - 5) // 2): + cls_conf = float(box[5 + 2 * j].item()) + prob = det_conf * cls_conf + f_detect.write( + str(int(box[6]) + 1) + ' ' + str(prob) + ' ' + str(x1) + ' ' + str(y1) + ' ' + str( + x2) + ' ' + str(y2) + '\n') + if batch_id % self.log_interval == 0: + logger.info("[TEST] Processing batch {}/{} ...".format( + batch_id, + self.data_size // (self.batch_size * self.world_size))) + + def accumulate(self): + metric_list = get_mAP(self.gt_folder, self.result_path, self.threshold, self.save_path) + for info in metric_list: + logger.info(info) diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/__init__.py b/Bank_second_part/detect_process/paddlevideo/modeling/__init__.py new file mode 100644 index 0000000..639bd34 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/__init__.py @@ -0,0 +1,37 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
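# Hedged usage sketch for the MeanAveragePrecisionCalculator above (toy scores,
# two classes, four ranked items per class). num_positives is passed explicitly
# here, one count per class taken from the label rows.
import numpy as np

p = np.array([[0.9, 0.1, 0.8, 0.4],    # class-0 scores
              [0.2, 0.7, 0.3, 0.6]])   # class-1 scores
a = np.array([[1, 0, 1, 0],
              [0, 0, 0, 1]])
calculator = MeanAveragePrecisionCalculator(num_class=2)
calculator.accumulate(p, a, num_positives=[int(a[i].sum()) for i in range(2)])
aps = calculator.peek_map_at_n()       # per-class APs, [1.0, 0.5] for these numbers
print(sum(aps) / len(aps))             # macro mAP -> 0.75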
+# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .assigners import MaxIoUAssignerAVA +from .backbones import ResNet +from .builder import (build_backbone, build_head, build_localizer, build_loss, + build_recognizer) +from .framework.detectors import BaseDetector, FastRCNN, TwoStageDetector +from .framework.recognizers import BaseRecognizer, Recognizer2D +from .heads import (AVARoIHead, BaseHead, BBoxHeadAVA, SingleRoIExtractor3D, + TSNHead) +from .losses import CrossEntropyLoss +from .registry import (BACKBONES, DETECTORS, HEADS, LOCALIZERS, LOSSES, + PARTITIONERS, RECOGNIZERS, ROI_EXTRACTORS) +from .samplers import RandomSampler +from .weight_init import kaiming_normal_, trunc_normal_, weight_init_ + +__all__ = [ + 'BACKBONES', 'HEADS', 'RECOGNIZERS', 'LOCALIZERS', 'PARTITIONERS', 'LOSSES', + 'build_recognizer', 'build_localizer', 'build_head', 'build_backbone', + 'build_loss', 'ResNet', 'TSNHead', 'BaseHead', 'BaseRecognizer', + 'Recognizer2d', 'CrossEntropyLoss', 'ROI_EXTRACTORS', + 'SingleRoIExtractor3D', 'AVARoIHead', 'BBoxHeadAVA', 'MaxIoUAssignerAVA', + 'RandomSampler', 'DETECTORS', 'kaiming_normal_', 'trunc_normal_', + 'weight_init_' +] diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/__pycache__/__init__.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..172853e Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/__pycache__/__init__.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/__pycache__/bbox_utils.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/__pycache__/bbox_utils.cpython-310.pyc new file mode 100644 index 0000000..8a6a2b9 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/__pycache__/bbox_utils.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/__pycache__/builder.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/__pycache__/builder.cpython-310.pyc new file mode 100644 index 0000000..53b88d9 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/__pycache__/builder.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/__pycache__/registry.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/__pycache__/registry.cpython-310.pyc new file mode 100644 index 0000000..ffcc723 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/__pycache__/registry.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/__pycache__/weight_init.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/__pycache__/weight_init.cpython-310.pyc new file mode 100644 index 0000000..8b4c7aa Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/__pycache__/weight_init.cpython-310.pyc differ diff --git 
a/Bank_second_part/detect_process/paddlevideo/modeling/assigners/__init__.py b/Bank_second_part/detect_process/paddlevideo/modeling/assigners/__init__.py new file mode 100644 index 0000000..a4570db --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/assigners/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .max_iou_assigner_ava import MaxIoUAssignerAVA + +__all__ = ['MaxIoUAssignerAVA'] diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/assigners/__pycache__/__init__.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/assigners/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..7eb2115 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/assigners/__pycache__/__init__.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/assigners/__pycache__/max_iou_assigner_ava.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/assigners/__pycache__/max_iou_assigner_ava.cpython-310.pyc new file mode 100644 index 0000000..485995b Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/assigners/__pycache__/max_iou_assigner_ava.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/assigners/max_iou_assigner_ava.py b/Bank_second_part/detect_process/paddlevideo/modeling/assigners/max_iou_assigner_ava.py new file mode 100644 index 0000000..5cc72bf --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/assigners/max_iou_assigner_ava.py @@ -0,0 +1,148 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import paddle +import numpy as np +from ..registry import BBOX_ASSIGNERS +from ..bbox_utils import bbox_overlaps + +class AssignResult(): + def __init__(self, num_gts, gt_inds, max_overlaps, labels=None): + self.num_gts = num_gts + self.gt_inds = gt_inds + self.max_overlaps = max_overlaps + self.labels = labels + + def add_gt_(self, gt_labels): + """Add ground truth as assigned results. 
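+        Each gt is prepended as its own assigned result: gt i gets index i + 1
+        with overlap 1.0, and its label is prepended to ``self.labels`` when
+        labels are present.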
""" + self_inds = paddle.arange(1, len(gt_labels) + 1, dtype="int32") + gt_inds_squeeze = paddle.squeeze(self.gt_inds, axis=0) + self.gt_inds = paddle.concat([self_inds, gt_inds_squeeze]) + gt_label_ones = paddle.full((len(gt_labels), ), 1, dtype='float32') + max_overlaps_squeeze = paddle.squeeze(self.max_overlaps, axis=0) + self.max_overlaps = paddle.concat([gt_label_ones, max_overlaps_squeeze]) + if self.labels is not None: + self.labels = paddle.concat([gt_labels, self.labels]) + +@BBOX_ASSIGNERS.register() +class MaxIoUAssignerAVA(): + """Assign a corresponding gt bbox or background to each bbox. """ + def __init__(self, + pos_iou_thr, + neg_iou_thr, + min_pos_iou=.0, + gt_max_assign_all=True, + ignore_iof_thr=-1, + ignore_wrt_candidates=True, + match_low_quality=True, + gpu_assign_thr=-1, + iou_calculator=dict(type='BboxOverlaps2D')): + self.pos_iou_thr = pos_iou_thr + self.neg_iou_thr = neg_iou_thr + self.min_pos_iou = min_pos_iou + self.gt_max_assign_all = gt_max_assign_all + self.ignore_iof_thr = ignore_iof_thr + self.ignore_wrt_candidates = ignore_wrt_candidates + self.gpu_assign_thr = gpu_assign_thr + self.match_low_quality = match_low_quality + + def assign(self, + bboxes, + gt_bboxes, + gt_labels=None): + """Assign gt to bboxes. """ + overlaps = bbox_overlaps(gt_bboxes, bboxes) + assign_result = self.assign_wrt_overlaps(overlaps, gt_labels) + return assign_result + + def assign_wrt_overlaps(self, overlaps, gt_labels=None): + """Assign w.r.t. the overlaps of bboxes with gts. """ + num_gts, num_bboxes = overlaps.shape[0], overlaps.shape[1] + # 1. assign -1 + assigned_gt_inds = paddle.full((num_bboxes, ), -1, dtype='int32') + + # for each anchor, which gt best overlaps with it + # for each anchor, the max iou of all gts + max_overlaps, argmax_overlaps = paddle.topk(overlaps, k=1, axis=0) + # for each gt, which anchor best overlaps with it + # for each gt, the max iou of all proposals + gt_max_overlaps, gt_argmax_overlaps = paddle.topk(overlaps, k=1, axis=1) + + # 2. assign negative: below the negative inds are set to be 0 + match_labels = paddle.full(argmax_overlaps.shape, -1, dtype='int32') + match_labels = paddle.where(max_overlaps < self.neg_iou_thr, + paddle.zeros_like(match_labels), match_labels) + + # 3. assign positive: above positive IoU threshold + argmax_overlaps_int32 = paddle.cast(argmax_overlaps, 'int32') + match_labels = paddle.where(max_overlaps >= self.pos_iou_thr, + argmax_overlaps_int32 + 1, match_labels) + assigned_gt_inds = match_labels + if self.match_low_quality: + # Low-quality matching will overwirte the assigned_gt_inds + # assigned in Step 3. Thus, the assigned gt might not be the + # best one for prediction. + # For example, if bbox A has 0.9 and 0.8 iou with GT bbox + # 1 & 2, bbox 1 will be assigned as the best target for bbox A + # in step 3. However, if GT bbox 2's gt_argmax_overlaps = A, + # bbox A's assigned_gt_inds will be overwritten to be bbox B. + # This might be the reason that it is not used in ROI Heads. 
+ for i in range(num_gts): + if gt_max_overlaps.numpy()[i] >= self.min_pos_iou: + if self.gt_max_assign_all: + equal_x_np = overlaps[i, :].numpy() + equal_y_np = gt_max_overlaps[i].numpy() + max_iou_inds = np.equal(equal_x_np, equal_y_np) + max_iou_inds = paddle.to_tensor(max_iou_inds) + max_iou_inds = paddle.reshape( max_iou_inds, [1,max_iou_inds.shape[0]] ) + match_labels_gts = paddle.full(max_iou_inds.shape, i+1, dtype='int32') + match_labels = paddle.where(max_iou_inds, match_labels_gts, match_labels) + assigned_gt_inds = match_labels + else: + assigned_gt_inds[gt_argmax_overlaps[i]] = i + 1 + + if gt_labels is not None: + # consider multi-class case (AVA) + assert len(gt_labels[0]) > 1 + assigned_labels = paddle.full([num_bboxes, len(gt_labels[0])], 0, dtype='float32') + assigned_gt_inds_reshape = assigned_gt_inds.reshape([assigned_gt_inds.shape[1]]) + pos_inds = paddle.nonzero( assigned_gt_inds_reshape , as_tuple=False) + pos_inds_num = float(paddle.numel(pos_inds)) + if pos_inds_num > 0: + pos_inds = paddle.squeeze(pos_inds, axis = 1 ) + assigned_gt_inds_squeeze = paddle.squeeze(assigned_gt_inds, axis=0) + assigned_gt_inds_select = paddle.index_select(assigned_gt_inds_squeeze, pos_inds) - 1 + gt_labels_select = paddle.index_select(gt_labels, assigned_gt_inds_select) + A = assigned_gt_inds_squeeze + X = assigned_gt_inds_squeeze - 1 + Y = paddle.zeros_like(X) + if A.shape[0]==1: + if float(A) > 0: + T=X + else: + T=Y + else: + T = paddle.where(A>0, X, Y) + S = paddle.index_select(gt_labels, T) + AE = paddle.expand(A, [S.shape[1], A.shape[0]]) + AET = paddle.transpose(AE, perm=[1, 0]) + R = paddle.where(AET>0, S, assigned_labels) + assigned_labels = R + else: + assigned_labels = None + ret = AssignResult( + num_gts, + assigned_gt_inds, + max_overlaps, + labels=assigned_labels) + return ret diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__init__.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__init__.py new file mode 100644 index 0000000..a88cedc --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__init__.py @@ -0,0 +1,60 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+from .actbert import BertForMultiModalPreTraining
+from .adds import ADDS_DepthNet
+from .agcn import AGCN
+from .agcn2s import AGCN2s
+from .asrf import ASRF
+from .bmn import BMN
+from .cfbi import CFBI
+from .ctrgcn import CTRGCN
+from .movinet import MoViNet
+from .ms_tcn import MSTCN
+from .pptsm_mv2 import PPTSM_MobileNetV2
+from .pptsm_mv3 import PPTSM_MobileNetV3
+from .pptsm_v2 import PPTSM_v2
+from .resnet import ResNet
+from .resnet3d_slowonly import ResNet3dSlowOnly
+from .resnet_slowfast import ResNetSlowFast
+from .resnet_slowfast_MRI import ResNetSlowFast_MRI
+from .resnet_tsm import ResNetTSM
+from .resnet_tsm_MRI import ResNetTSM_MRI
+from .resnet_tsn_MRI import ResNetTSN_MRI
+from .resnet_tweaks_tsm import ResNetTweaksTSM
+from .resnet_tweaks_tsn import ResNetTweaksTSN
+from .stgcn import STGCN
+from .swin_transformer import SwinTransformer3D
+from .toshift_vit import TokenShiftVisionTransformer
+from .transnetv2 import TransNetV2
+from .vit import VisionTransformer
+from .vit_tweaks import VisionTransformer_tweaks
+from .yowo import YOWO
+
+__all__ = [
+    'ResNet', 'ResNetTSM', 'ResNetTweaksTSM', 'ResNetSlowFast', 'BMN',
+    'ResNetTweaksTSN', 'VisionTransformer', 'STGCN', 'AGCN', 'TransNetV2',
+    'ADDS_DepthNet', 'VisionTransformer_tweaks', 'BertForMultiModalPreTraining',
+    'ResNetTSN_MRI', 'ResNetTSM_MRI', 'ResNetSlowFast_MRI', 'CFBI', 'MSTCN',
+    'ASRF', 'MoViNet', 'SwinTransformer3D', 'CTRGCN',
+    'TokenShiftVisionTransformer', 'AGCN2s', 'PPTSM_MobileNetV2',
+    'PPTSM_MobileNetV3', 'PPTSM_v2', 'ResNet3dSlowOnly', 'YOWO'
+]
diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/__init__.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000..754b221
Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/__init__.cpython-310.pyc differ
diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/actbert.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/actbert.cpython-310.pyc
new file mode 100644
index 0000000..7d92dc9
Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/actbert.cpython-310.pyc differ
diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/adds.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/adds.cpython-310.pyc
new file mode 100644
index 0000000..24f8479
Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/adds.cpython-310.pyc differ
diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/agcn.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/agcn.cpython-310.pyc
new file mode 100644
index 0000000..dbf6f27
Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/agcn.cpython-310.pyc differ
diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/agcn2s.cpython-310.pyc
b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/agcn2s.cpython-310.pyc new file mode 100644 index 0000000..d2ee340 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/agcn2s.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/asrf.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/asrf.cpython-310.pyc new file mode 100644 index 0000000..3732e07 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/asrf.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/bmn.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/bmn.cpython-310.pyc new file mode 100644 index 0000000..faa0914 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/bmn.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/cfbi.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/cfbi.cpython-310.pyc new file mode 100644 index 0000000..63efba1 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/cfbi.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/ctrgcn.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/ctrgcn.cpython-310.pyc new file mode 100644 index 0000000..86eb240 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/ctrgcn.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/darknet.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/darknet.cpython-310.pyc new file mode 100644 index 0000000..ea3eb95 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/darknet.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/deeplab.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/deeplab.cpython-310.pyc new file mode 100644 index 0000000..6fe7a62 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/deeplab.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/movinet.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/movinet.cpython-310.pyc new file mode 100644 index 0000000..7a42080 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/movinet.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/ms_tcn.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/ms_tcn.cpython-310.pyc new file mode 100644 index 0000000..88c6561 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/ms_tcn.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/pptsm_mv2.cpython-310.pyc 
b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/pptsm_mv2.cpython-310.pyc new file mode 100644 index 0000000..ed5a0fd Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/pptsm_mv2.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/pptsm_mv3.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/pptsm_mv3.cpython-310.pyc new file mode 100644 index 0000000..c701ace Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/pptsm_mv3.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/pptsm_v2.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/pptsm_v2.cpython-310.pyc new file mode 100644 index 0000000..19c44f4 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/pptsm_v2.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnet.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnet.cpython-310.pyc new file mode 100644 index 0000000..47ddaac Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnet.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnet3d.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnet3d.cpython-310.pyc new file mode 100644 index 0000000..1229e2d Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnet3d.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnet3d_slowonly.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnet3d_slowonly.cpython-310.pyc new file mode 100644 index 0000000..fb791ab Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnet3d_slowonly.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnet_slowfast.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnet_slowfast.cpython-310.pyc new file mode 100644 index 0000000..bd78377 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnet_slowfast.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnet_slowfast_MRI.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnet_slowfast_MRI.cpython-310.pyc new file mode 100644 index 0000000..7a72c27 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnet_slowfast_MRI.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnet_tsm.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnet_tsm.cpython-310.pyc new file mode 100644 index 0000000..7708769 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnet_tsm.cpython-310.pyc differ diff --git 
a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnet_tsm_MRI.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnet_tsm_MRI.cpython-310.pyc new file mode 100644 index 0000000..0c86eae Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnet_tsm_MRI.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnet_tsn_MRI.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnet_tsn_MRI.cpython-310.pyc new file mode 100644 index 0000000..4825ba0 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnet_tsn_MRI.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnet_tweaks_tsm.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnet_tweaks_tsm.cpython-310.pyc new file mode 100644 index 0000000..9181cc7 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnet_tweaks_tsm.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnet_tweaks_tsn.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnet_tweaks_tsn.cpython-310.pyc new file mode 100644 index 0000000..86c321f Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnet_tweaks_tsn.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnext101.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnext101.cpython-310.pyc new file mode 100644 index 0000000..8d8419a Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/resnext101.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/stgcn.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/stgcn.cpython-310.pyc new file mode 100644 index 0000000..ec6d4cb Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/stgcn.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/swin_transformer.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/swin_transformer.cpython-310.pyc new file mode 100644 index 0000000..c45b075 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/swin_transformer.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/toshift_vit.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/toshift_vit.cpython-310.pyc new file mode 100644 index 0000000..fbe323a Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/toshift_vit.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/transnetv2.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/transnetv2.cpython-310.pyc new file mode 100644 index 0000000..eba1461 Binary files /dev/null 
and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/transnetv2.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/vit.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/vit.cpython-310.pyc new file mode 100644 index 0000000..2421c7e Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/vit.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/vit_tweaks.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/vit_tweaks.cpython-310.pyc new file mode 100644 index 0000000..2bd75fc Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/vit_tweaks.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/yowo.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/yowo.cpython-310.pyc new file mode 100644 index 0000000..6210026 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/__pycache__/yowo.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/actbert.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/actbert.py new file mode 100644 index 0000000..dbee1fd --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/actbert.py @@ -0,0 +1,1158 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import numpy as np +import math +import copy + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import (Conv2D, BatchNorm2D, Linear, Dropout) +from paddle.nn.initializer import Constant, Normal +from ...utils.save_load import load_ckpt +from ..registry import BACKBONES +from ..weight_init import weight_init_ + +ACT2FN = {"gelu": F.gelu, "relu": F.relu, "swish": F.swish} + + +class BertEmbeddings(nn.Layer): + """Construct the embeddings from word, position and token_type embeddings. 
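+    The three embeddings are summed element-wise, then LayerNorm and dropout
+    are applied, following the standard BERT text-side embedding scheme.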
+ """ + def __init__(self, vocab_size, max_position_embeddings, type_vocab_size, + hidden_size, hidden_dropout_prob): + super(BertEmbeddings, self).__init__() + self.word_embeddings = nn.Embedding(vocab_size, + hidden_size, + padding_idx=0) + self.position_embeddings = nn.Embedding(max_position_embeddings, + hidden_size) + self.token_type_embeddings = nn.Embedding(type_vocab_size, hidden_size) + + self.LayerNorm = nn.LayerNorm(hidden_size, epsilon=1e-12) + self.dropout = nn.Dropout(hidden_dropout_prob) + + def forward(self, input_ids, token_type_ids=None): + seq_length = input_ids.shape[1] + position_ids = paddle.arange(end=seq_length, dtype="int64") + position_ids = position_ids.unsqueeze(0).expand_as(input_ids) + if token_type_ids is None: + token_type_ids = paddle.zeros_like(input_ids) + + words_embeddings = self.word_embeddings(input_ids) #8,36 -> 8,36,768 + position_embeddings = self.position_embeddings( + position_ids) #8,36 -> 8,36,768 + token_type_embeddings = self.token_type_embeddings( + token_type_ids) #8,36 -> 8,36,768 + + embeddings = words_embeddings + position_embeddings + token_type_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class BertImageEmbeddings(nn.Layer): + def __init__(self, v_feature_size, v_hidden_size, v_hidden_dropout_prob): + super(BertImageEmbeddings, self).__init__() + self.image_embeddings = nn.Linear(v_feature_size, v_hidden_size) + self.image_location_embeddings = nn.Linear(5, v_hidden_size) + self.LayerNorm = nn.LayerNorm(v_hidden_size, epsilon=1e-12) + self.dropout = nn.Dropout(v_hidden_dropout_prob) + + def forward(self, input_ids, input_loc): + img_embeddings = self.image_embeddings( + input_ids) #8,37,2048 -> 8,37,1024 + loc_embeddings = self.image_location_embeddings( + input_loc) #8,37,5 -> 8,37,1024 + embeddings = self.LayerNorm(img_embeddings + loc_embeddings) + embeddings = self.dropout(embeddings) + return embeddings # shape: bs*seq_len*hs + + +class BertActionEmbeddings(nn.Layer): + def __init__(self, a_feature_size, a_hidden_size, a_hidden_dropout_prob): + super(BertActionEmbeddings, self).__init__() + self.action_embeddings = nn.Linear(a_feature_size, a_hidden_size) + self.LayerNorm = nn.LayerNorm(a_hidden_size, epsilon=1e-12) + self.dropout = nn.Dropout(a_hidden_dropout_prob) + + def forward(self, input_ids): + action_embeddings = self.action_embeddings( + input_ids) #8,5,2048 -> 8,5,768 + embeddings = self.LayerNorm(action_embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class BertSelfAttention(nn.Layer): + def __init__(self, hidden_size, num_attention_heads, + attention_probs_dropout_prob): + super(BertSelfAttention, self).__init__() + if hidden_size % num_attention_heads != 0: + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (hidden_size, num_attention_heads)) + self.num_attention_heads = num_attention_heads + self.attention_head_size = int(hidden_size / num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(hidden_size, self.all_head_size) + self.key = nn.Linear(hidden_size, self.all_head_size) + self.value = nn.Linear(hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(attention_probs_dropout_prob) + + def transpose_for_scores(self, x): + new_x_shape = x.shape[:-1] + [ + self.num_attention_heads, + self.attention_head_size, + ] + x = x.reshape(new_x_shape) + return x.transpose((0, 2, 1, 3)) + + def 
forward(self, hidden_states, attention_mask): + mixed_query_layer = self.query(hidden_states) + mixed_key_layer = self.key(hidden_states) + mixed_value_layer = self.value(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) + key_layer = self.transpose_for_scores(mixed_key_layer) + value_layer = self.transpose_for_scores(mixed_value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = paddle.matmul(query_layer, + key_layer.transpose((0, 1, 3, 2))) + attention_scores = attention_scores / math.sqrt( + self.attention_head_size) + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(axis=-1)(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + context_layer = paddle.matmul(attention_probs, value_layer) + context_layer = context_layer.transpose((0, 2, 1, 3)) + new_context_layer_shape = context_layer.shape[:-2] + [ + self.all_head_size + ] + context_layer = context_layer.reshape(new_context_layer_shape) + + return context_layer, attention_probs + + +class BertSelfOutput(nn.Layer): + def __init__(self, hidden_size, hidden_dropout_prob): + super(BertSelfOutput, self).__init__() + self.dense = nn.Linear(hidden_size, hidden_size) + self.LayerNorm = nn.LayerNorm(hidden_size, epsilon=1e-12) + self.dropout = nn.Dropout(hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertAttention(nn.Layer): + def __init__(self, hidden_size, hidden_dropout_prob, num_attention_heads, + attention_probs_dropout_prob): + super(BertAttention, self).__init__() + self.self = BertSelfAttention(hidden_size, num_attention_heads, + attention_probs_dropout_prob) + self.output = BertSelfOutput(hidden_size, hidden_dropout_prob) + + def forward(self, input_tensor, attention_mask): + self_output, attention_probs = self.self(input_tensor, attention_mask) + attention_output = self.output(self_output, input_tensor) + return attention_output, attention_probs + + +class BertIntermediate(nn.Layer): + def __init__(self, hidden_size, intermediate_size, hidden_act): + super(BertIntermediate, self).__init__() + self.dense = nn.Linear(hidden_size, intermediate_size) + if isinstance(hidden_act, str) or (sys.version_info[0] == 2 + and isinstance(hidden_act, str)): + self.intermediate_act_fn = ACT2FN[hidden_act] + else: + self.intermediate_act_fn = hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class BertOutput(nn.Layer): + def __init__(self, intermediate_size, hidden_size, hidden_dropout_prob): + super(BertOutput, self).__init__() + self.dense = nn.Linear(intermediate_size, hidden_size) + self.LayerNorm = nn.LayerNorm(hidden_size, epsilon=1e-12) + self.dropout = nn.Dropout(hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + 
input_tensor)
+        return hidden_states
+
+
+class BertEntAttention(nn.Layer):
+    """Core module of the tangled transformer.
+    """
+    def __init__(
+        self,
+        hidden_size,
+        v_hidden_size,
+        a_hidden_size,
+        bi_hidden_size,
+        attention_probs_dropout_prob,
+        v_attention_probs_dropout_prob,
+        a_attention_probs_dropout_prob,
+        av_attention_probs_dropout_prob,
+        at_attention_probs_dropout_prob,
+        bi_num_attention_heads,
+    ):
+        super(BertEntAttention, self).__init__()
+        if bi_hidden_size % bi_num_attention_heads != 0:
+            raise ValueError(
+                "The hidden size (%d) is not a multiple of the number of attention "
+                "heads (%d)" % (bi_hidden_size, bi_num_attention_heads))
+
+        self.num_attention_heads = bi_num_attention_heads
+        self.attention_head_size = int(bi_hidden_size / bi_num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        # self attention layers for vision input
+        self.query1 = nn.Linear(v_hidden_size, self.all_head_size)
+        self.key1 = nn.Linear(v_hidden_size, self.all_head_size)
+        self.value1 = nn.Linear(v_hidden_size, self.all_head_size)
+        self.dropout1 = nn.Dropout(v_attention_probs_dropout_prob)
+
+        # self attention layers for text input
+        self.query2 = nn.Linear(hidden_size, self.all_head_size)
+        self.key2 = nn.Linear(hidden_size, self.all_head_size)
+        self.value2 = nn.Linear(hidden_size, self.all_head_size)
+        self.dropout2 = nn.Dropout(attention_probs_dropout_prob)
+
+        # self attention layers for action input
+        self.query3 = nn.Linear(a_hidden_size, self.all_head_size)
+        self.key3 = nn.Linear(a_hidden_size, self.all_head_size)
+        self.value3 = nn.Linear(a_hidden_size, self.all_head_size)
+        self.dropout3 = nn.Dropout(a_attention_probs_dropout_prob)
+
+        # self attention layers for action_text
+        self.key_at = nn.Linear(bi_hidden_size, self.all_head_size)
+        self.value_at = nn.Linear(bi_hidden_size, self.all_head_size)
+        self.dropout_at = nn.Dropout(av_attention_probs_dropout_prob)
+
+        # self attention layers for action_vision
+        self.key_av = nn.Linear(bi_hidden_size, self.all_head_size)
+        self.value_av = nn.Linear(bi_hidden_size, self.all_head_size)
+        self.dropout_av = nn.Dropout(at_attention_probs_dropout_prob)
+
+    def transpose_for_scores(self, x):
+        new_x_shape = x.shape[:-1] + [
+            self.num_attention_heads,
+            self.attention_head_size,
+        ]
+        x = x.reshape(new_x_shape)
+        return x.transpose((0, 2, 1, 3))
+
+    def forward(
+        self,
+        input_tensor1,
+        attention_mask1,
+        input_tensor2,
+        attention_mask2,
+        input_tensor3,
+        attention_mask3,
+    ):
+
+        # for vision input.
+ mixed_query_layer1 = self.query1(input_tensor1) + mixed_key_layer1 = self.key1(input_tensor1) + mixed_value_layer1 = self.value1(input_tensor1) + + query_layer1 = self.transpose_for_scores(mixed_query_layer1) + key_layer1 = self.transpose_for_scores(mixed_key_layer1) + value_layer1 = self.transpose_for_scores(mixed_value_layer1) + + # for text input: + mixed_query_layer2 = self.query2(input_tensor2) + mixed_key_layer2 = self.key2(input_tensor2) + mixed_value_layer2 = self.value2(input_tensor2) + + query_layer2 = self.transpose_for_scores(mixed_query_layer2) + key_layer2 = self.transpose_for_scores(mixed_key_layer2) + value_layer2 = self.transpose_for_scores(mixed_value_layer2) + + # for action input: + mixed_query_layer3 = self.query3(input_tensor3) + mixed_key_layer3 = self.key3(input_tensor3) + mixed_value_layer3 = self.value3(input_tensor3) + + query_layer3 = self.transpose_for_scores(mixed_query_layer3) + key_layer3 = self.transpose_for_scores(mixed_key_layer3) + value_layer3 = self.transpose_for_scores(mixed_value_layer3) + + def do_attention(query_layer, key_layer, value_layer, attention_mask, + dropout): + """ compute attention """ + attention_scores = paddle.matmul(query_layer, + key_layer.transpose((0, 1, 3, 2))) + attention_scores = attention_scores / math.sqrt( + self.attention_head_size) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(axis=-1)(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = dropout(attention_probs) + + context_layer = paddle.matmul(attention_probs, value_layer) + context_layer = context_layer.transpose((0, 2, 1, 3)) + new_context_layer_shape = context_layer.shape[:-2] + [ + self.all_head_size + ] + context_layer = context_layer.reshape(new_context_layer_shape) + return context_layer + + context_av = do_attention(query_layer3, key_layer1, value_layer1, + attention_mask1, self.dropout_av) + context_at = do_attention(query_layer3, key_layer2, value_layer2, + attention_mask2, self.dropout_at) + + context_key_av = self.key_av(context_av).transpose((0, 2, 1)) + # interpolate only support 4-D tensor now. 
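+        # The cross-modal context vectors computed from the action queries are
+        # resized along the sequence axis (via interpolate) to the other
+        # modality's length and added to that modality's keys/values as a
+        # residual, so text and vision attention are conditioned on the action
+        # stream.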
+ context_key_av = F.interpolate(context_key_av.unsqueeze(-1), + size=(key_layer2.shape[2], + 1)).squeeze(-1) + context_key_av = self.transpose_for_scores( + context_key_av.transpose((0, 2, 1))) + key_layer2 = key_layer2 + context_key_av + + context_key_at = self.key_at(context_at).transpose((0, 2, 1)) + context_key_at = F.interpolate(context_key_at.unsqueeze(-1), + size=(key_layer1.shape[2], + 1)).squeeze(-1) + context_key_at = self.transpose_for_scores( + context_key_at.transpose((0, 2, 1))) + key_layer1 = key_layer1 + context_key_at + + context_val_av = self.value_at(context_av).transpose((0, 2, 1)) + context_val_av = F.interpolate(context_val_av.unsqueeze(-1), + size=(value_layer2.shape[2], + 1)).squeeze(-1) + context_val_av = self.transpose_for_scores( + context_val_av.transpose((0, 2, 1))) + value_layer2 = value_layer2 + context_val_av + + context_val_at = self.value_at(context_at).transpose((0, 2, 1)) + context_val_at = F.interpolate(context_val_at.unsqueeze(-1), + size=(value_layer1.shape[2], + 1)).squeeze(-1) + context_val_at = self.transpose_for_scores( + context_val_at.transpose((0, 2, 1))) + value_layer1 = value_layer1 + context_val_at + + context_layer1 = do_attention(query_layer1, key_layer1, value_layer1, + attention_mask1, self.dropout1) + context_layer2 = do_attention(query_layer2, key_layer2, value_layer2, + attention_mask2, self.dropout2) + context_layer3 = do_attention(query_layer3, key_layer3, value_layer3, + attention_mask3, self.dropout3) + + return context_layer1, context_layer2, context_layer3 # vision, text, action + + +class BertEntOutput(nn.Layer): + def __init__( + self, + bi_hidden_size, + hidden_size, + v_hidden_size, + v_hidden_dropout_prob, + hidden_dropout_prob, + ): + super(BertEntOutput, self).__init__() + + self.dense1 = nn.Linear(bi_hidden_size, v_hidden_size) + self.LayerNorm1 = nn.LayerNorm(v_hidden_size, epsilon=1e-12) + self.dropout1 = nn.Dropout(v_hidden_dropout_prob) + + self.dense2 = nn.Linear(bi_hidden_size, hidden_size) + self.LayerNorm2 = nn.LayerNorm(hidden_size, epsilon=1e-12) + self.dropout2 = nn.Dropout(hidden_dropout_prob) + + self.dense3 = nn.Linear(bi_hidden_size, hidden_size) + self.LayerNorm3 = nn.LayerNorm(hidden_size, epsilon=1e-12) + self.dropout3 = nn.Dropout(hidden_dropout_prob) + + def forward( + self, + hidden_states1, + input_tensor1, + hidden_states2, + input_tensor2, + hidden_states3, + input_tensor3, + ): + context_state1 = self.dense1(hidden_states1) + context_state1 = self.dropout1(context_state1) + + context_state2 = self.dense2(hidden_states2) + context_state2 = self.dropout2(context_state2) + + context_state3 = self.dense3(hidden_states3) + context_state3 = self.dropout3(context_state3) + + hidden_states1 = self.LayerNorm1(context_state1 + input_tensor1) + hidden_states2 = self.LayerNorm2(context_state2 + input_tensor2) + hidden_states3 = self.LayerNorm3(context_state3 + input_tensor3) + + return hidden_states1, hidden_states2, hidden_states3 + + +class BertLayer(nn.Layer): + def __init__(self, hidden_size, intermediate_size, hidden_act, + hidden_dropout_prob, num_attention_heads, + attention_probs_dropout_prob): + super(BertLayer, self).__init__() + self.attention = BertAttention(hidden_size, hidden_dropout_prob, + num_attention_heads, + attention_probs_dropout_prob) + self.intermediate = BertIntermediate(hidden_size, intermediate_size, + hidden_act) + self.output = BertOutput(intermediate_size, hidden_size, + hidden_dropout_prob) + + def forward(self, hidden_states, attention_mask): + attention_output, 
attention_probs = self.attention( + hidden_states, attention_mask) + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output, attention_probs + + +class BertConnectionLayer(nn.Layer): + def __init__(self, hidden_size, v_hidden_size, a_hidden_size, + bi_hidden_size, bi_num_attention_heads, + attention_probs_dropout_prob, v_attention_probs_dropout_prob, + a_attention_probs_dropout_prob, + av_attention_probs_dropout_prob, + at_attention_probs_dropout_prob, intermediate_size, + v_intermediate_size, a_intermediate_size, hidden_act, + v_hidden_act, a_hidden_act, hidden_dropout_prob, + v_hidden_dropout_prob, a_hidden_dropout_prob): + super(BertConnectionLayer, self).__init__() + self.ent_attention = BertEntAttention( + hidden_size, + v_hidden_size, + a_hidden_size, + bi_hidden_size, + attention_probs_dropout_prob, + v_attention_probs_dropout_prob, + a_attention_probs_dropout_prob, + av_attention_probs_dropout_prob, + at_attention_probs_dropout_prob, + bi_num_attention_heads, + ) + + self.ent_output = BertEntOutput( + bi_hidden_size, + hidden_size, + v_hidden_size, + v_hidden_dropout_prob, + hidden_dropout_prob, + ) + + self.v_intermediate = BertIntermediate(v_hidden_size, + v_intermediate_size, + v_hidden_act) + self.v_output = BertOutput(v_intermediate_size, v_hidden_size, + v_hidden_dropout_prob) + + self.t_intermediate = BertIntermediate(hidden_size, intermediate_size, + hidden_act) + self.t_output = BertOutput(intermediate_size, hidden_size, + hidden_dropout_prob) + + self.a_intermediate = BertIntermediate(a_hidden_size, + a_intermediate_size, + a_hidden_act) + self.a_output = BertOutput(a_intermediate_size, a_hidden_size, + a_hidden_dropout_prob) + + def forward( + self, + input_tensor1, + attention_mask1, + input_tensor2, + attention_mask2, + input_tensor3, + attention_mask3, + ): + + ent_output1, ent_output2, ent_output3 = self.ent_attention( + input_tensor1, attention_mask1, input_tensor2, attention_mask2, + input_tensor3, attention_mask3) + + attention_output1, attention_output2, attention_output3 = self.ent_output( + ent_output1, input_tensor1, ent_output2, input_tensor2, ent_output3, + input_tensor3) + + intermediate_output1 = self.v_intermediate(attention_output1) + layer_output1 = self.v_output(intermediate_output1, attention_output1) + + intermediate_output2 = self.t_intermediate(attention_output2) + layer_output2 = self.t_output(intermediate_output2, attention_output2) + + intermediate_output3 = self.a_intermediate(attention_output3) + layer_output3 = self.a_output(intermediate_output3, attention_output3) + + return layer_output1, layer_output2, layer_output3 + + +class BertEncoder(nn.Layer): + """ + ActBert Encoder, consists 3 pathway of multi-BertLayers and BertConnectionLayer. 
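+    Each pathway (text, vision, action) stacks its own BertLayers; at the layer
+    indices given by *_ent_attention_id a BertConnectionLayer exchanges
+    information across the three pathways.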
+    """
+    def __init__(
+        self,
+        v_ent_attention_id,
+        t_ent_attention_id,
+        a_ent_attention_id,
+        fixed_t_layer,
+        fixed_v_layer,
+        hidden_size,
+        v_hidden_size,
+        a_hidden_size,
+        bi_hidden_size,
+        intermediate_size,
+        v_intermediate_size,
+        a_intermediate_size,
+        hidden_act,
+        v_hidden_act,
+        a_hidden_act,
+        hidden_dropout_prob,
+        v_hidden_dropout_prob,
+        a_hidden_dropout_prob,
+        attention_probs_dropout_prob,
+        v_attention_probs_dropout_prob,
+        a_attention_probs_dropout_prob,
+        av_attention_probs_dropout_prob,
+        at_attention_probs_dropout_prob,
+        num_attention_heads,
+        v_num_attention_heads,
+        a_num_attention_heads,
+        bi_num_attention_heads,
+        num_hidden_layers,
+        v_num_hidden_layers,
+        a_num_hidden_layers,
+    ):
+        super(BertEncoder, self).__init__()
+        self.v_ent_attention_id = v_ent_attention_id
+        self.t_ent_attention_id = t_ent_attention_id
+        self.a_ent_attention_id = a_ent_attention_id
+        self.fixed_t_layer = fixed_t_layer
+        self.fixed_v_layer = fixed_v_layer
+
+        layer = BertLayer(hidden_size, intermediate_size, hidden_act,
+                          hidden_dropout_prob, num_attention_heads,
+                          attention_probs_dropout_prob)
+        v_layer = BertLayer(v_hidden_size, v_intermediate_size, v_hidden_act,
+                            v_hidden_dropout_prob, v_num_attention_heads,
+                            v_attention_probs_dropout_prob)
+        a_layer = BertLayer(a_hidden_size, a_intermediate_size, a_hidden_act,
+                            a_hidden_dropout_prob, a_num_attention_heads,
+                            a_attention_probs_dropout_prob)
+        connect_layer = BertConnectionLayer(
+            hidden_size, v_hidden_size, a_hidden_size, bi_hidden_size,
+            bi_num_attention_heads, attention_probs_dropout_prob,
+            v_attention_probs_dropout_prob, a_attention_probs_dropout_prob,
+            av_attention_probs_dropout_prob, at_attention_probs_dropout_prob,
+            intermediate_size, v_intermediate_size, a_intermediate_size,
+            hidden_act, v_hidden_act, a_hidden_act, hidden_dropout_prob,
+            v_hidden_dropout_prob, a_hidden_dropout_prob)
+
+        self.layer = nn.LayerList(
+            [copy.deepcopy(layer) for _ in range(num_hidden_layers)])  #12
+        self.v_layer = nn.LayerList(
+            [copy.deepcopy(v_layer) for _ in range(v_num_hidden_layers)])  #2
+        self.a_layer = nn.LayerList(
+            [copy.deepcopy(a_layer) for _ in range(a_num_hidden_layers)])  #3
+        self.c_layer = nn.LayerList([
+            copy.deepcopy(connect_layer) for _ in range(len(v_ent_attention_id))
+        ]  #2 [0,1]
+        )
+
+    def forward(
+        self,
+        txt_embedding,
+        image_embedding,
+        action_embedding,
+        txt_attention_mask,
+        image_attention_mask,
+        action_attention_mask,
+        output_all_encoded_layers=True,
+    ):
+        v_start, a_start, t_start = 0, 0, 0
+        count = 0
+        all_encoder_layers_t = []
+        all_encoder_layers_v = []
+        all_encoder_layers_a = []
+
+        for v_layer_id, a_layer_id, t_layer_id in zip(self.v_ent_attention_id,
+                                                      self.a_ent_attention_id,
+                                                      self.t_ent_attention_id):
+            v_end = v_layer_id
+            a_end = a_layer_id
+            t_end = t_layer_id
+
+            assert self.fixed_t_layer <= t_end
+            assert self.fixed_v_layer <= v_end
+
+            ### region embedding
+            for idx in range(v_start,
+                             self.fixed_v_layer):  # never entered in either training run; the first fixed_v_layer layers stay frozen
+                with paddle.no_grad():
+                    image_embedding, image_attention_probs = self.v_layer[idx](
+                        image_embedding, image_attention_mask)
+                v_start = self.fixed_v_layer
+            for idx in range(v_start, v_end):
+                image_embedding, image_attention_probs = self.v_layer[idx](
+                    image_embedding, image_attention_mask)
+
+            ### action embedding
+            for idx in range(a_start, a_end):
+                action_embedding, action_attention_probs = self.a_layer[idx](
+                    action_embedding, action_attention_mask)
+
+            ### text embedding
+            for idx in range(t_start, self.fixed_t_layer):
+                with
paddle.no_grad(): + txt_embedding, txt_attention_probs = self.layer[idx]( + txt_embedding, txt_attention_mask) + t_start = self.fixed_t_layer + for idx in range(t_start, t_end): + txt_embedding, txt_attention_probs = self.layer[idx]( + txt_embedding, txt_attention_mask) + + image_embedding, txt_embedding, action_embedding = self.c_layer[ + count](image_embedding, image_attention_mask, txt_embedding, + txt_attention_mask, action_embedding, + action_attention_mask) + + v_start = v_end + t_start = t_end + a_start = a_end + count += 1 + + if output_all_encoded_layers: + all_encoder_layers_t.append(txt_embedding) + all_encoder_layers_v.append(image_embedding) + all_encoder_layers_a.append(action_embedding) + + for idx in range(v_start, len(self.v_layer)): # 1 + image_embedding, image_attention_probs = self.v_layer[idx]( + image_embedding, image_attention_mask) + + for idx in range(a_start, len(self.a_layer)): + action_embedding, action_attention_probs = self.a_layer[idx]( + action_embedding, action_attention_mask) + + for idx in range(t_start, len(self.layer)): + txt_embedding, txt_attention_probs = self.layer[idx]( + txt_embedding, txt_attention_mask) + + # add the end part to finish. + if not output_all_encoded_layers: + all_encoder_layers_t.append(txt_embedding) #8, 36, 768 + all_encoder_layers_v.append(image_embedding) #8, 37, 1024 + all_encoder_layers_a.append(action_embedding) #8, 5, 768 + + return all_encoder_layers_t, all_encoder_layers_v, all_encoder_layers_a + + +class BertPooler(nn.Layer): + """ "Pool" the model by simply taking the hidden state corresponding + to the first token. + """ + def __init__(self, hidden_size, bi_hidden_size): + super(BertPooler, self).__init__() + self.dense = nn.Linear(hidden_size, bi_hidden_size) + self.activation = nn.ReLU() + + def forward(self, hidden_states): + first_token_tensor = hidden_states[:, 0] #8, 768 + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class BertModel(nn.Layer): + def __init__( + self, + vocab_size, + max_position_embeddings, + type_vocab_size, + v_feature_size, + a_feature_size, + num_hidden_layers, + v_num_hidden_layers, + a_num_hidden_layers, + v_ent_attention_id, + t_ent_attention_id, + a_ent_attention_id, + fixed_t_layer, + fixed_v_layer, + hidden_size, + v_hidden_size, + a_hidden_size, + bi_hidden_size, + intermediate_size, + v_intermediate_size, + a_intermediate_size, + hidden_act, + v_hidden_act, + a_hidden_act, + hidden_dropout_prob, + v_hidden_dropout_prob, + a_hidden_dropout_prob, + attention_probs_dropout_prob, + v_attention_probs_dropout_prob, + a_attention_probs_dropout_prob, + av_attention_probs_dropout_prob, + at_attention_probs_dropout_prob, + num_attention_heads, + v_num_attention_heads, + a_num_attention_heads, + bi_num_attention_heads, + ): + super(BertModel, self).__init__() + # initilize word embedding + self.embeddings = BertEmbeddings(vocab_size, max_position_embeddings, + type_vocab_size, hidden_size, + hidden_dropout_prob) + # initlize the region embedding + self.v_embeddings = BertImageEmbeddings(v_feature_size, v_hidden_size, + v_hidden_dropout_prob) + # initlize the action embedding + self.a_embeddings = BertActionEmbeddings(a_feature_size, a_hidden_size, + a_hidden_dropout_prob) + + self.encoder = BertEncoder( + v_ent_attention_id, t_ent_attention_id, a_ent_attention_id, + fixed_t_layer, fixed_v_layer, hidden_size, v_hidden_size, + a_hidden_size, bi_hidden_size, intermediate_size, + v_intermediate_size, a_intermediate_size, 
hidden_act, v_hidden_act,
+            a_hidden_act, hidden_dropout_prob, v_hidden_dropout_prob,
+            a_hidden_dropout_prob, attention_probs_dropout_prob,
+            v_attention_probs_dropout_prob, a_attention_probs_dropout_prob,
+            av_attention_probs_dropout_prob, at_attention_probs_dropout_prob,
+            num_attention_heads, v_num_attention_heads, a_num_attention_heads,
+            bi_num_attention_heads, num_hidden_layers, v_num_hidden_layers,
+            a_num_hidden_layers)
+
+        self.t_pooler = BertPooler(hidden_size, bi_hidden_size)
+        self.v_pooler = BertPooler(v_hidden_size, bi_hidden_size)
+        self.a_pooler = BertPooler(a_hidden_size, bi_hidden_size)
+
+    def forward(
+        self,
+        text_ids,
+        action_feat,
+        image_feat,
+        image_loc,
+        token_type_ids=None,
+        text_mask=None,
+        image_mask=None,
+        action_mask=None,
+        output_all_encoded_layers=False,
+    ):
+        """
+        text_ids: input text ids. Shape: [batch_size, sequence_length]
+        action_feat: input action feature. Shape: [batch_size, action_length, action_feature_dim]
+        image_feat: input image feature. Shape: [batch_size, region_length, image_feature_dim]
+        image_loc: input region location. Shape: [batch_size, region_length, region_location_dim]
+        token_type_ids: segment ids of each video clip. Shape: [batch_size, sequence_length]
+        text_mask: text mask, 1 for real tokens and 0 for padding tokens. Shape: [batch_size, sequence_length]
+        image_mask: image mask, 1 for real tokens and 0 for padding tokens. Shape: [batch_size, region_length]
+        action_mask: action mask, 1 for real tokens and 0 for padding tokens. Shape: [batch_size, action_length]
+        output_all_encoded_layers: whether to return features from all encoder layers. Type: Bool.
+        """
+        if text_mask is None:
+            text_mask = paddle.ones_like(text_ids)
+        if token_type_ids is None:
+            token_type_ids = paddle.zeros_like(text_ids)
+        if image_mask is None:
+            image_mask = paddle.ones(
+                [image_feat.shape[0], image_feat.shape[1]]).astype(text_ids.dtype)
+        if action_mask is None:
+            action_mask = paddle.ones(
+                [action_feat.shape[0], action_feat.shape[1]]).astype(text_ids.dtype)
+
+        # We create a 3D attention mask from a 2D tensor mask.
+        # Sizes are [batch_size, 1, 1, to_seq_length]
+        # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length].
+        extended_text_mask = text_mask.unsqueeze(1).unsqueeze(2)
+        extended_image_mask = image_mask.unsqueeze(1).unsqueeze(2)
+        extended_action_mask = action_mask.unsqueeze(1).unsqueeze(2)
+
+        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+        # masked positions, this operation will create a tensor which is 0.0 for
+        # positions we want to attend and -10000.0 for masked positions.
+        # Since we are adding it to the raw scores before the softmax, this is
+        # effectively the same as removing these entirely.
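+        # For example, a mask [1, 1, 0] becomes [0.0, 0.0, -10000.0]: attended
+        # positions are unchanged, while masked positions get effectively zero
+        # probability after the softmax.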
+ def set_mask(extended_attention_mask): + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + return extended_attention_mask + + extended_text_mask = set_mask(extended_text_mask) + extended_image_mask = set_mask(extended_image_mask) + extended_action_mask = set_mask(extended_action_mask) + + t_embedding_output = self.embeddings(text_ids, token_type_ids) + v_embedding_output = self.v_embeddings(image_feat, image_loc) + a_embedding_output = self.a_embeddings(action_feat) + + # var = [t_embedding_output, v_embedding_output, a_embedding_output] + # import numpy as np + # for i, item in enumerate(var): + # np.save('tmp/' + str(i)+'.npy', item.numpy()) + + encoded_layers_t, encoded_layers_v, encoded_layers_a = self.encoder( + t_embedding_output, + v_embedding_output, + a_embedding_output, + extended_text_mask, + extended_image_mask, + extended_action_mask, + output_all_encoded_layers=output_all_encoded_layers, + ) + + sequence_output_t = encoded_layers_t[-1] #get item from list + sequence_output_v = encoded_layers_v[-1] + sequence_output_a = encoded_layers_a[-1] + + pooled_output_t = self.t_pooler(sequence_output_t) + pooled_output_v = self.v_pooler(sequence_output_v) + pooled_output_a = self.a_pooler(sequence_output_a) + + if not output_all_encoded_layers: + encoded_layers_t = encoded_layers_t[-1] + encoded_layers_v = encoded_layers_v[-1] + encoded_layers_a = encoded_layers_a[-1] + + return encoded_layers_t, encoded_layers_v, encoded_layers_a, \ + pooled_output_t, pooled_output_v, pooled_output_a + + +# For Head +class BertPredictionHeadTransform(nn.Layer): + def __init__(self, hidden_size, hidden_act): + super(BertPredictionHeadTransform, self).__init__() + self.dense = nn.Linear(hidden_size, hidden_size) + if isinstance(hidden_act, str) or (sys.version_info[0] == 2 + and isinstance(hidden_act, str)): + self.transform_act_fn = ACT2FN[hidden_act] + else: + self.transform_act_fn = hidden_act + self.LayerNorm = nn.LayerNorm(hidden_size, epsilon=1e-12) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class BertLMPredictionHead(nn.Layer): + def __init__(self, hidden_size, hidden_act, bert_model_embedding_weights): + super(BertLMPredictionHead, self).__init__() + self.transform = BertPredictionHeadTransform(hidden_size, hidden_act) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. 
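+        # The decoder weight is tied to the word-embedding matrix (used
+        # transposed in forward); only the per-token bias created below is a
+        # new parameter.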
+ assert bert_model_embedding_weights.shape[1] == hidden_size + vocab_size = bert_model_embedding_weights.shape[0] + + # another implementation which would create another big params: + # self.decoder = nn.Linear(hidden_size, vocab_size) # NOTE bias default: constant 0.0 + # self.decoder.weight = self.create_parameter(shape=[hidden_size, vocab_size], + # default_initializer=nn.initializer.Assign( + # bert_model_embedding_weights.t())) # transpose + + self.decoder_weight = bert_model_embedding_weights + self.decoder_bias = self.create_parameter( + shape=[vocab_size], + dtype=bert_model_embedding_weights.dtype, + is_bias=True) # NOTE bias default: constant 0.0 + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = paddle.tensor.matmul( + hidden_states, self.decoder_weight, + transpose_y=True) + self.decoder_bias + return hidden_states + + +class BertImageActionPredictionHead(nn.Layer): + def __init__(self, hidden_size, hidden_act, target_size): + super(BertImageActionPredictionHead, self).__init__() + self.transform = BertPredictionHeadTransform(hidden_size, hidden_act) + + self.decoder = nn.Linear(hidden_size, target_size) + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +class BertPreTrainingHeads(nn.Layer): + def __init__(self, hidden_size, v_hidden_size, a_hidden_size, + bi_hidden_size, hidden_act, v_hidden_act, a_hidden_act, + v_target_size, a_target_size, fusion_method, + bert_model_embedding_weights): + super(BertPreTrainingHeads, self).__init__() + self.predictions = BertLMPredictionHead(hidden_size, hidden_act, + bert_model_embedding_weights) + self.seq_relationship = nn.Linear(bi_hidden_size, 2) + self.imagePredictions = BertImageActionPredictionHead( + v_hidden_size, v_hidden_act, v_target_size) # visual class number + self.actionPredictions = BertImageActionPredictionHead( + a_hidden_size, a_hidden_act, a_target_size) # action class number + self.fusion_method = fusion_method + self.dropout = nn.Dropout(0.1) + + def forward(self, sequence_output_t, sequence_output_v, sequence_output_a, + pooled_output_t, pooled_output_v, pooled_output_a): + + if self.fusion_method == 'sum': + pooled_output = self.dropout(pooled_output_t + pooled_output_v + + pooled_output_a) + elif self.fusion_method == 'mul': + pooled_output = self.dropout(pooled_output_t * pooled_output_v + + pooled_output_a) + else: + assert False + + prediction_scores_t = self.predictions( + sequence_output_t) # 8, 36 ,30522 + seq_relationship_score = self.seq_relationship(pooled_output) # 8, 2 + prediction_scores_v = self.imagePredictions( + sequence_output_v) # 8, 37, 1601 + prediction_scores_a = self.actionPredictions( + sequence_output_a) # 8, 5, 401 + + return prediction_scores_t, prediction_scores_v, prediction_scores_a, seq_relationship_score + + +@BACKBONES.register() +class BertForMultiModalPreTraining(nn.Layer): + """BERT model with multi modal pre-training heads. 
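+    Combines the three-stream BertModel with BertPreTrainingHeads: a masked
+    language-model head on text, classification heads on the visual and action
+    streams, and a relationship head on the fused pooled output.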
+ """ + def __init__( + self, + vocab_size=30522, + max_position_embeddings=512, + type_vocab_size=2, + v_target_size=1601, + a_target_size=700, + v_feature_size=2048, + a_feature_size=2048, + num_hidden_layers=12, + v_num_hidden_layers=2, + a_num_hidden_layers=3, + t_ent_attention_id=[10, 11], + v_ent_attention_id=[0, 1], + a_ent_attention_id=[0, 1], + fixed_t_layer=0, + fixed_v_layer=0, + hidden_size=768, + v_hidden_size=1024, + a_hidden_size=768, + bi_hidden_size=1024, + intermediate_size=3072, + v_intermediate_size=1024, + a_intermediate_size=3072, + hidden_act="gelu", + v_hidden_act="gelu", + a_hidden_act="gelu", + hidden_dropout_prob=0.1, + v_hidden_dropout_prob=0.1, + a_hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + v_attention_probs_dropout_prob=0.1, + a_attention_probs_dropout_prob=0.1, + av_attention_probs_dropout_prob=0.1, + at_attention_probs_dropout_prob=0.1, + num_attention_heads=12, + v_num_attention_heads=8, + a_num_attention_heads=12, + bi_num_attention_heads=8, + fusion_method="mul", + pretrained=None, + ): + """ + vocab_size: vocabulary size. Default: 30522. + max_position_embeddings: max position id. Default: 512. + type_vocab_size: max segment id. Default: 2. + v_target_size: class number of visual word. Default: 1601. + a_target_size: class number of action word. Default: 700. + v_feature_size: input visual feature dimension. Default: 2048. + a_feature_size: input action feature dimension. Default: 2048. + num_hidden_layers: number of BertLayer in text transformer. Default: 12. + v_num_hidden_layers: number of BertLayer in visual transformer. Default: 2. + a_num_hidden_layers: number of BertLayer in action transformer. Default:3. + t_ent_attention_id: index id of BertConnectionLayer in text transformer. Default: [10, 11]. + v_ent_attention_id: index id of BertConnectionLayer in visual transformer. Default:[0, 1]. + a_ent_attention_id: index id of BertConnectionLayer in action transformer. Default:[0, 1]. + fixed_t_layer: index id of fixed BertLayer in text transformer. Default: 0. + fixed_v_layer: index id of fixed BertLayer in visual transformer. Default: 0. + hidden_size: hidden size in text BertLayer. Default: 768. + v_hidden_size: hidden size in visual BertLayer. Default: 1024. + a_hidden_size: hidden size in action BertLayer. Default: 768. + bi_hidden_size: hidden size in BertConnectionLayer. Default: 1024, + intermediate_size: intermediate size in text BertLayer. Default: 3072. + v_intermediate_size: intermediate size in visual BertLayer. Default: 1024. + a_intermediate_size: intermediate size in text BertLayer. Default: 3072. + hidden_act: hidden activation function in text BertLayer. Default: "gelu". + v_hidden_act: hidden activation function in visual BertLayer. Default: "gelu". + a_hidden_act: hidden activation function in action BertLayer. Default: "gelu". + hidden_dropout_prob: hidden dropout probability in text Embedding Layer. Default: 0.1 + v_hidden_dropout_prob: hidden dropout probability in visual Embedding Layer. Default: 0.1 + a_hidden_dropout_prob: hidden dropout probability in action Embedding Layer. Default: 0.1 + attention_probs_dropout_prob: attention dropout probability in text BertLayer. Default: 0.1 + v_attention_probs_dropout_prob: attention dropout probability in visual BertLayer. Default: 0.1 + a_attention_probs_dropout_prob: attention dropout probability in action BertLayer. Default: 0.1 + av_attention_probs_dropout_prob: attention dropout probability in action-visual BertConnectionLayer. 
Default: 0.1 + at_attention_probs_dropout_prob: attention dropout probability in action-text BertConnectionLayer. Default: 0.1 + num_attention_heads: number of heads in text BertLayer. Default: 12. + v_num_attention_heads: number of heads in visual BertLayer. Default: 8. + a_num_attention_heads: number of heads in action BertLayer. Default: 12. + bi_num_attention_heads: number of heads in BertConnectionLayer. Default: 8. + fusion_method: methods of fusing pooled output from 3 transformer. Default: "mul". + """ + super(BertForMultiModalPreTraining, self).__init__() + self.pretrained = pretrained + self.vocab_size = vocab_size + self.a_target_size = a_target_size + + self.bert = BertModel( + vocab_size, + max_position_embeddings, + type_vocab_size, + v_feature_size, + a_feature_size, + num_hidden_layers, + v_num_hidden_layers, + a_num_hidden_layers, + v_ent_attention_id, + t_ent_attention_id, + a_ent_attention_id, + fixed_t_layer, + fixed_v_layer, + hidden_size, + v_hidden_size, + a_hidden_size, + bi_hidden_size, + intermediate_size, + v_intermediate_size, + a_intermediate_size, + hidden_act, + v_hidden_act, + a_hidden_act, + hidden_dropout_prob, + v_hidden_dropout_prob, + a_hidden_dropout_prob, + attention_probs_dropout_prob, + v_attention_probs_dropout_prob, + a_attention_probs_dropout_prob, + av_attention_probs_dropout_prob, + at_attention_probs_dropout_prob, + num_attention_heads, + v_num_attention_heads, + a_num_attention_heads, + bi_num_attention_heads, + ) + self.cls = BertPreTrainingHeads( + hidden_size, v_hidden_size, a_hidden_size, bi_hidden_size, + hidden_act, v_hidden_act, a_hidden_act, v_target_size, + a_target_size, fusion_method, + self.bert.embeddings.word_embeddings.weight) + + def init_weights(self): + """Initiate the parameters. + """ + if isinstance(self.pretrained, str) and self.pretrained.strip() != "": + load_ckpt(self, self.pretrained) + elif self.pretrained is None or self.pretrained.strip() == "": + for layer in self.sublayers(): + if isinstance(layer, (nn.Linear, nn.Embedding)): + weight_init_(layer, 'Normal', std=0.02) + elif isinstance(layer, nn.LayerNorm): + weight_init_(layer, 'Constant', value=1) + + def forward( + self, + text_ids, #8,36 + action_feat, #8,5,2048 + image_feat, #8,37,2048 + image_loc, #8,37,5 + token_type_ids=None, #8,36 + text_mask=None, #8,36 + image_mask=None, #8,37 + action_mask=None, #8,5 + ): + """ + text_ids: input text ids. Shape: [batch_size, seqence_length] + action_feat: input action feature. Shape: [batch_size, action_length, action_feature_dim] + image_feat: input image feature. Shape: [batch_size, region_length+1, image_feature_dim]], add 1 for image global feature. + image_loc: input region location. Shape: [batch_size, region_length+1, region_location_dim], add 1 for image global feature location. + token_type_ids: segment ids of each video clip. Shape: [batch_size, seqence_length] + text_mask: text mask, 1 for real tokens and 0 for padding tokens. Shape: [batch_size, seqence_length] + image_mask: image mask, 1 for real tokens and 0 for padding tokens. Shape: [batch_size, region_length] + action_mask: action mask, 1 for real tokens and 0 for padding tokens. 
Shape: [batch_size, action_length] + """ + sequence_output_t, sequence_output_v, sequence_output_a, \ + pooled_output_t, pooled_output_v, pooled_output_a = self.bert( + text_ids, + action_feat, + image_feat, + image_loc, + token_type_ids, + text_mask, + image_mask, + action_mask, + output_all_encoded_layers=False, + ) + + prediction_scores_t, prediction_scores_v, prediction_scores_a, seq_relationship_score = self.cls( + sequence_output_t, sequence_output_v, sequence_output_a, + pooled_output_t, pooled_output_v, pooled_output_a) + + return prediction_scores_t, prediction_scores_v, prediction_scores_a, seq_relationship_score diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/adds.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/adds.py new file mode 100644 index 0000000..21cd212 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/adds.py @@ -0,0 +1,1146 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from collections import OrderedDict + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import BatchNorm2D, Conv2D +from paddle.nn.initializer import Constant, Normal +from paddle.vision.models import ResNet + +from ...utils import load_ckpt +from ..registry import BACKBONES +from ..weight_init import kaiming_normal_, _calculate_fan_in_and_fan_out + +zeros_ = Constant(value=0.) +ones_ = Constant(value=1.) +normal_ = Normal(mean=0, std=1e-3) + + +def disp_to_depth(disp, min_depth, max_depth): + """Convert network's sigmoid output into depth prediction + The formula for this conversion is given in the 'additional considerations' + section of the paper. 
+ """ + min_disp = 1 / max_depth + max_disp = 1 / min_depth + scaled_disp = min_disp + (max_disp - min_disp) * disp + depth = 1 / scaled_disp + return scaled_disp, depth + + +def gram_matrix(y): + (b, ch, h, w) = y.shape + features = y.reshape([b, ch, w * h]) + features_t = paddle.transpose(features, [0, 2, 1]) + gram = features.bmm(features_t) / (ch * h * w) + return gram + + +def convt_bn_relu(in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + output_padding=0, + bn=True, + relu=True): + bias = not bn + layers = [] + layers.append( + nn.Conv2DTranspose(in_channels, + out_channels, + kernel_size, + stride, + padding, + output_padding, + bias_attr=bias)) + if bn: + layers.append(nn.BatchNorm2D(out_channels)) + + if relu: + layers.append(nn.LeakyReLU(0.2)) + layers = nn.Sequential(*layers) + + # initialize the weights + for m in layers.sublayers(include_self=True): + if isinstance(m, nn.Conv2DTranspose): + normal_(m.weight) + if m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.BatchNorm2D): + ones_(m.weight) + zeros_(m.bias) + return layers + + +def transformation_from_parameters(axisangle, translation, invert=False): + """Convert the network's (axisangle, translation) output into a 4x4 matrix + """ + R = rot_from_axisangle(axisangle) + t = translation.clone() + + if invert: + R = R.transpose([0, 2, 1]) + t *= -1 + + T = get_translation_matrix(t) + + if invert: + M = paddle.matmul(R, T) + else: + M = paddle.matmul(T, R) + + return M + + +def get_translation_matrix(translation_vector): + """Convert a translation vector into a 4x4 transformation matrix + """ + t = translation_vector.reshape([-1, 3, 1]) + gather_object = paddle.stack([ + paddle.zeros([ + translation_vector.shape[0], + ], paddle.float32), + paddle.ones([ + translation_vector.shape[0], + ], paddle.float32), + paddle.squeeze(t[:, 0], axis=-1), + paddle.squeeze(t[:, 1], axis=-1), + paddle.squeeze(t[:, 2], axis=-1), + ]) + gather_index = paddle.to_tensor([ + [1], + [0], + [0], + [2], + [0], + [1], + [0], + [3], + [0], + [0], + [1], + [4], + [0], + [0], + [0], + [1], + ]) + T = paddle.gather_nd(gather_object, gather_index) + T = T.reshape([4, 4, -1]).transpose((2, 0, 1)) + return T + + +def rot_from_axisangle(vec): + """Convert an axisangle rotation into a 4x4 transformation matrix + (adapted from https://github.com/Wallacoloo/printipi) + Input 'vec' has to be Bx1x3 + """ + angle = paddle.norm(vec, 2, 2, True) + axis = vec / (angle + 1e-7) + + ca = paddle.cos(angle) + sa = paddle.sin(angle) + C = 1 - ca + + x = axis[..., 0].unsqueeze(1) + y = axis[..., 1].unsqueeze(1) + z = axis[..., 2].unsqueeze(1) + + xs = x * sa + ys = y * sa + zs = z * sa + xC = x * C + yC = y * C + zC = z * C + xyC = x * yC + yzC = y * zC + zxC = z * xC + + gather_object = paddle.stack([ + paddle.squeeze(x * xC + ca, axis=(-1, -2)), + paddle.squeeze(xyC - zs, axis=(-1, -2)), + paddle.squeeze(zxC + ys, axis=(-1, -2)), + paddle.squeeze(xyC + zs, axis=(-1, -2)), + paddle.squeeze(y * yC + ca, axis=(-1, -2)), + paddle.squeeze(yzC - xs, axis=(-1, -2)), + paddle.squeeze(zxC - ys, axis=(-1, -2)), + paddle.squeeze(yzC + xs, axis=(-1, -2)), + paddle.squeeze(z * zC + ca, axis=(-1, -2)), + paddle.ones([ + vec.shape[0], + ], dtype=paddle.float32), + paddle.zeros([ + vec.shape[0], + ], dtype=paddle.float32) + ]) + gather_index = paddle.to_tensor([ + [0], + [1], + [2], + [10], + [3], + [4], + [5], + [10], + [6], + [7], + [8], + [10], + [10], + [10], + [10], + [9], + ]) + rot = paddle.gather_nd(gather_object, gather_index) + rot = 
rot.reshape([4, 4, -1]).transpose((2, 0, 1)) + return rot + + +def upsample(x): + """Upsample input tensor by a factor of 2 + """ + return F.interpolate(x, scale_factor=2, mode="nearest") + + +def get_smooth_loss(disp, img): + """Computes the smoothness loss for a disparity image + The color image is used for edge-aware smoothness + """ + grad_disp_x = paddle.abs(disp[:, :, :, :-1] - disp[:, :, :, 1:]) + grad_disp_y = paddle.abs(disp[:, :, :-1, :] - disp[:, :, 1:, :]) + + grad_img_x = paddle.mean(paddle.abs(img[:, :, :, :-1] - img[:, :, :, 1:]), + 1, + keepdim=True) + grad_img_y = paddle.mean(paddle.abs(img[:, :, :-1, :] - img[:, :, 1:, :]), + 1, + keepdim=True) + + grad_disp_x *= paddle.exp(-grad_img_x) + grad_disp_y *= paddle.exp(-grad_img_y) + + return grad_disp_x.mean() + grad_disp_y.mean() + + +def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1): + """3x3 convolution with padding""" + return nn.Conv2D(in_planes, + out_planes, + kernel_size=3, + stride=stride, + padding=dilation, + groups=groups, + bias_attr=False, + dilation=dilation) + + +def conv1x1(in_planes, out_planes, stride=1): + """1x1 convolution""" + return nn.Conv2D(in_planes, + out_planes, + kernel_size=1, + stride=stride, + bias_attr=False) + + +def resnet_multiimage_input(num_layers, num_input_images=1): + """Constructs a ResNet model. + Args: + num_layers (int): Number of resnet layers. Must be 18 or 50 + pretrained (bool): If True, returns a model pre-trained on ImageNet + num_input_images (int): Number of frames stacked as input + """ + assert num_layers in [18, 50], "Can only run with 18 or 50 layer resnet" + blocks = {18: [2, 2, 2, 2], 50: [3, 4, 6, 3]}[num_layers] + + block_type = {18: BasicBlock, 50: Bottleneck}[num_layers] + + model = ResNetMultiImageInput(block_type, + num_layers, + blocks, + num_input_images=num_input_images) + model.init_weights() + return model + + +class ConvBlock(nn.Layer): + """Layer to perform a convolution followed by ELU + """ + def __init__(self, in_channels, out_channels): + super(ConvBlock, self).__init__() + + self.conv = Conv3x3(in_channels, out_channels) + self.nonlin = nn.ELU() + + def forward(self, x): + out = self.conv(x) + out = self.nonlin(out) + return out + + +class Conv3x3(nn.Layer): + """Layer to pad and convolve input + """ + def __init__(self, in_channels, out_channels, use_refl=True): + super(Conv3x3, self).__init__() + + if use_refl: + self.pad = nn.Pad2D(1, mode='reflect') + else: + self.pad = nn.Pad2D(1) + self.conv = nn.Conv2D(int(in_channels), int(out_channels), 3) + + def forward(self, x): + out = self.pad(x) + out = self.conv(out) + return out + + +class BackprojectDepth(nn.Layer): + """Layer to transform a depth image into a point cloud + """ + def __init__(self, batch_size, height, width): + super(BackprojectDepth, self).__init__() + + self.batch_size = batch_size + self.height = height + self.width = width + + meshgrid = np.meshgrid(range(self.width), + range(self.height), + indexing='xy') + id_coords = np.stack(meshgrid, axis=0).astype(np.float32) + self.id_coords = self.create_parameter(shape=list(id_coords.shape), + dtype=paddle.float32) + self.id_coords.set_value(id_coords) + self.add_parameter("id_coords", self.id_coords) + self.id_coords.stop_gradient = True + + self.ones = self.create_parameter( + shape=[self.batch_size, 1, self.height * self.width], + default_initializer=ones_) + self.add_parameter("ones", self.ones) + self.ones.stop_gradient = True + + pix_coords = paddle.unsqueeze( + paddle.stack([ + self.id_coords[0].reshape([ + -1, 
+ ]), self.id_coords[1].reshape([ + -1, + ]) + ], 0), 0) + pix_coords = pix_coords.tile([batch_size, 1, 1]) + pix_coords = paddle.concat([pix_coords, self.ones], 1) + self.pix_coords = self.create_parameter(shape=list(pix_coords.shape), ) + self.pix_coords.set_value(pix_coords) + self.add_parameter("pix_coords", self.pix_coords) + self.pix_coords.stop_gradient = True + + def forward(self, depth, inv_K): + cam_points = paddle.matmul(inv_K[:, :3, :3], self.pix_coords) + cam_points = depth.reshape([self.batch_size, 1, -1]) * cam_points + cam_points = paddle.concat([cam_points, self.ones], 1) + + return cam_points + + +class Project3D(nn.Layer): + """Layer which projects 3D points into a camera with intrinsics K and at position T + """ + def __init__(self, batch_size, height, width, eps=1e-7): + super(Project3D, self).__init__() + + self.batch_size = batch_size + self.height = height + self.width = width + self.eps = eps + + def forward(self, points, K, T): + P = paddle.matmul(K, T)[:, :3, :] + + cam_points = paddle.matmul(P, points) + + pix_coords = cam_points[:, :2, :] / (cam_points[:, 2, :].unsqueeze(1) + + self.eps) + pix_coords = pix_coords.reshape( + [self.batch_size, 2, self.height, self.width]) + pix_coords = pix_coords.transpose([0, 2, 3, 1]) + pix_coords[..., 0] /= self.width - 1 + pix_coords[..., 1] /= self.height - 1 + pix_coords = (pix_coords - 0.5) * 2 + return pix_coords + + +class SSIM(nn.Layer): + """Layer to compute the SSIM loss between a pair of images + """ + def __init__(self): + super(SSIM, self).__init__() + self.mu_x_pool = nn.AvgPool2D(3, 1, exclusive=False) + self.mu_y_pool = nn.AvgPool2D(3, 1, exclusive=False) + self.sig_x_pool = nn.AvgPool2D(3, 1, exclusive=False) + self.sig_y_pool = nn.AvgPool2D(3, 1, exclusive=False) + self.sig_xy_pool = nn.AvgPool2D(3, 1, exclusive=False) + + self.refl = nn.Pad2D(1, mode='reflect') + + self.C1 = 0.01**2 + self.C2 = 0.03**2 + + def forward(self, x, y): + x = self.refl(x) + y = self.refl(y) + + mu_x = self.mu_x_pool(x) + mu_y = self.mu_y_pool(y) + + sigma_x = self.sig_x_pool(x**2) - mu_x**2 + sigma_y = self.sig_y_pool(y**2) - mu_y**2 + sigma_xy = self.sig_xy_pool(x * y) - mu_x * mu_y + + SSIM_n = (2 * mu_x * mu_y + self.C1) * (2 * sigma_xy + self.C2) + SSIM_d = (mu_x**2 + mu_y**2 + self.C1) * (sigma_x + sigma_y + self.C2) + + return paddle.clip((1 - SSIM_n / SSIM_d) / 2, 0, 1) + + +class ResNetMultiImageInput(ResNet): + """Constructs a resnet model with varying number of input images. 
+ Adapted from https://github.com/pypaddle/vision/blob/master/paddlevision/models/resnet.py + """ + def __init__(self, block, depth, layers, num_input_images=1): + super(ResNetMultiImageInput, self).__init__(block, depth) + self.inplanes = 64 + self.conv1 = nn.Conv2D(num_input_images * 3, + 64, + kernel_size=7, + stride=2, + padding=3, + bias_attr=False) + self.bn1 = nn.BatchNorm2D(64) + self.relu = nn.ReLU() + self.maxpool = nn.MaxPool2D(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer(block, 64, layers[0]) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2) + self.layer3 = self._make_layer(block, 256, layers[2], stride=2) + self.layer4 = self._make_layer(block, 512, layers[3], stride=2) + + def init_weights(self): + for layer in self.sublayers(include_self=True): + if isinstance(layer, nn.Conv2D): + kaiming_normal_(layer.weight, + mode='fan_out', + nonlinearity='relu') + elif isinstance(layer, nn.BatchNorm2D): + ones_(layer.weight) + zeros_(layer.bias) + + +class ConvBNLayer(nn.Layer): + """Conv2D and BatchNorm2D layer. + + Args: + in_channels (int): Number of channels for the input. + out_channels (int): Number of channels for the output. + kernel_size (int): Kernel size. + stride (int): Stride in the Conv2D layer. Default: 1. + groups (int): Groups in the Conv2D, Default: 1. + act (str): Indicate activation after BatchNorm2D layer. + name (str): the name of an instance of ConvBNLayer. + + Note: weight and bias initialization include initialize values + and name the restored parameters, values initialization + are explicit declared in the ```init_weights``` method. + + """ + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + groups=1, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + self._conv = Conv2D(in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + bias_attr=False) + + self._act = act + + self._batch_norm = BatchNorm2D(out_channels) + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + if self._act: + y = getattr(paddle.nn.functional, self._act)(y) + return y + + +class BasicBlock(nn.Layer): + expansion = 1 + + def __init__(self, + inplanes, + planes, + stride=1, + downsample=None, + groups=1, + base_width=64, + dilation=1, + norm_layer=None): + super(BasicBlock, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2D + if groups != 1 or base_width != 64: + raise ValueError( + 'BasicBlock only supports groups=1 and base_width=64') + if dilation > 1: + raise NotImplementedError( + "Dilation > 1 not supported in BasicBlock") + # Both self.conv1 and self.downsample layers downsample the input when stride != 1 + self.conv1 = conv3x3(inplanes, planes, stride) + self.bn1 = norm_layer(planes) + self.relu = nn.ReLU() + self.conv2 = conv3x3(planes, planes) + self.bn2 = norm_layer(planes) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + identity = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu(out) + + return out + + +class Bottleneck(nn.Layer): + # Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2) + # while original implementation places the stride at the first 1x1 convolution(self.conv1) + # according to "Deep residual 
learning for image recognition"https://arxiv.org/abs/1512.03385. + # This variant is also known as ResNet V1.5 and improves accuracy according to + # https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch. + + expansion = 4 + + def __init__(self, + inplanes, + planes, + stride=1, + downsample=None, + groups=1, + base_width=64, + dilation=1, + norm_layer=None): + super(Bottleneck, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2D + width = int(planes * (base_width / 64.)) * groups + + self.conv1 = conv1x1(inplanes, width) + self.bn1 = norm_layer(width) + self.conv2 = conv3x3(width, width, stride, groups, dilation) + self.bn2 = norm_layer(width) + self.conv3 = conv1x1(width, planes * self.expansion) + self.bn3 = norm_layer(planes * self.expansion) + self.relu = nn.ReLU() + self.downsample = downsample + self.stride = stride + + def forward(self, x): + identity = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu(out) + + return out + + +class DepthDecoder(nn.Layer): + def __init__(self, + num_ch_enc, + scales=range(4), + num_output_channels=1, + use_skips=True): + super(DepthDecoder, self).__init__() + + self.num_output_channels = num_output_channels + self.use_skips = use_skips + self.upsample_mode = 'nearest' + self.scales = scales + + self.num_ch_enc = num_ch_enc + self.num_ch_dec = np.array([16, 32, 64, 128, 256]) + + # decoder + self.convs = OrderedDict() + for i in range(4, -1, -1): + # upconv_0 + num_ch_in = self.num_ch_enc[-1] if i == 4 else self.num_ch_dec[i + + 1] + num_ch_out = self.num_ch_dec[i] + self.convs[("upconv", i, 0)] = ConvBlock(num_ch_in, num_ch_out) + + # upconv_1 + num_ch_in = self.num_ch_dec[i] + if self.use_skips and i > 0: + num_ch_in += self.num_ch_enc[i - 1] + num_ch_out = self.num_ch_dec[i] + self.convs[("upconv", i, 1)] = ConvBlock(num_ch_in, num_ch_out) + + for s in self.scales: + self.convs[("dispconv", s)] = Conv3x3(self.num_ch_dec[s], + self.num_output_channels) + + self.decoder = nn.LayerList(list(self.convs.values())) + self.sigmoid = nn.Sigmoid() + + def forward(self, input_features): + outputs = {} + + # decoder + x = input_features[-1] + for i in range(4, -1, -1): + x = self.convs[("upconv", i, 0)](x) + x = [upsample(x)] + if self.use_skips and i > 0: + x += [input_features[i - 1]] + x = paddle.concat(x, 1) + x = self.convs[("upconv", i, 1)](x) + if i in self.scales: + outputs[("disp", i)] = self.sigmoid(self.convs[("dispconv", + i)](x)) + return outputs + + +class PoseDecoder(nn.Layer): + def __init__(self, + num_ch_enc, + num_input_features, + num_frames_to_predict_for=None, + stride=1): + super(PoseDecoder, self).__init__() + + self.num_ch_enc = num_ch_enc + self.num_input_features = num_input_features + + if num_frames_to_predict_for is None: + num_frames_to_predict_for = num_input_features - 1 + self.num_frames_to_predict_for = num_frames_to_predict_for + + self.convs = OrderedDict() + self.convs[("squeeze")] = nn.Conv2D(self.num_ch_enc[-1], 256, 1) + self.convs[("pose", 0)] = nn.Conv2D(num_input_features * 256, 256, 3, + stride, 1) + self.convs[("pose", 1)] = nn.Conv2D(256, 256, 3, stride, 1) + self.convs[("pose", 2)] = nn.Conv2D(256, 6 * num_frames_to_predict_for, + 1) + + self.relu = nn.ReLU() + + self.net = nn.LayerList(list(self.convs.values())) + + def 
forward(self, input_features): + last_features = [f[-1] for f in input_features] + + cat_features = [ + self.relu(self.convs["squeeze"](f)) for f in last_features + ] + cat_features = paddle.concat(cat_features, 1) + + out = cat_features + for i in range(3): + out = self.convs[("pose", i)](out) + if i != 2: + out = self.relu(out) + + out = out.mean(3).mean(2) + + out = 0.01 * out.reshape([-1, self.num_frames_to_predict_for, 1, 6]) + + axisangle = out[..., :3] + translation = out[..., 3:] + + return axisangle, translation + + +class ResnetEncoder(nn.Layer): + """Pypaddle module for a resnet encoder + """ + def __init__(self, num_layers, pretrained=False, num_input_images=1): + super(ResnetEncoder, self).__init__() + + self.num_ch_enc = np.array([64, 64, 128, 256, 512]) + + resnets = { + 18: paddle.vision.models.resnet18, + 34: paddle.vision.models.resnet34, + 50: paddle.vision.models.resnet50, + 101: paddle.vision.models.resnet101, + 152: paddle.vision.models.resnet152 + } + + if num_layers not in resnets: + raise ValueError( + "{} is not a valid number of resnet layers".format(num_layers)) + + if num_input_images > 1: + self.encoder = resnet_multiimage_input(num_layers, pretrained, + num_input_images) + else: + self.encoder = resnets[num_layers](pretrained) + + if num_layers > 34: + self.num_ch_enc[1:] *= 4 + + ###################################### + # night public first conv + ###################################### + self.conv1 = nn.Conv2D(3, + 64, + kernel_size=7, + stride=2, + padding=3, + bias_attr=False) + self.bn1 = nn.BatchNorm2D(64) + self.relu = nn.ReLU() # NOTE + + self.conv_shared = nn.Conv2D(512, 64, kernel_size=1) + + ########################################## + # private source encoder, day + ########################################## + self.encoder_day = resnets[num_layers](pretrained) + self.conv_diff_day = nn.Conv2D( + 512, 64, kernel_size=1) # no bn after conv, so bias=true + + ########################################## + # private target encoder, night + ########################################## + self.encoder_night = resnets[num_layers](pretrained) + self.conv_diff_night = nn.Conv2D(512, 64, kernel_size=1) + + ###################################### + # shared decoder (small decoder), use a simple de-conv to upsample the features with no skip connection + ###################################### + self.convt5 = convt_bn_relu(in_channels=512, + out_channels=256, + kernel_size=3, + stride=2, + padding=1, + output_padding=1) + self.convt4 = convt_bn_relu(in_channels=256, + out_channels=128, + kernel_size=3, + stride=2, + padding=1, + output_padding=1) + self.convt3 = convt_bn_relu(in_channels=128, + out_channels=64, + kernel_size=3, + stride=2, + padding=1, + output_padding=1) + self.convt2 = convt_bn_relu(in_channels=64, + out_channels=64, + kernel_size=3, + stride=2, + padding=1, + output_padding=1) + self.convt1 = convt_bn_relu(in_channels=64, + out_channels=64, + kernel_size=3, + stride=2, + padding=1, + output_padding=1) + self.convtf = nn.Conv2D(64, 3, kernel_size=1, stride=1, padding=0) + + def forward(self, input_image, is_night): + if self.training: + result = [] + input_data = (input_image - 0.45) / 0.225 + if is_night == 'day': + # source private encoder, day + private_feature = self.encoder_day.conv1(input_data) + private_feature = self.encoder_day.bn1(private_feature) + private_feature = self.encoder_day.relu(private_feature) + private_feature = self.encoder_day.maxpool(private_feature) + private_feature = self.encoder_day.layer1(private_feature) + 
private_feature = self.encoder_day.layer2(private_feature) + private_feature = self.encoder_day.layer3(private_feature) + private_feature = self.encoder_day.layer4(private_feature) + private_code = self.conv_diff_day(private_feature) + private_gram = gram_matrix(private_feature) + result.append(private_code) + result.append(private_gram) + + elif is_night == 'night': + # target private encoder, night + private_feature = self.encoder_night.conv1(input_data) + private_feature = self.encoder_night.bn1(private_feature) + private_feature = self.encoder_night.relu(private_feature) + private_feature = self.encoder_night.maxpool(private_feature) + private_feature = self.encoder_night.layer1(private_feature) + private_feature = self.encoder_night.layer2(private_feature) + private_feature = self.encoder_night.layer3(private_feature) + private_feature = self.encoder_night.layer4(private_feature) + private_code = self.conv_diff_night(private_feature) + + private_gram = gram_matrix(private_feature) + result.append(private_code) + result.append(private_gram) + + # shared encoder + self.features = [] + x = (input_image - 0.45) / 0.225 + if is_night == 'day': + x = self.encoder.conv1(x) + x = self.encoder.bn1(x) + self.features.append(self.encoder.relu(x)) + else: + x = self.conv1(x) + x = self.bn1(x) + self.features.append(self.relu(x)) + + self.features.append( + self.encoder.layer1(self.encoder.maxpool(self.features[-1]))) + self.features.append(self.encoder.layer2(self.features[-1])) + self.features.append(self.encoder.layer3(self.features[-1])) + self.features.append(self.encoder.layer4(self.features[-1])) + + if self.training: + shared_code = self.conv_shared(self.features[-1]) + shared_gram = gram_matrix(self.features[-1]) + result.append(shared_code) # use this to calculate loss of diff + result.append(shared_gram) + result.append( + self.features[-1]) # use this to calculate loss of similarity + + union_code = private_feature + self.features[-1] + rec_code = self.convt5(union_code) + rec_code = self.convt4(rec_code) + rec_code = self.convt3(rec_code) + rec_code = self.convt2(rec_code) + rec_code = self.convt1(rec_code) + rec_code = self.convtf(rec_code) + result.append(rec_code) + + return self.features, result + else: + return self.features + + +class ResnetEncoder_pose(nn.Layer): + """Pypaddle module for a resnet encoder + """ + def __init__(self, num_layers, pretrained=False, num_input_images=1): + super(ResnetEncoder_pose, self).__init__() + + self.num_ch_enc = np.array([64, 64, 128, 256, 512]) + resnets = { + 18: paddle.vision.models.resnet18, + 34: paddle.vision.models.resnet34, + 50: paddle.vision.models.resnet50, + 101: paddle.vision.models.resnet101, + 152: paddle.vision.models.resnet152 + } + + if num_layers not in resnets: + raise ValueError( + "{} is not a valid number of resnet layers".format(num_layers)) + + if num_input_images > 1: + self.encoder = resnet_multiimage_input(num_layers, num_input_images) + else: + self.encoder = resnets[num_layers](pretrained) + + if num_layers > 34: + self.num_ch_enc[1:] *= 4 + + def forward(self, input_image): + features = [] + x = (input_image - 0.45) / 0.225 + x = self.encoder.conv1(x) + x = self.encoder.bn1(x) + features.append(self.encoder.relu(x)) + features.append(self.encoder.layer1(self.encoder.maxpool(features[-1]))) + features.append(self.encoder.layer2(features[-1])) + features.append(self.encoder.layer3(features[-1])) + features.append(self.encoder.layer4(features[-1])) + + return features + + +@BACKBONES.register() +class 
ADDS_DepthNet(nn.Layer): + def __init__(self, + num_layers=18, + frame_ids=[0, -1, 1], + height=256, + width=512, + batch_size=6, + pose_model_input="pairs", + use_stereo=False, + only_depth_encoder=False, + pretrained=None, + scales=[0, 1, 2, 3], + min_depth=0.1, + max_depth=100.0, + pose_model_type='separate_resnet', + v1_multiscale=False, + predictive_mask=False, + disable_automasking=False): + super(ADDS_DepthNet, self).__init__() + self.num_layers = num_layers + self.height = height + self.width = width + self.batch_size = batch_size + self.frame_ids = frame_ids + self.pose_model_input = pose_model_input + self.use_stereo = use_stereo + self.only_depth_encoder = only_depth_encoder + self.pretrained = pretrained + self.scales = scales + self.pose_model_type = pose_model_type + self.predictive_mask = predictive_mask + self.disable_automasking = disable_automasking + self.v1_multiscale = v1_multiscale + self.min_depth = min_depth + self.max_depth = max_depth + + self.num_input_frames = len(self.frame_ids) + self.num_pose_frames = 2 if self.pose_model_input == "pairs" else self.num_input_frames + + assert self.frame_ids[0] == 0, "frame_ids must start with 0" + + self.use_pose_net = not (self.use_stereo and self.frame_ids == [0]) + + self.encoder = ResnetEncoder(self.num_layers) + if not self.only_depth_encoder: + self.depth = DepthDecoder(self.encoder.num_ch_enc, self.scales) + if self.use_pose_net and not self.only_depth_encoder: + if self.pose_model_type == "separate_resnet": + self.pose_encoder = ResnetEncoder_pose( + self.num_layers, num_input_images=self.num_pose_frames) + self.pose = PoseDecoder(self.pose_encoder.num_ch_enc, + num_input_features=1, + num_frames_to_predict_for=2) + + self.backproject_depth = {} + self.project_3d = {} + for scale in self.scales: + h = self.height // (2**scale) + w = self.width // (2**scale) + + self.backproject_depth[scale] = BackprojectDepth( + self.batch_size, h, w) + self.project_3d[scale] = Project3D(batch_size, h, w) + + def init_weights(self): + """First init model's weight""" + for m in self.sublayers(include_self=True): + if isinstance(m, nn.Conv2D): + kaiming_normal_(m.weight, a=math.sqrt(5)) + if m.bias is not None: + fan_in, _ = _calculate_fan_in_and_fan_out(m.weight) + bound = 1 / math.sqrt(fan_in) + uniform_ = paddle.nn.initializer.Uniform(-bound, bound) + uniform_(m.bias) + """Second, if provide pretrained ckpt, load it""" + if self.pretrained: # load pretrained weights + load_ckpt(self, self.pretrained) + + def forward(self, inputs, day_or_night='day'): + if self.training: + features, result = self.encoder(inputs["color_aug", 0, 0], 'day') + features_night, result_night = self.encoder( + inputs[("color_n_aug", 0, 0)], 'night') + + outputs = self.depth(features) + outputs_night = self.depth(features_night) + if self.use_pose_net and not self.only_depth_encoder: + outputs.update(self.predict_poses(inputs, 'day')) + outputs_night.update(self.predict_poses(inputs, 'night')) + + self.generate_images_pred(inputs, outputs, 'day') + self.generate_images_pred(inputs, outputs_night, 'night') + + outputs['frame_ids'] = self.frame_ids + outputs['scales'] = self.scales + outputs['result'] = result + outputs['result_night'] = result_night + outputs_night['frame_ids'] = self.frame_ids + outputs_night['scales'] = self.scales + outputs['outputs_night'] = outputs_night + else: + if isinstance(inputs, dict): + input_color = inputs[("color", 0, 0)] + features = self.encoder(input_color, day_or_night[0]) + outputs = self.depth(features) + + pred_disp, _ = 
disp_to_depth(outputs[("disp", 0)], + self.min_depth, self.max_depth) + + pred_disp = pred_disp[:, 0].numpy() + + outputs['pred_disp'] = np.squeeze(pred_disp) + + outputs['gt'] = np.squeeze(inputs['depth_gt'].numpy()) + else: + input_color = inputs + features = self.encoder(input_color, day_or_night) + outputs = self.depth(features) + + pred_disp, _ = disp_to_depth(outputs[("disp", 0)], + self.min_depth, self.max_depth) + + pred_disp = pred_disp[:, 0] + outputs = paddle.squeeze(pred_disp) + return outputs + + def predict_poses(self, inputs, is_night): + """Predict poses between input frames for monocular sequences. + """ + outputs = {} + if self.num_pose_frames == 2: + if is_night: + pose_feats = { + f_i: inputs["color_n_aug", f_i, 0] + for f_i in self.frame_ids + } + else: + pose_feats = { + f_i: inputs["color_aug", f_i, 0] + for f_i in self.frame_ids + } + + for f_i in self.frame_ids[1:]: + if f_i != "s": + if f_i < 0: + pose_inputs = [pose_feats[f_i], pose_feats[0]] + else: + pose_inputs = [pose_feats[0], pose_feats[f_i]] + + if self.pose_model_type == "separate_resnet": + pose_inputs = [ + self.pose_encoder(paddle.concat(pose_inputs, + axis=1)) + ] + + axisangle, translation = self.pose(pose_inputs) + outputs[("axisangle", 0, f_i)] = axisangle + outputs[("translation", 0, f_i)] = translation + + # Invert the matrix if the frame id is negative + outputs[("cam_T_cam", 0, + f_i)] = transformation_from_parameters( + axisangle[:, 0], + translation[:, 0], + invert=(f_i < 0)) + return outputs + + def generate_images_pred(self, inputs, outputs, is_night): + """Generate the warped (reprojected) color images for a minibatch. + Generated images are saved into the `outputs` dictionary. + """ + _, _, height, width = inputs['color', 0, 0].shape + for scale in self.scales: + disp = outputs[("disp", scale)] + if self.v1_multiscale: + source_scale = scale + else: + disp = F.interpolate(disp, [height, width], + mode="bilinear", + align_corners=False) + source_scale = 0 + + _, depth = disp_to_depth(disp, self.min_depth, self.max_depth) + + outputs[("depth", 0, scale)] = depth + for i, frame_id in enumerate(self.frame_ids[1:]): + + T = outputs[("cam_T_cam", 0, frame_id)] + + cam_points = self.backproject_depth[source_scale]( + depth, inputs[("inv_K", source_scale)]) + pix_coords = self.project_3d[source_scale]( + cam_points, inputs[("K", source_scale)], T) + + outputs[("sample", frame_id, scale)] = pix_coords + + if is_night: + inputs[("color_n", frame_id, + source_scale)].stop_gradient = False + outputs[("color", frame_id, + scale)] = paddle.nn.functional.grid_sample( + inputs[("color_n", frame_id, source_scale)], + outputs[("sample", frame_id, scale)], + padding_mode="border", + align_corners=False) + + else: + inputs[("color", frame_id, + source_scale)].stop_gradient = False + outputs[("color", frame_id, + scale)] = paddle.nn.functional.grid_sample( + inputs[("color", frame_id, source_scale)], + outputs[("sample", frame_id, scale)], + padding_mode="border", + align_corners=False) + + if not self.disable_automasking: + if is_night: + outputs[("color_identity", frame_id, scale)] = \ + inputs[("color_n", frame_id, source_scale)] + else: + outputs[("color_identity", frame_id, scale)] = \ + inputs[("color", frame_id, source_scale)] diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/agcn.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/agcn.py new file mode 100644 index 0000000..9f870c6 --- /dev/null +++ 
b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/agcn.py @@ -0,0 +1,128 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from ..registry import BACKBONES + + +class GCN(nn.Layer): + def __init__(self, in_channels, out_channels, vertex_nums=25, stride=1): + super(GCN, self).__init__() + self.conv1 = nn.Conv2D(in_channels=in_channels, + out_channels=3 * out_channels, + kernel_size=1, + stride=1) + self.conv2 = nn.Conv2D(in_channels=vertex_nums * 3, + out_channels=vertex_nums, + kernel_size=1) + + def forward(self, x): + # x --- N,C,T,V + x = self.conv1(x) # N,3C,T,V + N, C, T, V = x.shape + x = paddle.reshape(x, [N, C // 3, 3, T, V]) # N,C,3,T,V + x = paddle.transpose(x, perm=[0, 1, 2, 4, 3]) # N,C,3,V,T + x = paddle.reshape(x, [N, C // 3, 3 * V, T]) # N,C,3V,T + x = paddle.transpose(x, perm=[0, 2, 1, 3]) # N,3V,C,T + x = self.conv2(x) # N,V,C,T + x = paddle.transpose(x, perm=[0, 2, 3, 1]) # N,C,T,V + return x + + +class Block(paddle.nn.Layer): + def __init__(self, + in_channels, + out_channels, + vertex_nums=25, + temporal_size=9, + stride=1, + residual=True): + super(Block, self).__init__() + self.residual = residual + self.out_channels = out_channels + + self.bn_res = nn.BatchNorm2D(out_channels) + self.conv_res = nn.Conv2D(in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + stride=(stride, 1)) + self.gcn = GCN(in_channels=in_channels, + out_channels=out_channels, + vertex_nums=vertex_nums) + self.tcn = nn.Sequential( + nn.BatchNorm2D(out_channels), + nn.ReLU(), + nn.Conv2D(in_channels=out_channels, + out_channels=out_channels, + kernel_size=(temporal_size, 1), + padding=((temporal_size - 1) // 2, 0), + stride=(stride, 1)), + nn.BatchNorm2D(out_channels), + ) + + def forward(self, x): + if self.residual: + y = self.conv_res(x) + y = self.bn_res(y) + x = self.gcn(x) + x = self.tcn(x) + out = x + y if self.residual else x + out = F.relu(out) + return out + + +@BACKBONES.register() +class AGCN(nn.Layer): + """ + AGCN model improves the performance of ST-GCN using + Adaptive Graph Convolutional Networks. + Args: + in_channels: int, channels of vertex coordinate. 2 for (x,y), 3 for (x,y,z). Default 2. 
+ """ + def __init__(self, in_channels=2, **kwargs): + super(AGCN, self).__init__() + + self.data_bn = nn.BatchNorm1D(25 * 2) + self.agcn = nn.Sequential( + Block(in_channels=in_channels, + out_channels=64, + residual=False, + **kwargs), Block(in_channels=64, out_channels=64, **kwargs), + Block(in_channels=64, out_channels=64, **kwargs), + Block(in_channels=64, out_channels=64, **kwargs), + Block(in_channels=64, out_channels=128, stride=2, **kwargs), + Block(in_channels=128, out_channels=128, **kwargs), + Block(in_channels=128, out_channels=128, **kwargs), + Block(in_channels=128, out_channels=256, stride=2, **kwargs), + Block(in_channels=256, out_channels=256, **kwargs), + Block(in_channels=256, out_channels=256, **kwargs)) + + self.pool = nn.AdaptiveAvgPool2D(output_size=(1, 1)) + + def forward(self, x): + # data normalization + N, C, T, V, M = x.shape + + x = x.transpose((0, 4, 1, 2, 3)) # N, M, C, T, V + x = x.reshape((N * M, C, T, V)) + + x = self.agcn(x) + + x = self.pool(x) # NM,C,T,V --> NM,C,1,1 + C = x.shape[1] + x = paddle.reshape(x, (N, M, C, 1, 1)).mean(axis=1) # N,C,1,1 + + return x diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/agcn2s.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/agcn2s.py new file mode 100644 index 0000000..a630c68 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/agcn2s.py @@ -0,0 +1,229 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
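+# Illustrative usage sketch for the AGCN backbone defined in agcn.py above (the
+# import path and sample sizes are assumptions): the expected input is a skeleton
+# tensor of shape [N, C, T, V, M] with C=2 coordinate channels, V=25 joints and
+# M persons; persons are folded into the batch, the stacked GCN/TCN blocks raise
+# the channel width to 256, and the pooled, person-averaged output is [N, 256, 1, 1].
+#
+#     import paddle
+#     from paddlevideo.modeling.backbones.agcn import AGCN  # assumed module path
+#
+#     model = AGCN(in_channels=2)
+#     x = paddle.randn([4, 2, 50, 25, 2])  # 4 clips, 50 frames, 25 joints, 2 persons
+#     feat = model(x)
+#     print(feat.shape)                    # [4, 256, 1, 1]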
+ +import paddle +import paddle.nn as nn +import numpy as np +from ..registry import BACKBONES + + +def import_class(name): + components = name.split('.') + mod = __import__(components[0]) + for comp in components[1:]: + mod = getattr(mod, comp) + return mod + + +class UnitTCN(nn.Layer): + def __init__(self, in_channels, out_channels, kernel_size=9, stride=1): + super(UnitTCN, self).__init__() + pad = int((kernel_size - 1) / 2) + self.conv = nn.Conv2D(in_channels, + out_channels, + kernel_size=(kernel_size, 1), + padding=(pad, 0), + stride=(stride, 1)) + + self.bn = nn.BatchNorm2D(out_channels) + self.relu = nn.ReLU() + + def forward(self, x): + " input size : (N*M, C, T, V)" + x = self.bn(self.conv(x)) + return x + + +class UnitGCN(nn.Layer): + def __init__(self, + in_channels, + out_channels, + A, + coff_embedding=4, + num_subset=3): + super(UnitGCN, self).__init__() + inter_channels = out_channels // coff_embedding + self.inter_c = inter_channels + PA = self.create_parameter(shape=A.shape, dtype='float32') + self.PA = PA + self.A = paddle.to_tensor(A.astype(np.float32)) + self.num_subset = num_subset + + self.conv_a = nn.LayerList() + self.conv_b = nn.LayerList() + self.conv_d = nn.LayerList() + for i in range(self.num_subset): + self.conv_a.append(nn.Conv2D(in_channels, inter_channels, 1)) + self.conv_b.append(nn.Conv2D(in_channels, inter_channels, 1)) + self.conv_d.append(nn.Conv2D(in_channels, out_channels, 1)) + + if in_channels != out_channels: + self.down = nn.Sequential(nn.Conv2D(in_channels, out_channels, 1), + nn.BatchNorm2D(out_channels)) + else: + self.down = lambda x: x + + self.bn = nn.BatchNorm2D(out_channels) + self.soft = nn.Softmax(-2) + self.relu = nn.ReLU() + + def forward(self, x): + N, C, T, V = x.shape + A = self.A + self.PA + + y = None + for i in range(self.num_subset): + A1 = paddle.transpose(self.conv_a[i](x), + perm=[0, 3, 1, + 2]).reshape([N, V, self.inter_c * T]) + A2 = self.conv_b[i](x).reshape([N, self.inter_c * T, V]) + A1 = self.soft(paddle.matmul(A1, A2) / A1.shape[-1]) + A1 = A1 + A[i] + A2 = x.reshape([N, C * T, V]) + z = self.conv_d[i](paddle.matmul(A2, A1).reshape([N, C, T, V])) + y = z + y if y is not None else z + + y = self.bn(y) + y += self.down(x) + return self.relu(y) + + +class Block(nn.Layer): + def __init__(self, in_channels, out_channels, A, stride=1, residual=True): + super(Block, self).__init__() + self.gcn1 = UnitGCN(in_channels, out_channels, A) + self.tcn1 = UnitTCN(out_channels, out_channels, stride=stride) + self.relu = nn.ReLU() + if not residual: + self.residual = lambda x: 0 + + elif (in_channels == out_channels) and (stride == 1): + self.residual = lambda x: x + + else: + self.residual = UnitTCN(in_channels, + out_channels, + kernel_size=1, + stride=stride) + + def forward(self, x): + x = self.tcn1(self.gcn1(x)) + self.residual(x) + return self.relu(x) + + +# This Graph structure is for the NTURGB+D dataset. If you use a custom dataset, modify num_node and the corresponding graph adjacency structure. 
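+# In 'spatial' labeling mode the adjacency stacks three 25x25 matrices: the
+# identity (self links), the degree-normalized inward adjacency and the
+# degree-normalized outward adjacency, giving Graph().A the shape (3, 25, 25);
+# UnitGCN adds this fixed A to its learnable PA, one subset per matrix.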
+class Graph: + def __init__(self, labeling_mode='spatial'): + num_node = 25 + self_link = [(i, i) for i in range(num_node)] + inward_ori_index = [(1, 2), (2, 21), (3, 21), (4, 3), (5, 21), (6, 5), + (7, 6), (8, 7), (9, 21), (10, 9), (11, 10), + (12, 11), (13, 1), (14, 13), (15, 14), (16, 15), + (17, 1), (18, 17), (19, 18), (20, 19), (22, 23), + (23, 8), (24, 25), (25, 12)] + inward = [(i - 1, j - 1) for (i, j) in inward_ori_index] + outward = [(j, i) for (i, j) in inward] + neighbor = inward + outward + + self.num_node = num_node + self.self_link = self_link + self.inward = inward + self.outward = outward + self.neighbor = neighbor + self.A = self.get_adjacency_matrix(labeling_mode) + + def edge2mat(self, link, num_node): + A = np.zeros((num_node, num_node)) + for i, j in link: + A[j, i] = 1 + return A + + def normalize_digraph(self, A): + Dl = np.sum(A, 0) + h, w = A.shape + Dn = np.zeros((w, w)) + for i in range(w): + if Dl[i] > 0: + Dn[i, i] = Dl[i]**(-1) + AD = np.dot(A, Dn) + return AD + + def get_spatial_graph(self, num_node, self_link, inward, outward): + I = self.edge2mat(self_link, num_node) + In = self.normalize_digraph(self.edge2mat(inward, num_node)) + Out = self.normalize_digraph(self.edge2mat(outward, num_node)) + A = np.stack((I, In, Out)) + return A + + def get_adjacency_matrix(self, labeling_mode=None): + if labeling_mode is None: + return self.A + if labeling_mode == 'spatial': + A = self.get_spatial_graph(self.num_node, self.self_link, + self.inward, self.outward) + else: + raise ValueError() + return A + + +@BACKBONES.register() +class AGCN2s(nn.Layer): + def __init__(self, + num_point=25, + num_person=2, + graph='ntu_rgb_d', + graph_args=dict(), + in_channels=3): + super(AGCN2s, self).__init__() + + if graph == 'ntu_rgb_d': + self.graph = Graph(**graph_args) + else: + raise ValueError() + + A = self.graph.A + self.data_bn = nn.BatchNorm1D(num_person * in_channels * num_point) + + self.l1 = Block(in_channels, 64, A, residual=False) + self.l2 = Block(64, 64, A) + self.l3 = Block(64, 64, A) + self.l4 = Block(64, 64, A) + self.l5 = Block(64, 128, A, stride=2) + self.l6 = Block(128, 128, A) + self.l7 = Block(128, 128, A) + self.l8 = Block(128, 256, A, stride=2) + self.l9 = Block(256, 256, A) + self.l10 = Block(256, 256, A) + + def forward(self, x): + N, C, T, V, M = x.shape + + x = x.transpose([0, 4, 3, 1, 2]).reshape_([N, M * V * C, T]) + x = self.data_bn(x) + x = x.reshape_([N, M, V, C, + T]).transpose([0, 1, 3, 4, + 2]).reshape_([N * M, C, T, V]) + + x = self.l1(x) + x = self.l2(x) + x = self.l3(x) + x = self.l4(x) + x = self.l5(x) + x = self.l6(x) + x = self.l7(x) + x = self.l8(x) + x = self.l9(x) + x = self.l10(x) + + return x diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/asrf.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/asrf.py new file mode 100644 index 0000000..37437b3 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/asrf.py @@ -0,0 +1,75 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# https://github.com/yabufarha/ms-tcn/blob/master/model.py +# https://github.com/yiskw713/asrf/libs/models/tcn.py + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import numpy as np +import copy +import random +import math + +from paddle import ParamAttr +from ..registry import BACKBONES +from ..weight_init import weight_init_ +from .ms_tcn import DilatedResidualLayer +from ..framework.segmenters.utils import init_bias, KaimingUniform_like_torch + + +@BACKBONES.register() +class ASRF(nn.Layer): + + def __init__(self, in_channel, num_features, num_classes, num_stages, + num_layers): + super().__init__() + self.in_channel = in_channel + self.num_features = num_features + self.num_classes = num_classes + self.num_stages = num_stages + self.num_layers = num_layers + + # define layers + self.conv_in = nn.Conv1D(self.in_channel, self.num_features, 1) + + shared_layers = [ + DilatedResidualLayer(2**i, self.num_features, self.num_features) + for i in range(self.num_layers) + ] + self.shared_layers = nn.LayerList(shared_layers) + + self.init_weights() + + def init_weights(self): + """ + initialize model layers' weight + """ + # init weight + for layer in self.sublayers(): + if isinstance(layer, nn.Conv1D): + layer.weight.set_value( + KaimingUniform_like_torch(layer.weight).astype('float32')) + if layer.bias is not None: + layer.bias.set_value( + init_bias(layer.weight, layer.bias).astype('float32')) + + def forward(self, x): + """ ASRF forward + """ + out = self.conv_in(x) + for layer in self.shared_layers: + out = layer(out) + return out diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/bmn.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/bmn.py new file mode 100644 index 0000000..200d192 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/bmn.py @@ -0,0 +1,290 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
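+# Illustrative usage sketch for the ASRF backbone defined in asrf.py above (import
+# path and config values are assumptions): it maps frame-wise features of shape
+# [N, in_channel, T] to [N, num_features, T] via a 1x1 Conv1D followed by num_layers
+# dilated residual layers (dilations 1, 2, 4, ...); num_classes and num_stages are
+# stored for the segmentation head and do not change the backbone output shape.
+#
+#     import paddle
+#     from paddlevideo.modeling.backbones.asrf import ASRF  # assumed module path
+#
+#     backbone = ASRF(in_channel=2048, num_features=64, num_classes=19,
+#                     num_stages=4, num_layers=10)
+#     feats = backbone(paddle.randn([1, 2048, 300]))  # one 300-frame clip
+#     print(feats.shape)                              # [1, 64, 300]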
+ +import math +import numpy as np +import paddle +from paddle import ParamAttr +from ..registry import BACKBONES + + +def _get_interp1d_bin_mask(seg_xmin, seg_xmax, tscale, num_sample, + num_sample_perbin): + """ generate sample mask for a boundary-matching pair """ + plen = float(seg_xmax - seg_xmin) + plen_sample = plen / (num_sample * num_sample_perbin - 1.0) + total_samples = [ + seg_xmin + plen_sample * ii + for ii in range(num_sample * num_sample_perbin) + ] + p_mask = [] + for idx in range(num_sample): + bin_samples = total_samples[idx * num_sample_perbin:(idx + 1) * + num_sample_perbin] + bin_vector = np.zeros([tscale]) + for sample in bin_samples: + sample_upper = math.ceil(sample) + sample_decimal, sample_down = math.modf(sample) + if (tscale - 1) >= int(sample_down) >= 0: + bin_vector[int(sample_down)] += 1 - sample_decimal + if (tscale - 1) >= int(sample_upper) >= 0: + bin_vector[int(sample_upper)] += sample_decimal + bin_vector = 1.0 / num_sample_perbin * bin_vector + p_mask.append(bin_vector) + p_mask = np.stack(p_mask, axis=1) + return p_mask + + +def get_interp1d_mask(tscale, dscale, prop_boundary_ratio, num_sample, + num_sample_perbin): + """ generate sample mask for each point in Boundary-Matching Map """ + mask_mat = [] + for start_index in range(tscale): + mask_mat_vector = [] + for duration_index in range(dscale): + if start_index + duration_index < tscale: + p_xmin = start_index + p_xmax = start_index + duration_index + center_len = float(p_xmax - p_xmin) + 1 + sample_xmin = p_xmin - center_len * prop_boundary_ratio + sample_xmax = p_xmax + center_len * prop_boundary_ratio + p_mask = _get_interp1d_bin_mask(sample_xmin, sample_xmax, + tscale, num_sample, + num_sample_perbin) + else: + p_mask = np.zeros([tscale, num_sample]) + mask_mat_vector.append(p_mask) + mask_mat_vector = np.stack(mask_mat_vector, axis=2) + mask_mat.append(mask_mat_vector) + mask_mat = np.stack(mask_mat, axis=3) + mask_mat = mask_mat.astype(np.float32) + + sample_mask = np.reshape(mask_mat, [tscale, -1]) + return sample_mask + + +def init_params(name, in_channels, kernel_size): + fan_in = in_channels * kernel_size * 1 + k = 1. / math.sqrt(fan_in) + param_attr = ParamAttr(name=name, + initializer=paddle.nn.initializer.Uniform(low=-k, + high=k)) + return param_attr + + +@BACKBONES.register() +class BMN(paddle.nn.Layer): + """BMN model from + `"BMN: Boundary-Matching Network for Temporal Action Proposal Generation" `_ + Args: + tscale (int): sequence length, default 100. + dscale (int): max duration length, default 100. + prop_boundary_ratio (float): ratio of expanded temporal region in proposal boundary, default 0.5. + num_sample (int): number of samples betweent starting boundary and ending boundary of each propoasl, default 32. + num_sample_perbin (int): number of selected points in each sample, default 3. 
+ """ + + def __init__( + self, + tscale, + dscale, + prop_boundary_ratio, + num_sample, + num_sample_perbin, + feat_dim=400, + ): + super(BMN, self).__init__() + + #init config + self.feat_dim = feat_dim + self.tscale = tscale + self.dscale = dscale + self.prop_boundary_ratio = prop_boundary_ratio + self.num_sample = num_sample + self.num_sample_perbin = num_sample_perbin + + self.hidden_dim_1d = 256 + self.hidden_dim_2d = 128 + self.hidden_dim_3d = 512 + + # Base Module + self.b_conv1 = paddle.nn.Conv1D( + in_channels=self.feat_dim, + out_channels=self.hidden_dim_1d, + kernel_size=3, + padding=1, + groups=4, + weight_attr=init_params('Base_1_w', self.feat_dim, 3), + bias_attr=init_params('Base_1_b', self.feat_dim, 3)) + self.b_conv1_act = paddle.nn.ReLU() + + self.b_conv2 = paddle.nn.Conv1D( + in_channels=self.hidden_dim_1d, + out_channels=self.hidden_dim_1d, + kernel_size=3, + padding=1, + groups=4, + weight_attr=init_params('Base_2_w', self.hidden_dim_1d, 3), + bias_attr=init_params('Base_2_b', self.hidden_dim_1d, 3)) + self.b_conv2_act = paddle.nn.ReLU() + + # Temporal Evaluation Module + self.ts_conv1 = paddle.nn.Conv1D( + in_channels=self.hidden_dim_1d, + out_channels=self.hidden_dim_1d, + kernel_size=3, + padding=1, + groups=4, + weight_attr=init_params('TEM_s1_w', self.hidden_dim_1d, 3), + bias_attr=init_params('TEM_s1_b', self.hidden_dim_1d, 3)) + self.ts_conv1_act = paddle.nn.ReLU() + + self.ts_conv2 = paddle.nn.Conv1D( + in_channels=self.hidden_dim_1d, + out_channels=1, + kernel_size=1, + padding=0, + groups=1, + weight_attr=init_params('TEM_s2_w', self.hidden_dim_1d, 1), + bias_attr=init_params('TEM_s2_b', self.hidden_dim_1d, 1)) + self.ts_conv2_act = paddle.nn.Sigmoid() + + self.te_conv1 = paddle.nn.Conv1D( + in_channels=self.hidden_dim_1d, + out_channels=self.hidden_dim_1d, + kernel_size=3, + padding=1, + groups=4, + weight_attr=init_params('TEM_e1_w', self.hidden_dim_1d, 3), + bias_attr=init_params('TEM_e1_b', self.hidden_dim_1d, 3)) + self.te_conv1_act = paddle.nn.ReLU() + self.te_conv2 = paddle.nn.Conv1D( + in_channels=self.hidden_dim_1d, + out_channels=1, + kernel_size=1, + padding=0, + groups=1, + weight_attr=init_params('TEM_e2_w', self.hidden_dim_1d, 1), + bias_attr=init_params('TEM_e2_b', self.hidden_dim_1d, 1)) + self.te_conv2_act = paddle.nn.Sigmoid() + + #Proposal Evaluation Module + self.p_conv1 = paddle.nn.Conv1D( + in_channels=self.hidden_dim_1d, + out_channels=self.hidden_dim_2d, + kernel_size=3, + padding=1, + groups=1, + weight_attr=init_params('PEM_1d_w', self.hidden_dim_1d, 3), + bias_attr=init_params('PEM_1d_b', self.hidden_dim_1d, 3)) + self.p_conv1_act = paddle.nn.ReLU() + + # init to speed up + sample_mask = get_interp1d_mask(self.tscale, self.dscale, + self.prop_boundary_ratio, + self.num_sample, self.num_sample_perbin) + self.sample_mask = paddle.to_tensor(sample_mask) + self.sample_mask.stop_gradient = True + + self.p_conv3d1 = paddle.nn.Conv3D( + in_channels=128, + out_channels=self.hidden_dim_3d, + kernel_size=(self.num_sample, 1, 1), + stride=(self.num_sample, 1, 1), + padding=0, + weight_attr=ParamAttr(name="PEM_3d1_w"), + bias_attr=ParamAttr(name="PEM_3d1_b")) + self.p_conv3d1_act = paddle.nn.ReLU() + + self.p_conv2d1 = paddle.nn.Conv2D( + in_channels=512, + out_channels=self.hidden_dim_2d, + kernel_size=1, + stride=1, + padding=0, + weight_attr=ParamAttr(name="PEM_2d1_w"), + bias_attr=ParamAttr(name="PEM_2d1_b")) + self.p_conv2d1_act = paddle.nn.ReLU() + + self.p_conv2d2 = paddle.nn.Conv2D( + in_channels=128, + out_channels=self.hidden_dim_2d, 
+ kernel_size=3, + stride=1, + padding=1, + weight_attr=ParamAttr(name="PEM_2d2_w"), + bias_attr=ParamAttr(name="PEM_2d2_b")) + self.p_conv2d2_act = paddle.nn.ReLU() + + self.p_conv2d3 = paddle.nn.Conv2D( + in_channels=128, + out_channels=self.hidden_dim_2d, + kernel_size=3, + stride=1, + padding=1, + weight_attr=ParamAttr(name="PEM_2d3_w"), + bias_attr=ParamAttr(name="PEM_2d3_b")) + self.p_conv2d3_act = paddle.nn.ReLU() + + self.p_conv2d4 = paddle.nn.Conv2D( + in_channels=128, + out_channels=2, + kernel_size=1, + stride=1, + padding=0, + weight_attr=ParamAttr(name="PEM_2d4_w"), + bias_attr=ParamAttr(name="PEM_2d4_b")) + self.p_conv2d4_act = paddle.nn.Sigmoid() + + def init_weights(self): + pass + + def forward(self, x): + #Base Module + x = self.b_conv1(x) + x = self.b_conv1_act(x) + x = self.b_conv2(x) + x = self.b_conv2_act(x) + + #TEM + xs = self.ts_conv1(x) + xs = self.ts_conv1_act(xs) + xs = self.ts_conv2(xs) + xs = self.ts_conv2_act(xs) + xs = paddle.squeeze(xs, axis=[1]) + xe = self.te_conv1(x) + xe = self.te_conv1_act(xe) + xe = self.te_conv2(xe) + xe = self.te_conv2_act(xe) + xe = paddle.squeeze(xe, axis=[1]) + + #PEM + xp = self.p_conv1(x) + xp = self.p_conv1_act(xp) + #BM layer + xp = paddle.matmul(xp, self.sample_mask) + xp = paddle.reshape(xp, shape=[0, 0, -1, self.dscale, self.tscale]) + + xp = self.p_conv3d1(xp) + xp = self.p_conv3d1_act(xp) + xp = paddle.squeeze(xp, axis=[2]) + xp = self.p_conv2d1(xp) + xp = self.p_conv2d1_act(xp) + xp = self.p_conv2d2(xp) + xp = self.p_conv2d2_act(xp) + xp = self.p_conv2d3(xp) + xp = self.p_conv2d3_act(xp) + xp = self.p_conv2d4(xp) + xp = self.p_conv2d4_act(xp) + return xp, xs, xe diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/cfbi.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/cfbi.py new file mode 100644 index 0000000..5fbf044 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/cfbi.py @@ -0,0 +1,88 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
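The BM layer in the BMN forward pass above is the one non-obvious tensor manipulation in that file: the 1-D PEM feature map is multiplied by a precomputed sampling mask and reshaped into a boundary-matching feature cube before the 3-D and 2-D convolutions. The snippet below is a shape-only sketch of that step; the random tensor stands in for the real get_interp1d_mask output, and the tscale/dscale/num_sample values are illustrative rather than taken from a real config.

import paddle

# Shape-only sketch of the BM layer (illustrative sizes; a random tensor
# stands in for the mask built by get_interp1d_mask).
N, C, tscale, dscale, num_sample = 2, 128, 20, 20, 8
xp = paddle.randn([N, C, tscale])                            # output of p_conv1
sample_mask = paddle.randn([tscale, num_sample * dscale * tscale])
xp = paddle.matmul(xp, sample_mask)                          # [N, C, S*D*T]
xp = paddle.reshape(xp, shape=[0, 0, -1, dscale, tscale])    # [N, C, S, D, T]
print(xp.shape)                                              # [2, 128, 8, 20, 20]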
+ +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from ..registry import BACKBONES +from .deeplab import DeepLab + + +class FPN(nn.Layer): + """FPN Layer""" + def __init__(self, in_dim_4x, in_dim_8x, in_dim_16x, out_dim): + super(FPN, self).__init__() + self.toplayer = self._make_layer(in_dim_16x, out_dim) + self.latlayer1 = self._make_layer(in_dim_8x, out_dim) + self.latlayer2 = self._make_layer(in_dim_4x, out_dim) + + self.smooth1 = self._make_layer(out_dim, + out_dim, + kernel_size=3, + padding=1) + self.smooth2 = self._make_layer(out_dim, + out_dim, + kernel_size=3, + padding=1) + + def _make_layer(self, in_dim, out_dim, kernel_size=1, padding=0): + return nn.Sequential( + nn.Conv2D(in_dim, + out_dim, + kernel_size=kernel_size, + stride=1, + padding=padding, + bias_attr=False), + nn.GroupNorm(num_groups=32, num_channels=out_dim)) + + def forward(self, x_4x, x_8x, x_16x): + """ forward function""" + x_16x = self.toplayer(x_16x) + x_8x = self.latlayer1(x_8x) + x_4x = self.latlayer2(x_4x) + + x_8x = x_8x + F.interpolate( + x_16x, size=x_8x.shape[-2:], mode='bilinear', align_corners=True) + x_4x = x_4x + F.interpolate( + x_8x, size=x_4x.shape[-2:], mode='bilinear', align_corners=True) + + x_8x = self.smooth1(x_8x) + x_4x = self.smooth2(x_4x) + + return F.relu(x_4x), F.relu(x_8x), F.relu(x_16x) + + +@BACKBONES.register() +class CFBI(nn.Layer): + """CFBI plus backbone""" + def __init__(self, + backbone='resnet', + freeze_bn=True, + model_aspp_outdim=256, + in_dim_8x=512, + model_semantic_embedding_dim=256): #,epsilon=1e-05): + super(CFBI, self).__init__() + #self.epsilon = epsilon + self.feature_extracter = DeepLab(backbone=backbone, freeze_bn=freeze_bn) + self.fpn = FPN(in_dim_4x=model_aspp_outdim, + in_dim_8x=in_dim_8x, + in_dim_16x=model_aspp_outdim, + out_dim=model_semantic_embedding_dim) + + def forward(self, x): + """forward function""" + x, aspp_x, low_level, mid_level = self.feature_extracter(x, True) + x_4x, x_8x, x_16x = self.fpn(x, mid_level, aspp_x) + return x_4x, x_8x, x_16x, low_level diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/ctrgcn.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/ctrgcn.py new file mode 100644 index 0000000..9d645f4 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/ctrgcn.py @@ -0,0 +1,514 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
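As a quick sanity check of the FPN defined in cfbi.py above, the sketch below feeds random feature maps at strides 4/8/16 through it. The import path is an assumption based on this diff's package layout, and the channel sizes mirror the CFBI defaults (ASPP out_dim 256, in_dim_8x 512); the spatial sizes are illustrative.

import paddle
from paddlevideo.modeling.backbones.cfbi import FPN  # path assumed from this diff

# Feature maps at strides 4/8/16 of a 384x384 frame (sizes illustrative).
x_4x = paddle.randn([1, 256, 96, 96])
x_8x = paddle.randn([1, 512, 48, 48])
x_16x = paddle.randn([1, 256, 24, 24])

fpn = FPN(in_dim_4x=256, in_dim_8x=512, in_dim_16x=256, out_dim=256)
o_4x, o_8x, o_16x = fpn(x_4x, x_8x, x_16x)
# Top-down merging keeps each spatial size and maps every level to out_dim.
print(o_4x.shape, o_8x.shape, o_16x.shape)  # [1,256,96,96] [1,256,48,48] [1,256,24,24]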
+ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import numpy as np +from ..registry import BACKBONES +from ..weight_init import weight_init_ + + +def conv_init(conv): + if conv.weight is not None: + weight_init_(conv.weight, 'kaiming_normal_', mode='fan_in') + if conv.bias is not None: + nn.initializer.Constant(value=0.0)(conv.bias) + + +def bn_init(bn, scale): + nn.initializer.Constant(value=float(scale))(bn.weight) + nn.initializer.Constant(value=0.0)(bn.bias) + + +def einsum(x1, x3): + """paddle.einsum only support in dynamic graph mode. + x1 : n c u v + x2 : n c t v + """ + n, c, u, v1 = x1.shape + n, c, t, v3 = x3.shape + assert (v1 == v3), "Args of einsum not match!" + x1 = paddle.transpose(x1, perm=[0, 1, 3, 2]) # n c v u + y = paddle.matmul(x3, x1) + # out: n c t u + return y + + +class CTRGC(nn.Layer): + + def __init__(self, + in_channels, + out_channels, + rel_reduction=8, + mid_reduction=1): + super(CTRGC, self).__init__() + self.in_channels = in_channels + self.out_channels = out_channels + if in_channels == 3 or in_channels == 9: + self.rel_channels = 8 + self.mid_channels = 16 + else: + self.rel_channels = in_channels // rel_reduction + self.mid_channels = in_channels // mid_reduction + self.conv1 = nn.Conv2D(self.in_channels, + self.rel_channels, + kernel_size=1) + self.conv2 = nn.Conv2D(self.in_channels, + self.rel_channels, + kernel_size=1) + self.conv3 = nn.Conv2D(self.in_channels, + self.out_channels, + kernel_size=1) + self.conv4 = nn.Conv2D(self.rel_channels, + self.out_channels, + kernel_size=1) + self.tanh = nn.Tanh() + + def init_weights(self): + """Initiate the parameters. + """ + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + conv_init(m) + elif isinstance(m, nn.BatchNorm2D): + bn_init(m, 1) + + def forward(self, x, A=None, alpha=1): + x1, x2, x3 = self.conv1(x).mean(-2), self.conv2(x).mean(-2), self.conv3( + x) + x1 = self.tanh(x1.unsqueeze(-1) - x2.unsqueeze(-2)) + x1 = self.conv4(x1) * alpha + ( + A.unsqueeze(0).unsqueeze(0) if A is not None else 0) # N,C,V,V + # We only support 'paddle.einsum()' in dynamic graph mode, if use in infer model please implement self. 
+ # x1 = paddle.einsum('ncuv,nctv->nctu', x1, x3) + x1 = einsum(x1, x3) + return x1 + + +class TemporalConv(nn.Layer): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + dilation=1): + super(TemporalConv, self).__init__() + pad = (kernel_size + (kernel_size - 1) * (dilation - 1) - 1) // 2 + self.conv = nn.Conv2D(in_channels, + out_channels, + kernel_size=(kernel_size, 1), + padding=(pad, 0), + stride=(stride, 1), + dilation=(dilation, 1)) + + self.bn = nn.BatchNorm2D(out_channels) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + return x + + +class MultiScale_TemporalConv(nn.Layer): + + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + dilations=[1, 2, 3, 4], + residual=True, + residual_kernel_size=1): + + super(MultiScale_TemporalConv, self).__init__() + assert out_channels % ( + len(dilations) + + 2) == 0, '# out channels should be multiples of # branches' + + # Multiple branches of temporal convolution + self.num_branches = len(dilations) + 2 + branch_channels = out_channels // self.num_branches + if type(kernel_size) == list: + assert len(kernel_size) == len(dilations) + else: + kernel_size = [kernel_size] * len(dilations) + # Temporal Convolution branches + self.branches = nn.LayerList([ + nn.Sequential( + nn.Conv2D(in_channels, + branch_channels, + kernel_size=1, + padding=0), + nn.BatchNorm2D(branch_channels), + nn.ReLU(), + TemporalConv(branch_channels, + branch_channels, + kernel_size=ks, + stride=stride, + dilation=dilation), + ) for ks, dilation in zip(kernel_size, dilations) + ]) + + # Additional Max & 1x1 branch + self.branches.append( + nn.Sequential( + nn.Conv2D(in_channels, + branch_channels, + kernel_size=1, + padding=0), nn.BatchNorm2D(branch_channels), + nn.ReLU(), + nn.MaxPool2D(kernel_size=(3, 1), + stride=(stride, 1), + padding=(1, 0)), nn.BatchNorm2D(branch_channels))) + + self.branches.append( + nn.Sequential( + nn.Conv2D(in_channels, + branch_channels, + kernel_size=1, + padding=0, + stride=(stride, 1)), nn.BatchNorm2D(branch_channels))) + + # Residual connection + if not residual: + self.residual = lambda x: 0 + elif (in_channels == out_channels) and (stride == 1): + self.residual = lambda x: x + else: + self.residual = TemporalConv(in_channels, + out_channels, + kernel_size=residual_kernel_size, + stride=stride) + + def init_weights(self): + """Initiate the parameters. 
+ """ + # initialize + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + conv_init(m) + elif isinstance(m, nn.BatchNorm2D): + weight_init_(m.weight, 'Normal', std=0.02, mean=1.0) + nn.initializer.Constant(value=0.0)(m.bias) + + def forward(self, x): + # Input dim: (N,C,T,V) + res = self.residual(x) + branch_outs = [] + for tempconv in self.branches: + out = tempconv(x) + branch_outs.append(out) + + out = paddle.concat(branch_outs, axis=1) + out += res + return out + + +class unit_tcn(nn.Layer): + + def __init__(self, in_channels, out_channels, kernel_size=9, stride=1): + super(unit_tcn, self).__init__() + pad = int((kernel_size - 1) / 2) + self.conv = nn.Conv2D(in_channels, + out_channels, + kernel_size=(kernel_size, 1), + padding=(pad, 0), + stride=(stride, 1)) + + self.bn = nn.BatchNorm2D(out_channels) + self.relu = nn.ReLU() + conv_init(self.conv) + bn_init(self.bn, 1) + + def forward(self, x): + x = self.bn(self.conv(x)) + return x + + +class unit_gcn(nn.Layer): + + def __init__(self, + in_channels, + out_channels, + A, + coff_embedding=4, + adaptive=True, + residual=True): + super(unit_gcn, self).__init__() + inter_channels = out_channels // coff_embedding + self.inter_c = inter_channels + self.out_c = out_channels + self.in_c = in_channels + self.adaptive = adaptive + self.num_subset = A.shape[0] + self.convs = nn.LayerList() + + for i in range(self.num_subset): + self.convs.append(CTRGC(in_channels, out_channels)) + + if residual: + if in_channels != out_channels: + self.down = nn.Sequential( + nn.Conv2D(in_channels, out_channels, 1), + nn.BatchNorm2D(out_channels)) + else: + self.down = lambda x: x + else: + self.down = lambda x: 0 + if self.adaptive: + pa_param = paddle.ParamAttr( + initializer=paddle.nn.initializer.Assign(A.astype(np.float32))) + self.PA = paddle.create_parameter(shape=A.shape, + dtype='float32', + attr=pa_param) + else: + A_tensor = paddle.to_tensor(A, dtype="float32") + self.A = paddle.create_parameter( + shape=A_tensor.shape, + dtype='float32', + default_initializer=paddle.nn.initializer.Assign(A_tensor)) + self.A.stop_gradient = True + alpha_tensor = paddle.to_tensor(np.zeros(1), dtype="float32") + self.alpha = paddle.create_parameter( + shape=alpha_tensor.shape, + dtype='float32', + default_initializer=paddle.nn.initializer.Assign(alpha_tensor)) + self.bn = nn.BatchNorm2D(out_channels) + self.soft = nn.Softmax(-2) + self.relu = nn.ReLU() + + def init_weights(self): + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + conv_init(m) + elif isinstance(m, nn.BatchNorm2D): + bn_init(m, 1) + bn_init(self.bn, 1e-6) + + def forward(self, x): + y = None + if self.adaptive: + A = self.PA + else: + A = self.A.cuda(x.get_device()) + for i in range(self.num_subset): + z = self.convs[i](x, A[i], self.alpha) + y = z + y if y is not None else z + y = self.bn(y) + y += self.down(x) + y = self.relu(y) + return y + + +class TCN_GCN_unit(nn.Layer): + + def __init__(self, + in_channels, + out_channels, + A, + stride=1, + residual=True, + adaptive=True, + kernel_size=5, + dilations=[1, 2]): + super(TCN_GCN_unit, self).__init__() + self.gcn1 = unit_gcn(in_channels, out_channels, A, adaptive=adaptive) + self.tcn1 = MultiScale_TemporalConv(out_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + dilations=dilations, + residual=False) + self.relu = nn.ReLU() + if not residual: + self.residual = lambda x: 0 + + elif (in_channels == out_channels) and (stride == 1): + self.residual = lambda x: x + + else: + self.residual = unit_tcn(in_channels, + 
out_channels, + kernel_size=1, + stride=stride) + + def forward(self, x): + y = self.relu(self.tcn1(self.gcn1(x)) + self.residual(x)) + return y + + +class NTUDGraph: + + def __init__(self, labeling_mode='spatial'): + num_node = 25 + self_link = [(i, i) for i in range(num_node)] + inward_ori_index = [(1, 2), (2, 21), (3, 21), (4, 3), (5, 21), (6, 5), + (7, 6), (8, 7), (9, 21), (10, 9), (11, 10), + (12, 11), (13, 1), (14, 13), (15, 14), (16, 15), + (17, 1), (18, 17), (19, 18), (20, 19), (22, 23), + (23, 8), (24, 25), (25, 12)] + inward = [(i - 1, j - 1) for (i, j) in inward_ori_index] + outward = [(j, i) for (i, j) in inward] + neighbor = inward + outward + + self.num_node = num_node + self.self_link = self_link + self.inward = inward + self.outward = outward + self.neighbor = neighbor + self.A = self.get_adjacency_matrix(labeling_mode) + + def edge2mat(self, link, num_node): + A = np.zeros((num_node, num_node)) + for i, j in link: + A[j, i] = 1 + return A + + def normalize_digraph(self, A): + Dl = np.sum(A, 0) + h, w = A.shape + Dn = np.zeros((w, w)) + for i in range(w): + if Dl[i] > 0: + Dn[i, i] = Dl[i]**(-1) + AD = np.dot(A, Dn) + return AD + + def get_spatial_graph(self, num_node, self_link, inward, outward): + I = self.edge2mat(self_link, num_node) + In = self.normalize_digraph(self.edge2mat(inward, num_node)) + Out = self.normalize_digraph(self.edge2mat(outward, num_node)) + A = np.stack((I, In, Out)) + return A + + def get_adjacency_matrix(self, labeling_mode=None): + if labeling_mode is None: + return self.A + if labeling_mode == 'spatial': + A = self.get_spatial_graph(self.num_node, self.self_link, + self.inward, self.outward) + else: + raise ValueError() + return A + + +@BACKBONES.register() +class CTRGCN(nn.Layer): + """ + CTR-GCN model from: + `"Channel-wise Topology Refinement Graph Convolution for Skeleton-Based Action Recognition" `_ + Args: + num_point: int, numbers of sketeton point. + num_person: int, numbers of person. + base_channel: int, model's hidden dim. + graph: str, sketeton adjacency matrix name. + graph_args: dict, sketeton adjacency graph class args. + in_channels: int, channels of vertex coordinate. 2 for (x,y), 3 for (x,y,z). Default 3. + adaptive: bool, if adjacency matrix can adaptive. 
+ """ + + def __init__(self, + num_point=25, + num_person=2, + base_channel=64, + graph='ntu_rgb_d', + graph_args=dict(), + in_channels=3, + adaptive=True): + super(CTRGCN, self).__init__() + + if graph == 'ntu_rgb_d': + self.graph = NTUDGraph(**graph_args) + else: + raise ValueError() + + A = self.graph.A # 3,25,25 + + self.num_point = num_point + self.data_bn = nn.BatchNorm1D(num_person * in_channels * num_point) + self.base_channel = base_channel + + self.l1 = TCN_GCN_unit(in_channels, + self.base_channel, + A, + residual=False, + adaptive=adaptive) + self.l2 = TCN_GCN_unit(self.base_channel, + self.base_channel, + A, + adaptive=adaptive) + self.l3 = TCN_GCN_unit(self.base_channel, + self.base_channel, + A, + adaptive=adaptive) + self.l4 = TCN_GCN_unit(self.base_channel, + self.base_channel, + A, + adaptive=adaptive) + self.l5 = TCN_GCN_unit(self.base_channel, + self.base_channel * 2, + A, + stride=2, + adaptive=adaptive) + self.l6 = TCN_GCN_unit(self.base_channel * 2, + self.base_channel * 2, + A, + adaptive=adaptive) + self.l7 = TCN_GCN_unit(self.base_channel * 2, + self.base_channel * 2, + A, + adaptive=adaptive) + self.l8 = TCN_GCN_unit(self.base_channel * 2, + self.base_channel * 4, + A, + stride=2, + adaptive=adaptive) + self.l9 = TCN_GCN_unit(self.base_channel * 4, + self.base_channel * 4, + A, + adaptive=adaptive) + self.l10 = TCN_GCN_unit(self.base_channel * 4, + self.base_channel * 4, + A, + adaptive=adaptive) + + def init_weights(self): + bn_init(self.data_bn, 1) + + def forward(self, x): + N, C, T, V, M = x.shape + x = paddle.transpose(x, perm=[0, 4, 3, 1, 2]) + x = paddle.reshape(x, (N, M * V * C, T)) + + x = self.data_bn(x) + + x = paddle.reshape(x, (N, M, V, C, T)) + x = paddle.transpose(x, perm=(0, 1, 3, 4, 2)) + + x = paddle.reshape(x, (N * M, C, T, V)) + + x = self.l1(x) + x = self.l2(x) + x = self.l3(x) + x = self.l4(x) + x = self.l5(x) + x = self.l6(x) + x = self.l7(x) + x = self.l8(x) + x = self.l9(x) + x = self.l10(x) + + return x, N, M diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/darknet.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/darknet.py new file mode 100644 index 0000000..3f48bf6 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/darknet.py @@ -0,0 +1,165 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +import numpy as np + + +class ConvBNLayer(nn.Layer): + def __init__(self, + input_channels, + output_channels, + filter_size, + stride, + padding, + name=None): + super(ConvBNLayer, self).__init__() + + self._conv = nn.Conv2D( + in_channels=input_channels, + out_channels=output_channels, + kernel_size=filter_size, + stride=stride, + padding=padding, + weight_attr=ParamAttr(name=name + ".conv.weights"), + bias_attr=False) + + bn_name = name + ".bn" + self._bn = nn.BatchNorm( + num_channels=output_channels, + act="leaky_relu", + param_attr=ParamAttr(name=bn_name + ".scale"), + bias_attr=ParamAttr(name=bn_name + ".offset"), + moving_mean_name=bn_name + ".mean", + moving_variance_name=bn_name + ".var") + + def forward(self, inputs): + x = self._conv(inputs) + x = self._bn(x) + return x + + +class BasicBlock(nn.Layer): + def __init__(self, input_channels, output_channels, name=None): + super(BasicBlock, self).__init__() + + self._conv1 = ConvBNLayer(input_channels=input_channels, output_channels=output_channels, filter_size=[ + 3, 3], stride=1, padding=1, name=name+'.0') + self._max_pool = nn.MaxPool2D(kernel_size=2, stride=2, padding=0) + self._conv2 = ConvBNLayer(input_channels=output_channels, output_channels=output_channels * + 2, filter_size=[3, 3], stride=1, padding=1, name=name+'.1') + self._conv3 = ConvBNLayer(input_channels=output_channels*2, output_channels=output_channels, + filter_size=[1, 1], stride=1, padding=0, name=name+'.2') + + def forward(self, x): + x = self._conv1(x) + x = self._max_pool(x) + x = self._conv2(x) + x = self._conv3(x) + return x + + +class Reorg(nn.Layer): + def __init__(self, stride=2): + super(Reorg, self).__init__() + self.stride = stride + + def forward(self, x): + stride = self.stride + assert (x.dim() == 4) + B = x.shape[0] + C = x.shape[1] + H = x.shape[2] + W = x.shape[3] + assert (H % stride == 0) + assert (W % stride == 0) + ws = stride + hs = stride + x = x.reshape([B, C, H // hs, hs, W // ws, ws] + ).transpose([0, 1, 2, 4, 3, 5]) + x = x.reshape([B, C, H // hs * W // ws, hs * ws] + ).transpose([0, 1, 3, 2]) + x = x.reshape([B, C, hs * ws, H // hs, W // ws] + ).transpose([0, 2, 1, 3, 4]) + x = x.reshape([B, hs * ws * C, H // hs, W // ws]) + return x + + +class Darknet(nn.Layer): + def __init__(self, pretrained=None): + super(Darknet, self).__init__() + self.pretrained = pretrained + self._conv1 = ConvBNLayer( + input_channels=3, output_channels=32, filter_size=3, stride=1, padding=1, name='input') + self._max_pool1 = nn.MaxPool2D(kernel_size=2, stride=2, padding=0) + self._basic_block_11 = BasicBlock( + input_channels=32, output_channels=64, name='1.1') + self._basic_block_12 = BasicBlock( + input_channels=64, output_channels=128, name='1.2') + self._basic_block_13 = BasicBlock( + input_channels=128, output_channels=256, name='1.3') + self._conv2 = ConvBNLayer( + input_channels=256, output_channels=512, filter_size=3, stride=1, padding=1, name='up1') + self._conv3 = ConvBNLayer( + input_channels=512, output_channels=256, filter_size=1, stride=1, padding=0, name='down1') + self._conv4 = ConvBNLayer( + input_channels=256, output_channels=512, filter_size=3, stride=1, padding=1, name='2.1') + self._max_pool2 = nn.MaxPool2D(kernel_size=2, stride=2, padding=0) + self._conv5 = ConvBNLayer( + input_channels=512, output_channels=1024, filter_size=3, stride=1, padding=1, name='2.2') + self._conv6 = ConvBNLayer(input_channels=1024, output_channels=512, + 
filter_size=1, stride=1, padding=0, name='2.3') # ori + self._conv7 = ConvBNLayer( + input_channels=512, output_channels=1024, filter_size=3, stride=1, padding=1, name='up2') + self._conv8 = ConvBNLayer(input_channels=1024, output_channels=512, + filter_size=1, stride=1, padding=0, name='down2') + self._conv9 = ConvBNLayer( + input_channels=512, output_channels=1024, filter_size=3, stride=1, padding=1, name='3.1') + self._conv10 = ConvBNLayer( + input_channels=1024, output_channels=1024, filter_size=3, stride=1, padding=1, name='3.2') + self._conv11 = ConvBNLayer( + input_channels=1024, output_channels=1024, filter_size=3, stride=1, padding=1, name='3.3') + self._conv12 = ConvBNLayer( + input_channels=512, output_channels=64, filter_size=1, stride=1, padding=0, name='4.1') + self._reorg = Reorg() + self._conv13 = ConvBNLayer( + input_channels=1280, output_channels=1024, filter_size=3, stride=1, padding=1, name='5.1') + self._conv14 = nn.Conv2D(1024, 425, kernel_size=1) + + def forward(self, inputs): + x = self._conv1(inputs) + x = self._max_pool1(x) + x = self._basic_block_11(x) + x = self._basic_block_12(x) + x = self._basic_block_13(x) + x = self._conv2(x) + x = self._conv3(x) + ori = self._conv4(x) + x = self._max_pool2(ori) + x = self._conv5(x) + x = self._conv6(x) + x = self._conv7(x) + x = self._conv8(x) + x = self._conv9(x) + x = self._conv10(x) + x1 = self._conv11(x) + x2 = self._conv12(ori) + x2 = self._reorg(x2) + x = paddle.concat([x2, x1], 1) + x = self._conv13(x) + x = self._conv14(x) + return x diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/deeplab.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/deeplab.py new file mode 100644 index 0000000..c566205 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/deeplab.py @@ -0,0 +1,454 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
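The Reorg layer in darknet.py above is the YOLOv2-style passthrough: it folds each stride-by-stride spatial block into the channel axis so a higher-resolution feature map can be concatenated with a deeper one, which is why conv13 takes 1280 channels (1024 + 64 * 2 * 2). A small hedged check, assuming the module is importable from this diff's layout:

import paddle
from paddlevideo.modeling.backbones.darknet import Reorg  # path assumed from this diff

x = paddle.randn([1, 64, 26, 26])
y = Reorg(stride=2)(x)
# Space-to-depth: [B, C, H, W] -> [B, C*s*s, H/s, W/s]
print(y.shape)   # [1, 256, 13, 13]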
+ +import numpy as np +import copy + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from ..registry import BACKBONES + + +class FrozenBatchNorm2D(nn.Layer): + """ + BatchNorm2D where the batch statistics and the affine parameters + are fixed + """ + def __init__(self, n, epsilon=1e-5): + super(FrozenBatchNorm2D, self).__init__() + x1 = paddle.ones([n]) + x2 = paddle.zeros([n]) + weight = self.create_parameter( + shape=x1.shape, default_initializer=nn.initializer.Assign(x1)) + bias = self.create_parameter( + shape=x2.shape, default_initializer=nn.initializer.Assign(x2)) + running_mean = self.create_parameter( + shape=x2.shape, default_initializer=nn.initializer.Assign(x2)) + running_var = self.create_parameter( + shape=x1.shape, default_initializer=nn.initializer.Assign(x1)) + self.add_parameter('weight', weight) + self.add_parameter('bias', bias) + self.add_parameter('running_mean', running_mean) + self.add_parameter('running_var', running_var) + self.epsilon = epsilon + + def forward(self, x): + scale = self.weight * paddle.rsqrt((self.running_var + self.epsilon)) + bias = self.bias - self.running_mean * scale + scale = paddle.reshape(scale, [1, -1, 1, 1]) + bias = paddle.reshape(bias, [1, -1, 1, 1]) + return x * scale + bias + + +class Bottleneck(nn.Layer): + expansion = 4 + + def __init__(self, + inplanes, + planes, + stride=1, + dilation=1, + downsample=None, + BatchNorm=None): + super(Bottleneck, self).__init__() + self.conv1 = nn.Conv2D(inplanes, planes, kernel_size=1, bias_attr=False) + self.bn1 = BatchNorm(planes) + self.conv2 = nn.Conv2D(planes, + planes, + kernel_size=3, + stride=stride, + dilation=dilation, + padding=dilation, + bias_attr=False) + self.bn2 = BatchNorm(planes) + self.conv3 = nn.Conv2D(planes, + planes * 4, + kernel_size=1, + bias_attr=False) + self.bn3 = BatchNorm(planes * 4) + self.relu = nn.ReLU() + self.downsample = downsample + self.stride = stride + self.dilation = dilation + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class ResNet(nn.Layer): + def __init__(self, + block, + layers, + output_stride, + BatchNorm, + pretrained=False): + self.inplanes = 64 + super(ResNet, self).__init__() + blocks = [1, 2, 4] + if output_stride == 16: + strides = [1, 2, 2, 1] + dilations = [1, 1, 1, 2] + elif output_stride == 8: + strides = [1, 2, 1, 1] + dilations = [1, 1, 2, 4] + else: + raise NotImplementedError + + # Modules + self.conv1 = nn.Conv2D(3, + 64, + kernel_size=7, + stride=2, + padding=3, + bias_attr=False) + self.bn1 = BatchNorm(64) + self.relu = nn.ReLU() + self.maxpool = nn.MaxPool2D(kernel_size=3, stride=2, padding=1) + + self.layer1 = self._make_layer(block, + 64, + layers[0], + stride=strides[0], + dilation=dilations[0], + BatchNorm=BatchNorm) + self.layer2 = self._make_layer(block, + 128, + layers[1], + stride=strides[1], + dilation=dilations[1], + BatchNorm=BatchNorm) + self.layer3 = self._make_layer(block, + 256, + layers[2], + stride=strides[2], + dilation=dilations[2], + BatchNorm=BatchNorm) + self.layer4 = self._make_MG_unit(block, + 512, + blocks=blocks, + stride=strides[3], + dilation=dilations[3], + BatchNorm=BatchNorm) + self._init_weight() + + def _make_layer(self, + block, + planes, + blocks, + stride=1, + 
dilation=1, + BatchNorm=None): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2D(self.inplanes, + planes * block.expansion, + kernel_size=1, + stride=stride, + bias_attr=False), + BatchNorm(planes * block.expansion), + ) + + layers = [] + layers.append( + block(self.inplanes, planes, stride, dilation, downsample, + BatchNorm)) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append( + block(self.inplanes, + planes, + dilation=dilation, + BatchNorm=BatchNorm)) + + return nn.Sequential(*layers) + + def _make_MG_unit(self, + block, + planes, + blocks, + stride=1, + dilation=1, + BatchNorm=None): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2D(self.inplanes, + planes * block.expansion, + kernel_size=1, + stride=stride, + bias_attr=False), + BatchNorm(planes * block.expansion), + ) + + layers = [] + layers.append( + block(self.inplanes, + planes, + stride, + dilation=blocks[0] * dilation, + downsample=downsample, + BatchNorm=BatchNorm)) + self.inplanes = planes * block.expansion + for i in range(1, len(blocks)): + layers.append( + block(self.inplanes, + planes, + stride=1, + dilation=blocks[i] * dilation, + BatchNorm=BatchNorm)) + + return nn.Sequential(*layers) + + def forward(self, input, return_mid_level=False): + x = self.conv1(input) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + + x = self.layer1(x) + low_level_feat = x + x = self.layer2(x) + mid_level_feat = x + x = self.layer3(x) + x = self.layer4(x) + if return_mid_level: + return x, low_level_feat, mid_level_feat + else: + return x, low_level_feat + + def _init_weight(self): + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + nn.initializer.KaimingNormal() + elif isinstance(m, nn.GroupNorm): + m.weight.data = nn.initializer.Constant(1) + m.bias.data = nn.initializer.Constant(0) + + +class _ASPPModule(nn.Layer): + def __init__(self, inplanes, planes, kernel_size, padding, dilation, + BatchNorm): + super(_ASPPModule, self).__init__() + self.atrous_conv = nn.Conv2D(inplanes, + planes, + kernel_size=kernel_size, + stride=1, + padding=padding, + dilation=dilation, + bias_attr=False) + self.bn = BatchNorm(planes) + self.relu = nn.ReLU() + + self._init_weight() + + def forward(self, x): + x = self.atrous_conv(x) + x = self.bn(x) + + return self.relu(x) + + def _init_weight(self): + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + m.weight_attr = nn.initializer.KaimingNormal() + elif isinstance(m, nn.BatchNorm2D): + m.weight.data.fill_(1) + m.bias.data.zero_() + + +class ASPP(nn.Layer): + def __init__(self, backbone, output_stride, BatchNorm): + super(ASPP, self).__init__() + if backbone == 'drn': + inplanes = 512 + elif backbone == 'mobilenet': + inplanes = 320 + else: + inplanes = 2048 + if output_stride == 16: + dilations = [1, 6, 12, 18] + elif output_stride == 8: + dilations = [1, 12, 24, 36] + else: + raise NotImplementedError + + self.aspp1 = _ASPPModule(inplanes, + 256, + 1, + padding=0, + dilation=dilations[0], + BatchNorm=BatchNorm) + self.aspp2 = _ASPPModule(inplanes, + 256, + 3, + padding=dilations[1], + dilation=dilations[1], + BatchNorm=BatchNorm) + self.aspp3 = _ASPPModule(inplanes, + 256, + 3, + padding=dilations[2], + dilation=dilations[2], + BatchNorm=BatchNorm) + self.aspp4 = _ASPPModule(inplanes, + 256, + 3, + padding=dilations[3], + dilation=dilations[3], + BatchNorm=BatchNorm) + + self.global_avg_pool = 
nn.Sequential( + nn.AdaptiveAvgPool2D((1, 1)), + nn.Conv2D(inplanes, 256, 1, stride=1, bias_attr=False), + BatchNorm(256), nn.ReLU()) + self.conv1 = nn.Conv2D(1280, 256, 1, bias_attr=False) + self.bn1 = BatchNorm(256) + self.relu = nn.ReLU() + self.dropout = nn.Dropout(0.1) + self._init_weight() + + def forward(self, x): + x1 = self.aspp1(x) + x2 = self.aspp2(x) + x3 = self.aspp3(x) + x4 = self.aspp4(x) + x5 = self.global_avg_pool(x) + x5 = F.interpolate(x5, + size=x4.shape[2:], + mode='bilinear', + align_corners=True) + x = paddle.concat(x=[x1, x2, x3, x4, x5], axis=1) + + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + + return self.dropout(x) + + def _init_weight(self): + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + nn.initializer.KaimingNormal() + elif isinstance(m, nn.GroupNorm): + m.weight.data = nn.initializer.Constant(1) + m.bias.data = nn.initializer.Constant(0) + + +class Decoder(nn.Layer): + def __init__(self, backbone, BatchNorm): + super(Decoder, self).__init__() + if backbone == 'resnet': + low_level_inplanes = 256 + elif backbone == 'mobilenet': + raise NotImplementedError + else: + raise NotImplementedError + + self.conv1 = nn.Conv2D(low_level_inplanes, 48, 1, bias_attr=False) + self.bn1 = BatchNorm(48) + self.relu = nn.ReLU() + + self.last_conv = nn.Sequential( + nn.Conv2D(304, + 256, + kernel_size=3, + stride=1, + padding=1, + bias_attr=False), BatchNorm(256), nn.ReLU(), + nn.Sequential(), + nn.Conv2D(256, + 256, + kernel_size=3, + stride=1, + padding=1, + bias_attr=False), BatchNorm(256), nn.ReLU(), + nn.Sequential()) + + self._init_weight() + + def forward(self, x, low_level_feat): + low_level_feat = self.conv1(low_level_feat) + low_level_feat = self.bn1(low_level_feat) + low_level_feat = self.relu(low_level_feat) + + x = F.interpolate(x, + size=low_level_feat.shape[2:], + mode='bilinear', + align_corners=True) + x = paddle.concat(x=[x, low_level_feat], axis=1) + x = self.last_conv(x) + + return x + + def _init_weight(self): + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + nn.initializer.KaimingNormal() + elif isinstance(m, nn.GroupNorm): + m.weight.data = nn.initializer.Constant(1) + m.bias.data = nn.initializer.Constant(0) + + +class DeepLab(nn.Layer): + """DeepLab model for segmentation""" + def __init__(self, backbone='resnet', output_stride=16, freeze_bn=True): + super(DeepLab, self).__init__() + + if freeze_bn == True: + print("Use frozen BN in DeepLab!") + BatchNorm = FrozenBatchNorm2D + else: + BatchNorm = nn.BatchNorm2D + + self.backbone = ResNet(Bottleneck, [3, 4, 23, 3], + output_stride, + BatchNorm, + pretrained=True) + self.aspp = ASPP(backbone, output_stride, BatchNorm) + self.decoder = Decoder(backbone, BatchNorm) + + def forward(self, input, return_aspp=False): + """forward function""" + if return_aspp: + x, low_level_feat, mid_level_feat = self.backbone(input, True) + else: + x, low_level_feat = self.backbone(input) + aspp_x = self.aspp(x) + x = self.decoder(aspp_x, low_level_feat) + + if return_aspp: + return x, aspp_x, low_level_feat, mid_level_feat + else: + return x, low_level_feat diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/movinet.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/movinet.py new file mode 100644 index 0000000..cb6d4fd --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/movinet.py @@ -0,0 +1,574 @@ +import collections.abc +from itertools import repeat +from typing import Any, Callable, Optional, Tuple, Union + 
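Back in deeplab.py, FrozenBatchNorm2D replaces batch statistics with fixed buffers, which reduces the layer to a per-channel affine map y = (x - mean) / sqrt(var + eps) * weight + bias. The sketch below illustrates that equivalence, assuming the module is importable from this diff's layout; with the freshly initialized buffers (weight=1, bias=0, mean=0, var=1) the layer is effectively the identity.

import paddle
from paddlevideo.modeling.backbones.deeplab import FrozenBatchNorm2D  # path assumed

bn = FrozenBatchNorm2D(8)                 # weight=1, bias=0, mean=0, var=1 at init
x = paddle.randn([2, 8, 5, 5])
y = bn(x)
# The same affine map written out by hand; the printed difference should be ~0.
expected = x * (1.0 / (1.0 + 1e-5) ** 0.5)
print(float(paddle.abs(y - expected).max()))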
+import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn.layer import Identity + +from ..registry import BACKBONES +from collections import OrderedDict + +container_abcs = collections.abc +"""Model Config +""" + +A0 = {'block_num': [0, 1, 3, 3, 4, 4]} +A0['conv1'] = [3, 8, (1, 3, 3), (1, 2, 2), (0, 1, 1)] +A0['b2_l0'] = [8, 8, 24, (1, 5, 5), (1, 2, 2), (0, 2, 2), (0, 1, 1)] +A0['b3_l0'] = [8, 32, 80, (3, 3, 3), (1, 2, 2), (1, 0, 0), (0, 0, 0)] +A0['b3_l1'] = [32, 32, 80, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1)] +A0['b3_l2'] = [32, 32, 80, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1)] +A0['b4_l0'] = [32, 56, 184, (5, 3, 3), (1, 2, 2), (2, 0, 0), (0, 0, 0)] +A0['b4_l1'] = [56, 56, 112, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1)] +A0['b4_l2'] = [56, 56, 184, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1)] +A0['b5_l0'] = [56, 56, 184, (5, 3, 3), (1, 1, 1), (2, 1, 1), (0, 1, 1)] +A0['b5_l1'] = [56, 56, 184, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1)] +A0['b5_l2'] = [56, 56, 184, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1)] +A0['b5_l3'] = [56, 56, 184, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1)] +A0['b6_l0'] = [56, 104, 384, (5, 3, 3), (1, 2, 2), (2, 1, 1), (0, 1, 1)] +A0['b6_l1'] = [104, 104, 280, (1, 5, 5), (1, 1, 1), (0, 2, 2), (0, 1, 1)] +A0['b6_l2'] = [104, 104, 280, (1, 5, 5), (1, 1, 1), (0, 2, 2), (0, 1, 1)] +A0['b6_l3'] = [104, 104, 344, (1, 5, 5), (1, 1, 1), (0, 2, 2), (0, 1, 1)] +A0['conv7'] = [104, 480, (1, 1, 1), (1, 1, 1), (0, 0, 0)] + +MODEL_CONFIG = {'A0': A0} + + +def _ntuple(n): + def parse(x): + if isinstance(x, container_abcs.Iterable): + return x + return tuple(repeat(x, n)) + + return parse + + +def _make_divisible(v: float, + divisor: int, + min_value: Optional[int] = None) -> int: + """ + This function is taken from the original tf repo. + It ensures that all layers have a channel number that is divisible by 8. + It can be seen here: + https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py + """ + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + # Make sure that round down does not go down by more than 10%. 
+ if new_v < 0.9 * v: + new_v += divisor + return new_v + + +_single = _ntuple(1) +_pair = _ntuple(2) +_triple = _ntuple(3) +_quadruple = _ntuple(4) + + +class CausalModule(nn.Layer): + def __init__(self) -> None: + super().__init__() + self.activation = None + + def reset_activation(self) -> None: + self.activation = None + + +class Conv2dBNActivation(nn.Sequential): + def __init__( + self, + in_planes: int, + out_planes: int, + kernel_size: Union[int, Tuple[int, int]], + padding: Union[int, Tuple[int, int]], + stride: Union[int, Tuple[int, int]] = 1, + groups: int = 1, + norm_layer: Optional[Callable[..., nn.Layer]] = None, + activation_layer: Optional[Callable[..., nn.Layer]] = None, + **kwargs: Any, + ) -> None: + kernel_size = _pair(kernel_size) + stride = _pair(stride) + padding = _pair(padding) + if norm_layer is None: + norm_layer = Identity + if activation_layer is None: + activation_layer = Identity + self.kernel_size = kernel_size + self.stride = stride + dict_layers = (nn.Conv2D(in_planes, + out_planes, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + **kwargs), norm_layer(out_planes, + momentum=0.1), + activation_layer()) + + self.out_channels = out_planes + super(Conv2dBNActivation, self).__init__(dict_layers[0], dict_layers[1], + dict_layers[2]) + + +class Conv3DBNActivation(nn.Sequential): + def __init__( + self, + in_planes: int, + out_planes: int, + kernel_size: Union[int, Tuple[int, int, int]], + padding: Union[int, Tuple[int, int, int]], + stride: Union[int, Tuple[int, int, int]] = 1, + groups: int = 1, + norm_layer: Optional[Callable[..., nn.Layer]] = None, + activation_layer: Optional[Callable[..., nn.Layer]] = None, + **kwargs: Any, + ) -> None: + kernel_size = _triple(kernel_size) + stride = _triple(stride) + padding = _triple(padding) + if norm_layer is None: + norm_layer = Identity + if activation_layer is None: + activation_layer = Identity + self.kernel_size = kernel_size + self.stride = stride + + dict_layers = (nn.Conv3D(in_planes, + out_planes, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + **kwargs), norm_layer(out_planes, + momentum=0.1), + activation_layer()) + self.out_channels = out_planes + super(Conv3DBNActivation, self).__init__(dict_layers[0], dict_layers[1], + dict_layers[2]) + + +class ConvBlock3D(CausalModule): + def __init__( + self, + in_planes: int, + out_planes: int, + kernel_size: Union[int, Tuple[int, int, int]], + causal: bool, + conv_type: str, + padding: Union[int, Tuple[int, int, int]] = 0, + stride: Union[int, Tuple[int, int, int]] = 1, + norm_layer: Optional[Callable[..., nn.Layer]] = None, + activation_layer: Optional[Callable[..., nn.Layer]] = None, + bias_attr: bool = False, + **kwargs: Any, + ) -> None: + super().__init__() + kernel_size = _triple(kernel_size) + stride = _triple(stride) + padding = _triple(padding) + self.conv_2 = None + + if causal is True: + padding = (0, padding[1], padding[2]) + if conv_type != "2plus1d" and conv_type != "3d": + raise ValueError("only 2plus2d or 3d are " + + "allowed as 3d convolutions") + + if conv_type == "2plus1d": + self.conv_1 = Conv2dBNActivation(in_planes, + out_planes, + kernel_size=(kernel_size[1], + kernel_size[2]), + padding=(padding[1], padding[2]), + stride=(stride[1], stride[2]), + activation_layer=activation_layer, + norm_layer=norm_layer, + bias_attr=bias_attr, + **kwargs) + if kernel_size[0] > 1: + self.conv_2 = Conv2dBNActivation( + in_planes, + out_planes, + kernel_size=(kernel_size[0], 1), + 
padding=(padding[0], 0), + stride=(stride[0], 1), + activation_layer=activation_layer, + norm_layer=norm_layer, + bias_attr=bias_attr, + **kwargs) + elif conv_type == "3d": + self.conv_1 = Conv3DBNActivation(in_planes, + out_planes, + kernel_size=kernel_size, + padding=padding, + activation_layer=activation_layer, + norm_layer=norm_layer, + stride=stride, + bias_attr=bias_attr, + **kwargs) + self.padding = padding + self.kernel_size = kernel_size + self.dim_pad = self.kernel_size[0] - 1 + self.stride = stride + self.causal = causal + self.conv_type = conv_type + + def _forward(self, x: paddle.Tensor) -> paddle.Tensor: + if self.dim_pad > 0 and self.conv_2 is None and self.causal is True: + x = self._cat_stream_buffer(x) + b, c, t, h, w = x.shape + if self.conv_type == "2plus1d": + x = paddle.transpose(x, (0, 2, 1, 3, 4)) # bcthw --> btchw + x = paddle.reshape_(x, (-1, c, h, w)) # btchw --> bt,c,h,w + x = self.conv_1(x) + if self.conv_type == "2plus1d": + b, c, h, w = x.shape + x = paddle.reshape_(x, (-1, t, c, h, w)) # bt,c,h,w --> b,t,c,h,w + x = paddle.transpose(x, (0, 2, 1, 3, 4)) # b,t,c,h,w --> b,c,t,h,w + if self.conv_2 is not None: + if self.dim_pad > 0 and self.causal is True: + x = self._cat_stream_buffer(x) + b, c, t, h, w = x.shape + x = paddle.reshape_(x, (b, c, t, h * w)) + x = self.conv_2(x) + b, c, t, _ = x.shape + x = paddle.reshape_(x, (b, c, t, h, w)) + return x + + def forward(self, x: paddle.Tensor) -> paddle.Tensor: + x = self._forward(x) + return x + + def _cat_stream_buffer(self, x: paddle.Tensor) -> paddle.Tensor: + if self.activation is None: + self._setup_activation(x.shape) + x = paddle.concat((self.activation, x), 2) + self._save_in_activation(x) + return x + + def _save_in_activation(self, x: paddle.Tensor) -> None: + assert self.dim_pad > 0 + self.activation = paddle.to_tensor(x.numpy()[:, :, -self.dim_pad:, + ...]).clone().detach() + + def _setup_activation(self, input_shape: Tuple[float, ...]) -> None: + assert self.dim_pad > 0 + self.activation = paddle.zeros(shape=[ + *input_shape[:2], # type: ignore + self.dim_pad, + *input_shape[3:] + ]) + + +class TemporalCGAvgPool3D(CausalModule): + def __init__(self, ) -> None: + super().__init__() + self.n_cumulated_values = 0 + self.register_forward_post_hook(self._detach_activation) + + def forward(self, x: paddle.Tensor) -> paddle.Tensor: + input_shape = x.shape + cumulative_sum = paddle.cumsum(x, axis=2) + if self.activation is None: + self.activation = cumulative_sum[:, :, -1:].clone() + else: + cumulative_sum += self.activation + self.activation = cumulative_sum[:, :, -1:].clone() + + noe = paddle.arange(1, input_shape[2] + 1) + axis = paddle.to_tensor([0, 1, 3, 4]) + noe = paddle.unsqueeze(noe, axis=axis) + divisor = noe.expand(x.shape) + x = cumulative_sum / (self.n_cumulated_values + divisor) + self.n_cumulated_values += input_shape[2] + return x + + @staticmethod + def _detach_activation(module: CausalModule, inputs: paddle.Tensor, + output: paddle.Tensor) -> None: + module.activation.detach() + + def reset_activation(self) -> None: + super().reset_activation() + self.n_cumulated_values = 0 + + +class SqueezeExcitation(nn.Layer): + def __init__(self, + input_channels: int, + activation_2: nn.Layer, + activation_1: nn.Layer, + conv_type: str, + causal: bool, + squeeze_factor: int = 4, + bias_attr: bool = True) -> None: + super().__init__() + self.causal = causal + se_multiplier = 2 if causal else 1 + squeeze_channels = _make_divisible( + input_channels // squeeze_factor * se_multiplier, 8) + 
self.temporal_cumualtive_GAvg3D = TemporalCGAvgPool3D() + self.fc1 = ConvBlock3D(input_channels * se_multiplier, + squeeze_channels, + kernel_size=(1, 1, 1), + padding=0, + causal=causal, + conv_type=conv_type, + bias_attr=bias_attr) + self.activation_1 = activation_1() + self.activation_2 = activation_2() + self.fc2 = ConvBlock3D(squeeze_channels, + input_channels, + kernel_size=(1, 1, 1), + padding=0, + causal=causal, + conv_type=conv_type, + bias_attr=bias_attr) + + def _scale(self, inputs: paddle.Tensor) -> paddle.Tensor: + if self.causal: + x_space = paddle.mean(inputs, axis=[3, 4], keepdim=True) + scale = self.temporal_cumualtive_GAvg3D(x_space) + scale = paddle.concat((scale, x_space), axis=1) + else: + scale = F.adaptive_avg_pool3d(inputs, 1) + scale = self.fc1(scale) + scale = self.activation_1(scale) + scale = self.fc2(scale) + return self.activation_2(scale) + + def forward(self, inputs: paddle.Tensor) -> paddle.Tensor: + scale = self._scale(inputs) + return scale * inputs + + +class BasicBneck(nn.Layer): + def __init__( + self, + input_channels, + out_channels, + expanded_channels, + kernel_size, + stride, + padding, + padding_avg, + causal: bool, + conv_type: str, + norm_layer: Optional[Callable[..., nn.Layer]] = None, + activation_layer: Optional[Callable[..., nn.Layer]] = None, + ) -> None: + super().__init__() + + assert type(stride) is tuple + + if (not stride[0] == 1 or not (1 <= stride[1] <= 2) + or not (1 <= stride[2] <= 2)): + raise ValueError('illegal stride value') + + self.res = None + + layers = [] + if expanded_channels != out_channels: + # expand + self.expand = ConvBlock3D(in_planes=input_channels, + out_planes=expanded_channels, + kernel_size=(1, 1, 1), + padding=(0, 0, 0), + causal=causal, + conv_type=conv_type, + norm_layer=norm_layer, + activation_layer=activation_layer) + # deepwise + self.deep = ConvBlock3D(in_planes=expanded_channels, + out_planes=expanded_channels, + kernel_size=kernel_size, + padding=padding, + stride=stride, + groups=expanded_channels, + causal=causal, + conv_type=conv_type, + norm_layer=norm_layer, + activation_layer=activation_layer) + + # SE + self.se = SqueezeExcitation( + expanded_channels, + causal=causal, + activation_1=activation_layer, + activation_2=(nn.Sigmoid if conv_type == "3d" else nn.Hardsigmoid), + conv_type=conv_type) + # project + self.project = ConvBlock3D(expanded_channels, + out_channels, + kernel_size=(1, 1, 1), + padding=(0, 0, 0), + causal=causal, + conv_type=conv_type, + norm_layer=norm_layer, + activation_layer=Identity) + + if not (stride == (1, 1, 1) and input_channels == out_channels): + if stride != (1, 1, 1): + layers.append( + nn.AvgPool3D((1, 3, 3), stride=stride, padding=padding_avg)) + layers.append( + ConvBlock3D( + in_planes=input_channels, + out_planes=out_channels, + kernel_size=(1, 1, 1), + padding=(0, 0, 0), + norm_layer=norm_layer, + activation_layer=Identity, + causal=causal, + conv_type=conv_type, + )) + self.res = nn.Sequential(*layers) + self.alpha = self.create_parameter(shape=[1], dtype="float32") + + def forward(self, inputs: paddle.Tensor) -> paddle.Tensor: + if self.res is not None: + residual = self.res(inputs) + else: + residual = inputs + if self.expand is not None: + x = self.expand(inputs) + else: + x = inputs + + x = self.deep(x) + x = self.se(x) + x = self.project(x) + result = residual + self.alpha * x + return result + + +@BACKBONES.register() +class MoViNet(nn.Layer): + def __init__( + self, + model_type: str = 'A0', + hidden_dim: int = 2048, + causal: bool = True, + 
num_classes: int = 400, + conv_type: str = "3d", + ) -> None: + super().__init__() + """ + causal: causal mode + num_classes: number of classes for classifcation + conv_type: type of convolution either 3d or 2plus1d + """ + blocks_dic = OrderedDict() + cfg = MODEL_CONFIG[model_type] + + norm_layer = nn.BatchNorm3D if conv_type == "3d" else nn.BatchNorm2D + activation_layer = nn.Swish if conv_type == "3d" else nn.Hardswish + + # conv1 + self.conv1 = ConvBlock3D(in_planes=cfg['conv1'][0], + out_planes=cfg['conv1'][1], + kernel_size=cfg['conv1'][2], + stride=cfg['conv1'][3], + padding=cfg['conv1'][4], + causal=causal, + conv_type=conv_type, + norm_layer=norm_layer, + activation_layer=activation_layer) + # blocks + for i in range(2, len(cfg['block_num']) + 1): + for j in range(cfg['block_num'][i - 1]): + blocks_dic[f'b{i}_l{j}'] = BasicBneck( + cfg[f'b{i}_l{j}'][0], + cfg[f'b{i}_l{j}'][1], + cfg[f'b{i}_l{j}'][2], + cfg[f'b{i}_l{j}'][3], + cfg[f'b{i}_l{j}'][4], + cfg[f'b{i}_l{j}'][5], + cfg[f'b{i}_l{j}'][6], + causal=causal, + conv_type=conv_type, + norm_layer=norm_layer, + activation_layer=activation_layer) + self.blocks = nn.Sequential(*(blocks_dic.values())) + + # conv7 + self.conv7 = ConvBlock3D(in_planes=cfg['conv7'][0], + out_planes=cfg['conv7'][1], + kernel_size=cfg['conv7'][2], + stride=cfg['conv7'][3], + padding=cfg['conv7'][4], + causal=causal, + conv_type=conv_type, + norm_layer=norm_layer, + activation_layer=activation_layer) + # pool + self.classifier = nn.Sequential( + # dense9 + ConvBlock3D(in_planes=cfg['conv7'][1], + out_planes=hidden_dim, + kernel_size=(1, 1, 1), + causal=causal, + conv_type=conv_type, + bias_attr=True), + nn.Swish(), + nn.Dropout(p=0.2), + # dense10d + ConvBlock3D(in_planes=hidden_dim, + out_planes=num_classes, + kernel_size=(1, 1, 1), + causal=causal, + conv_type=conv_type, + bias_attr=True), + ) + if causal: + self.cgap = TemporalCGAvgPool3D() + self.apply(self._weight_init) + self.causal = causal + + def avg(self, x: paddle.Tensor) -> paddle.Tensor: + if self.causal: + avg = F.adaptive_avg_pool3d(x, (x.shape[2], 1, 1)) + avg = self.cgap(avg)[:, :, -1:] + else: + avg = F.adaptive_avg_pool3d(x, 1) + return avg + + @staticmethod + def _weight_init(m): + if isinstance(m, nn.Conv3D): + nn.initializer.KaimingNormal(m.weight) + if m.bias is not None: + nn.initializer.Constant(0.0)(m.bias) + elif isinstance(m, (nn.BatchNorm3D, nn.BatchNorm2D, nn.GroupNorm)): + nn.initializer.Constant(1.0)(m.weight) + nn.initializer.Constant(0.0)(m.bias) + elif isinstance(m, nn.Linear): + nn.initializer.Normal(m.weight, 0, 0.01) + nn.initializer.Constant(0.0)(m.bias) + + def forward(self, x: paddle.Tensor) -> paddle.Tensor: + x = self.conv1(x) + x = self.blocks(x) + x = self.conv7(x) + x = self.avg(x) + x = self.classifier(x) + x = x.flatten(1) + return x + + @staticmethod + def _clean_activation_buffers(m): + if issubclass(type(m), CausalModule): + m.reset_activation() + + def clean_activation_buffers(self) -> None: + self.apply(self._clean_activation_buffers) + + +if __name__ == '__main__': + net = MoViNet(causal=False, conv_type='3d') + paddle.summary(net, input_size=(1, 3, 8, 224, 224)) diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/ms_tcn.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/ms_tcn.py new file mode 100644 index 0000000..fb49b9c --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/ms_tcn.py @@ -0,0 +1,154 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import numpy as np +import copy +import random +import math + +from paddle import ParamAttr +from ..registry import BACKBONES +from ..weight_init import weight_init_ + + +def _calculate_fan_in_and_fan_out(tensor): + dimensions = len(tensor.shape) + if dimensions < 2: + raise ValueError("Fan in and fan out can not be computed \ + for tensor with fewer than 2 dimensions") + + if dimensions == 2: # Linear + fan_in = tensor.shape[1] + fan_out = tensor.shape[0] + else: + num_input_fmaps = tensor.shape[1] + num_output_fmaps = tensor.shape[0] + receptive_field_size = 1 + if tensor.dim() > 2: + receptive_field_size = tensor[0][0].numel() + fan_in = num_input_fmaps * receptive_field_size + fan_out = num_output_fmaps * receptive_field_size + + return fan_in, fan_out + + +def calculate_gain(nonlinearity=None, a=None): + if nonlinearity == 'tanh': + return 5.0 / 3 + elif nonlinearity == 'relu': + return math.sqrt(2.0) + elif nonlinearity == 'leaky_relu': + if a != None: + return math.sqrt(2.0 / (1 + a**2)) + else: + return math.sqrt(2.0 / (1 + 0.01**2)) + elif nonlinearity == 'selu': + return 3.0 / 4 + else: + return 1 + + +def KaimingUniform_like_torch(weight_npy, + mode='fan_in', + nonlinearity='leaky_relu'): + fan_in, fan_out = _calculate_fan_in_and_fan_out(weight_npy) + if mode == 'fan_in': + fan_mode = fan_in + else: + fan_mode = fan_out + a = math.sqrt(5.0) + gain = calculate_gain(nonlinearity=nonlinearity, a=a) + std = gain / math.sqrt(fan_mode) + bound = math.sqrt(3.0) * std + return np.random.uniform(-bound, bound, weight_npy.shape) + + +def init_bias(weight_npy, bias_npy): + # attention this weight is not bias + fan_in, fan_out = _calculate_fan_in_and_fan_out(weight_npy) + bound = 1.0 / math.sqrt(fan_in) + return np.random.uniform(-bound, bound, bias_npy.shape) + + +class SingleStageModel(nn.Layer): + + def __init__(self, num_layers, num_f_maps, dim, num_classes): + super(SingleStageModel, self).__init__() + self.conv_in = nn.Conv1D(dim, num_f_maps, 1) + self.layers = nn.LayerList([ + copy.deepcopy(DilatedResidualLayer(2**i, num_f_maps, num_f_maps)) + for i in range(num_layers) + ]) + self.conv_out = nn.Conv1D(num_f_maps, num_classes, 1) + + def forward(self, x): + out = self.conv_in(x) + for layer in self.layers: + out = layer(out) + out = self.conv_out(out) + return out + + +class DilatedResidualLayer(nn.Layer): + + def __init__(self, dilation, in_channels, out_channels): + super(DilatedResidualLayer, self).__init__() + self.conv_dilated = nn.Conv1D(in_channels, + out_channels, + 3, + padding=dilation, + dilation=dilation) + self.conv_in = nn.Conv1D(out_channels, out_channels, 1) + self.dropout = nn.Dropout() + + def forward(self, x): + out = F.relu(self.conv_dilated(x)) + out = self.conv_in(out) + out = self.dropout(out) + return (x + out) + + +@BACKBONES.register() +class MSTCN(nn.Layer): + + def __init__(self, num_stages, num_layers, num_f_maps, dim, num_classes): + 
super().__init__() + self.stage1 = SingleStageModel(num_layers, num_f_maps, dim, num_classes) + self.stages = nn.LayerList([ + copy.deepcopy( + SingleStageModel(num_layers, num_f_maps, num_classes, + num_classes)) for s in range(num_stages - 1) + ]) + + def forward(self, x): + """ MSTCN forward + """ + out = self.stage1(x) + outputs = out.unsqueeze(0) + for s in self.stages: + out = s(F.softmax(out, axis=1)) + outputs = paddle.concat((outputs, out.unsqueeze(0)), axis=0) + return outputs + + def init_weights(self): + for layer in self.sublayers(): + if isinstance(layer, nn.Conv1D): + layer.weight.set_value( + KaimingUniform_like_torch(layer.weight).astype('float32')) + if layer.bias is not None: + layer.bias.set_value( + init_bias(layer.weight, layer.bias).astype('float32')) diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/pptsm_mv2.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/pptsm_mv2.py new file mode 100644 index 0000000..28d045d --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/pptsm_mv2.py @@ -0,0 +1,282 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D + +from ..registry import BACKBONES +from ..weight_init import weight_init_ +from ...utils import load_ckpt + +# Download URL of pretrained model +# { +# "MobileNetV2": +# "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV2_ssld_pretrained.pdparams", + +# "MobileNetV2_x0_25": +# "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV2_x0_25_pretrained.pdparams", +# "MobileNetV2_x0_5": +# "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV2_x0_5_pretrained.pdparams", +# "MobileNetV2_x0_75": +# "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV2_x0_75_pretrained.pdparams", +# "MobileNetV2_x1_5": +# "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV2_x1_5_pretrained.pdparams", +# "MobileNetV2_x2_0": +# "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV2_x2_0_pretrained.pdparams" +# } + + +class ConvBNLayer(nn.Layer): + def __init__(self, + num_channels, + filter_size, + num_filters, + stride, + padding, + channels=None, + num_groups=1, + name=None, + use_cudnn=True): + super(ConvBNLayer, self).__init__() + + self._conv = Conv2D(in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=padding, + groups=num_groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + + self._batch_norm = BatchNorm( + num_filters, + param_attr=ParamAttr(name=name + "_bn_scale"), + bias_attr=ParamAttr(name=name + "_bn_offset"), + moving_mean_name=name + "_bn_mean", + moving_variance_name=name + 
"_bn_variance") + + def forward(self, inputs, if_act=True): + y = self._conv(inputs) + y = self._batch_norm(y) + if if_act: + y = F.relu6(y) + return y + + +class InvertedResidualUnit(nn.Layer): + def __init__(self, num_channels, num_in_filter, num_filters, stride, + filter_size, padding, expansion_factor, name, num_seg): + super(InvertedResidualUnit, self).__init__() + self.num_seg = num_seg + num_expfilter = int(round(num_in_filter * expansion_factor)) + self._expand_conv = ConvBNLayer(num_channels=num_channels, + num_filters=num_expfilter, + filter_size=1, + stride=1, + padding=0, + num_groups=1, + name=name + "_expand") + + self._bottleneck_conv = ConvBNLayer(num_channels=num_expfilter, + num_filters=num_expfilter, + filter_size=filter_size, + stride=stride, + padding=padding, + num_groups=num_expfilter, + use_cudnn=False, + name=name + "_dwise") + + self._linear_conv = ConvBNLayer(num_channels=num_expfilter, + num_filters=num_filters, + filter_size=1, + stride=1, + padding=0, + num_groups=1, + name=name + "_linear") + + def forward(self, inputs, ifshortcut): + # add temporal shift module + y = inputs + if ifshortcut: + y = F.temporal_shift(y, self.num_seg, 1.0 / self.num_seg) + + y = self._expand_conv(y, if_act=True) + y = self._bottleneck_conv(y, if_act=True) + y = self._linear_conv(y, if_act=False) + if ifshortcut: + y = paddle.add(inputs, y) + return y + + +class InvresiBlocks(nn.Layer): + def __init__(self, in_c, t, c, n, s, name, num_seg): + super(InvresiBlocks, self).__init__() + + self._first_block = InvertedResidualUnit(num_channels=in_c, + num_in_filter=in_c, + num_filters=c, + stride=s, + filter_size=3, + padding=1, + expansion_factor=t, + name=name + "_1", + num_seg=num_seg) + + self._block_list = [] + for i in range(1, n): + block = self.add_sublayer(name + "_" + str(i + 1), + sublayer=InvertedResidualUnit( + num_channels=c, + num_in_filter=c, + num_filters=c, + stride=1, + filter_size=3, + padding=1, + expansion_factor=t, + name=name + "_" + str(i + 1), + num_seg=num_seg)) + self._block_list.append(block) + + def forward(self, inputs): + y = self._first_block(inputs, ifshortcut=False) + for block in self._block_list: + y = block(y, ifshortcut=True) + return y + + +class MobileNet(nn.Layer): + def __init__(self, + class_num=400, + scale=1.0, + pretrained=None, + prefix_name="", + num_seg=8): + super(MobileNet, self).__init__() + self.scale = scale + self.class_num = class_num + self.pretrained = pretrained + self.num_seg = num_seg + + bottleneck_params_list = [ + (1, 16, 1, 1), + (6, 24, 2, 2), + (6, 32, 3, 2), + (6, 64, 4, 2), + (6, 96, 3, 1), + (6, 160, 3, 2), + (6, 320, 1, 1), + ] + + self.conv1 = ConvBNLayer(num_channels=3, + num_filters=int(32 * scale), + filter_size=3, + stride=2, + padding=1, + name=prefix_name + "conv1_1") + + self.block_list = [] + i = 1 + in_c = int(32 * scale) + for layer_setting in bottleneck_params_list: + t, c, n, s = layer_setting + i += 1 + block = self.add_sublayer(prefix_name + "conv" + str(i), + sublayer=InvresiBlocks(in_c=in_c, + t=t, + c=int(c * scale), + n=n, + s=s, + name=prefix_name + + "conv" + str(i), + num_seg=num_seg)) + self.block_list.append(block) + in_c = int(c * scale) + + self.out_c = int(1280 * scale) if scale > 1.0 else 1280 + self.conv9 = ConvBNLayer(num_channels=in_c, + num_filters=self.out_c, + filter_size=1, + stride=1, + padding=0, + name=prefix_name + "conv9") + + self.pool2d_avg = AdaptiveAvgPool2D(1) + + self.out = Linear(self.out_c, + class_num, + weight_attr=ParamAttr(name=prefix_name + + "fc10_weights"), + 
bias_attr=ParamAttr(name=prefix_name + "fc10_offset")) + + def init_weights(self): + """Initiate the parameters. + """ + if isinstance(self.pretrained, str) and self.pretrained.strip() != "": + load_ckpt(self, self.pretrained) + elif self.pretrained is None or self.pretrained.strip() == "": + for layer in self.sublayers(): + if isinstance(layer, nn.Conv2D): + weight_init_(layer, 'KaimingNormal') + elif isinstance(layer, nn.BatchNorm2D): + weight_init_(layer, 'Constant', value=1) + + def forward(self, inputs): + y = self.conv1(inputs, if_act=True) + for block in self.block_list: + y = block(y) + y = self.conv9(y, if_act=True) + y = self.pool2d_avg(y) + + y = paddle.reshape(y, [-1, self.num_seg, y.shape[1]]) + y = paddle.mean(y, axis=1) + y = paddle.reshape(y, shape=[-1, self.out_c]) + + y = self.out(y) + return y + + +@BACKBONES.register() +def PPTSM_MobileNetV2(pretrained=None, **kwargs): + model = MobileNet(pretrained=pretrained, scale=1.0, **kwargs) + return model + + +def PPTSM_MobileNetV2_x0_25(pretrained=None, **kwargs): + model = MobileNet(pretrained=pretrained, scale=0.25, **kwargs) + return model + + +def PPTSM_MobileNetV2_x0_5(pretrained=None, **kwargs): + model = MobileNet(pretrained=pretrained, scale=0.5, **kwargs) + return model + + +def PPTSM_MobileNetV2_x0_75(pretrained=None, **kwargs): + model = MobileNet(pretrained=pretrained, scale=0.75, **kwargs) + return model + + +def PPTSM_MobileNetV2_x1_5(pretrained=None, **kwargs): + model = MobileNet(pretrained=pretrained, scale=1.5, **kwargs) + return model + + +def PPTSM_MobileNetV2_x2_0(pretrained=None, **kwargs): + model = MobileNet(pretrained=pretrained, scale=2.0, **kwargs) + return model diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/pptsm_mv3.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/pptsm_mv3.py new file mode 100644 index 0000000..cd10bac --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/pptsm_mv3.py @@ -0,0 +1,408 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# reference: https://arxiv.org/abs/1905.02244 + +from __future__ import absolute_import, division, print_function + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.nn import AdaptiveAvgPool2D, BatchNorm, Conv2D, Dropout, Linear +from paddle.regularizer import L2Decay + +from ..registry import BACKBONES +from ..weight_init import weight_init_ +from ...utils import load_ckpt + +# Download URL of pretrained model +# MODEL_URLS = { +# "MobileNetV3_small_x1_0": +# "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV3_small_x1_0_ssld_pretrained.pdparams", +# "MobileNetV3_large_x1_0": +# "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV3_large_x1_0_ssld_pretrained.pdparams", +# } + +MODEL_STAGES_PATTERN = { + "MobileNetV3_small": ["blocks[0]", "blocks[2]", "blocks[7]", "blocks[10]"], + "MobileNetV3_large": + ["blocks[0]", "blocks[2]", "blocks[5]", "blocks[11]", "blocks[14]"] +} + +# "large", "small" is just for MobinetV3_large, MobileNetV3_small respectively. +# The type of "large" or "small" config is a list. Each element(list) represents a depthwise block, which is composed of k, exp, se, act, s. +# k: kernel_size +# exp: middle channel number in depthwise block +# c: output channel number in depthwise block +# se: whether to use SE block +# act: which activation to use +# s: stride in depthwise block +NET_CONFIG = { + "large": [ + # k, exp, c, se, act, s + [3, 16, 16, False, "relu", 1], + [3, 64, 24, False, "relu", 2], + [3, 72, 24, False, "relu", 1], + [5, 72, 40, True, "relu", 2], + [5, 120, 40, True, "relu", 1], + [5, 120, 40, True, "relu", 1], + [3, 240, 80, False, "hardswish", 2], + [3, 200, 80, False, "hardswish", 1], + [3, 184, 80, False, "hardswish", 1], + [3, 184, 80, False, "hardswish", 1], + [3, 480, 112, True, "hardswish", 1], + [3, 672, 112, True, "hardswish", 1], + [5, 672, 160, True, "hardswish", 2], + [5, 960, 160, True, "hardswish", 1], + [5, 960, 160, True, "hardswish", 1], + ], + "small": [ + # k, exp, c, se, act, s + [3, 16, 16, True, "relu", 2], + [3, 72, 24, False, "relu", 2], + [3, 88, 24, False, "relu", 1], + [5, 96, 40, True, "hardswish", 2], + [5, 240, 40, True, "hardswish", 1], + [5, 240, 40, True, "hardswish", 1], + [5, 120, 48, True, "hardswish", 1], + [5, 144, 48, True, "hardswish", 1], + [5, 288, 96, True, "hardswish", 2], + [5, 576, 96, True, "hardswish", 1], + [5, 576, 96, True, "hardswish", 1], + ] +} +# first conv output channel number in MobileNetV3 +STEM_CONV_NUMBER = 16 +# last second conv output channel for "small" +LAST_SECOND_CONV_SMALL = 576 +# last second conv output channel for "large" +LAST_SECOND_CONV_LARGE = 960 +# last conv output channel number for "large" and "small" +LAST_CONV = 1280 + + +def _make_divisible(v, divisor=8, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +def _create_act(act): + if act == "hardswish": + return nn.Hardswish() + elif act == "relu": + return nn.ReLU() + elif act is None: + return None + else: + raise RuntimeError( + "The activation function is not supported: {}".format(act)) + + +class MobileNetV3(nn.Layer): + """ + MobileNetV3 + Args: + config: list. MobileNetV3 depthwise blocks config. + scale: float=1.0. The coefficient that controls the size of network parameters. + class_num: int=1000. The number of classes. + inplanes: int=16. 
The output channel number of first convolution layer. + class_squeeze: int=960. The output channel number of penultimate convolution layer. + class_expand: int=1280. The output channel number of last convolution layer. + dropout_prob: float=0.2. Probability of setting units to zero. + Returns: + model: nn.Layer. Specific MobileNetV3 model depends on args. + """ + def __init__(self, + config, + stages_pattern, + scale=1.0, + class_num=400, + inplanes=STEM_CONV_NUMBER, + class_squeeze=LAST_SECOND_CONV_LARGE, + class_expand=LAST_CONV, + dropout_prob=0.2, + num_seg=8, + pretrained=None, + return_patterns=None, + return_stages=None): + super().__init__() + + self.cfg = config + self.scale = scale + self.inplanes = inplanes + self.class_squeeze = class_squeeze + self.class_expand = class_expand + self.class_num = class_num + self.num_seg = num_seg + self.pretrained = pretrained + + self.conv = ConvBNLayer(in_c=3, + out_c=_make_divisible(self.inplanes * + self.scale), + filter_size=3, + stride=2, + padding=1, + num_groups=1, + if_act=True, + act="hardswish") + + self.blocks = nn.Sequential(*[ + ResidualUnit(in_c=_make_divisible(self.inplanes * self.scale if i == + 0 else self.cfg[i - 1][2] * + self.scale), + mid_c=_make_divisible(self.scale * exp), + out_c=_make_divisible(self.scale * c), + filter_size=k, + stride=s, + use_se=se, + num_seg=self.num_seg, + act=act) + for i, (k, exp, c, se, act, s) in enumerate(self.cfg) + ]) + + self.last_second_conv = ConvBNLayer( + in_c=_make_divisible(self.cfg[-1][2] * self.scale), + out_c=_make_divisible(self.scale * self.class_squeeze), + filter_size=1, + stride=1, + padding=0, + num_groups=1, + if_act=True, + act="hardswish") + + self.avg_pool = AdaptiveAvgPool2D(1) + + self.last_conv = Conv2D(in_channels=_make_divisible(self.scale * + self.class_squeeze), + out_channels=self.class_expand, + kernel_size=1, + stride=1, + padding=0, + bias_attr=False) + + self.hardswish = nn.Hardswish() + if dropout_prob is not None: + self.dropout = Dropout(p=dropout_prob, mode="downscale_in_infer") + else: + self.dropout = None + + self.fc = Linear(self.class_expand, class_num) + + def init_weights(self): + """Initiate the parameters. 
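+
+        When `self.pretrained` is a non-empty path, the checkpoint is loaded via
+        `load_ckpt`; otherwise Conv2D layers are initialized with KaimingNormal
+        and BatchNorm2D layers with Constant(value=1).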
+ """ + if isinstance(self.pretrained, str) and self.pretrained.strip() != "": + load_ckpt(self, self.pretrained) + elif self.pretrained is None or self.pretrained.strip() == "": + for layer in self.sublayers(): + if isinstance(layer, nn.Conv2D): + #XXX: no bias + weight_init_(layer, 'KaimingNormal') + elif isinstance(layer, nn.BatchNorm2D): + weight_init_(layer, 'Constant', value=1) + + def forward(self, x): + x = self.conv(x) + x = self.blocks(x) + x = self.last_second_conv(x) + x = self.avg_pool(x) + x = self.last_conv(x) + x = self.hardswish(x) + if self.dropout is not None: + x = self.dropout(x) + + # feature aggregation for video + x = paddle.reshape(x, [-1, self.num_seg, x.shape[1]]) + x = paddle.mean(x, axis=1) + x = paddle.reshape(x, shape=[-1, self.class_expand]) + + x = self.fc(x) + + return x + + +class ConvBNLayer(nn.Layer): + def __init__(self, + in_c, + out_c, + filter_size, + stride, + padding, + num_groups=1, + if_act=True, + act=None): + super().__init__() + + self.conv = Conv2D(in_channels=in_c, + out_channels=out_c, + kernel_size=filter_size, + stride=stride, + padding=padding, + groups=num_groups, + bias_attr=False) + self.bn = BatchNorm(num_channels=out_c, + act=None, + param_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + self.if_act = if_act + self.act = _create_act(act) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + if self.if_act: + x = self.act(x) + return x + + +class ResidualUnit(nn.Layer): + def __init__(self, + in_c, + mid_c, + out_c, + filter_size, + stride, + use_se, + num_seg=8, + act=None): + super().__init__() + self.if_shortcut = stride == 1 and in_c == out_c + self.if_se = use_se + self.num_seg = num_seg + + self.expand_conv = ConvBNLayer(in_c=in_c, + out_c=mid_c, + filter_size=1, + stride=1, + padding=0, + if_act=True, + act=act) + self.bottleneck_conv = ConvBNLayer(in_c=mid_c, + out_c=mid_c, + filter_size=filter_size, + stride=stride, + padding=int((filter_size - 1) // 2), + num_groups=mid_c, + if_act=True, + act=act) + if self.if_se: + self.mid_se = SEModule(mid_c) + self.linear_conv = ConvBNLayer(in_c=mid_c, + out_c=out_c, + filter_size=1, + stride=1, + padding=0, + if_act=False, + act=None) + + def forward(self, x): + identity = x + + if self.if_shortcut: + x = F.temporal_shift(x, self.num_seg, 1.0 / self.num_seg) + + x = self.expand_conv(x) + x = self.bottleneck_conv(x) + if self.if_se: + x = self.mid_se(x) + x = self.linear_conv(x) + if self.if_shortcut: + x = paddle.add(identity, x) + return x + + +# nn.Hardsigmoid can't transfer "slope" and "offset" in nn.functional.hardsigmoid +class Hardsigmoid(nn.Layer): + def __init__(self, slope=0.2, offset=0.5): + super().__init__() + self.slope = slope + self.offset = offset + + def forward(self, x): + return nn.functional.hardsigmoid(x, + slope=self.slope, + offset=self.offset) + + +class SEModule(nn.Layer): + def __init__(self, channel, reduction=4): + super().__init__() + self.avg_pool = AdaptiveAvgPool2D(1) + self.conv1 = Conv2D(in_channels=channel, + out_channels=channel // reduction, + kernel_size=1, + stride=1, + padding=0) + self.relu = nn.ReLU() + self.conv2 = Conv2D(in_channels=channel // reduction, + out_channels=channel, + kernel_size=1, + stride=1, + padding=0) + self.hardsigmoid = Hardsigmoid(slope=0.2, offset=0.5) + + def forward(self, x): + identity = x + x = self.avg_pool(x) + x = self.conv1(x) + x = self.relu(x) + x = self.conv2(x) + x = self.hardsigmoid(x) + return paddle.multiply(x=identity, y=x) + + +def 
PPTSM_MobileNetV3_small_x1_0(pretrained=None, **kwargs): + """ + MobileNetV3_small_x1_0 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `MobileNetV3_small_x1_0` model depends on args. + """ + model = MobileNetV3( + config=NET_CONFIG["small"], + scale=1.0, + stages_pattern=MODEL_STAGES_PATTERN["MobileNetV3_small"], + class_squeeze=LAST_SECOND_CONV_SMALL, + pretrained=pretrained, + **kwargs) + return model + + +@BACKBONES.register() +def PPTSM_MobileNetV3(pretrained=None, **kwargs): + """ + MobileNetV3_large_x1_0 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `MobileNetV3_large_x1_0` model depends on args. + """ + model = MobileNetV3( + config=NET_CONFIG["large"], + scale=1.0, + stages_pattern=MODEL_STAGES_PATTERN["MobileNetV3_large"], + class_squeeze=LAST_SECOND_CONV_LARGE, + pretrained=pretrained, + **kwargs) + return model diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/pptsm_v2.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/pptsm_v2.py new file mode 100644 index 0000000..07dc5bf --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/pptsm_v2.py @@ -0,0 +1,405 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import, division, print_function + +import paddle +import paddle.nn as nn +from paddle import ParamAttr +from paddle.nn import AdaptiveAvgPool2D, BatchNorm, Conv2D, Dropout, Linear, BatchNorm2D +from paddle.regularizer import L2Decay +from paddle.nn.initializer import KaimingNormal +import paddle.nn.functional as F + +from ..registry import BACKBONES +from ..weight_init import weight_init_ +from ...utils import load_ckpt + +# MODEL_URLS = { +# "PPLCNetV2": +# "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNetV2_base_ssld_pretrained.pdparams", +# } + +MODEL_STAGES_PATTERN = { + "PPLCNet": ["blocks2", "blocks3", "blocks4", "blocks5", "blocks6"] +} + +NET_CONFIG = { + # in_channels, kernel_size, split_pw, use_rep, use_se, use_shortcut + "stage1": [64, 3, False, False, False, False], + "stage2": [128, 3, False, False, False, False], + "stage3": [256, 5, True, True, True, False], + "stage4": [512, 5, False, True, False, True], +} + + +def make_divisible(v, divisor=8, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +class GlobalAttention(nn.Layer): + """ + Lightweight temporal attention module. 
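+
+    Each segment's feature map is average-pooled to a scalar, the resulting
+    [N, num_seg] vector is passed through a fully connected layer over the
+    temporal axis, and the segments are re-weighted by the sigmoid of that
+    output (see `forward`).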
+ """ + + def __init__(self, num_seg=8): + super().__init__() + self.fc = nn.Linear(in_features=num_seg, + out_features=num_seg, + weight_attr=ParamAttr(learning_rate=5.0, + regularizer=L2Decay(1e-4)), + bias_attr=ParamAttr(learning_rate=10.0, + regularizer=L2Decay(0.0))) + self.num_seg = num_seg + + def forward(self, x): + _, C, H, W = x.shape + x0 = x + + x = x.reshape([-1, self.num_seg, C * H * W]) + x = paddle.mean(x, axis=2) # efficient way of avg_pool + x = x.squeeze(axis=-1) + x = self.fc(x) + attention = F.sigmoid(x) + attention = attention.reshape( + (-1, self.num_seg, 1, 1, 1)) #for broadcast + + x0 = x0.reshape([-1, self.num_seg, C, H, W]) + y = paddle.multiply(x0, attention) + y = y.reshape_([-1, C, H, W]) + return y + + +class ConvBNLayer(nn.Layer): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + groups=1, + use_act=True): + super().__init__() + self.use_act = use_act + self.conv = Conv2D(in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(initializer=KaimingNormal()), + bias_attr=False) + + self.bn = BatchNorm2D(out_channels, + weight_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + if self.use_act: + self.act = nn.ReLU() + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + if self.use_act: + x = self.act(x) + return x + + +class SEModule(nn.Layer): + + def __init__(self, channel, reduction=4): + super().__init__() + self.avg_pool = AdaptiveAvgPool2D(1) + self.conv1 = Conv2D(in_channels=channel, + out_channels=channel // reduction, + kernel_size=1, + stride=1, + padding=0) + self.relu = nn.ReLU() + self.conv2 = Conv2D(in_channels=channel // reduction, + out_channels=channel, + kernel_size=1, + stride=1, + padding=0) + self.hardsigmoid = nn.Sigmoid() + + def forward(self, x): + identity = x + x = self.avg_pool(x) + x = self.conv1(x) + x = self.relu(x) + x = self.conv2(x) + x = self.hardsigmoid(x) + x = paddle.multiply(x=identity, y=x) + return x + + +class RepDepthwiseSeparable(nn.Layer): + + def __init__(self, + in_channels, + out_channels, + stride, + dw_size=3, + split_pw=False, + use_rep=False, + use_se=False, + use_shortcut=False): + super().__init__() + self.is_repped = False + + self.dw_size = dw_size + self.split_pw = split_pw + self.use_rep = use_rep + self.use_se = use_se + self.use_shortcut = True if use_shortcut and stride == 1 and in_channels == out_channels else False + + if self.use_rep: + self.dw_conv_list = nn.LayerList() + for kernel_size in range(self.dw_size, 0, -2): + if kernel_size == 1 and stride != 1: + continue + dw_conv = ConvBNLayer(in_channels=in_channels, + out_channels=in_channels, + kernel_size=kernel_size, + stride=stride, + groups=in_channels, + use_act=False) + self.dw_conv_list.append(dw_conv) + self.dw_conv = nn.Conv2D(in_channels=in_channels, + out_channels=in_channels, + kernel_size=dw_size, + stride=stride, + padding=(dw_size - 1) // 2, + groups=in_channels) + else: + self.dw_conv = ConvBNLayer(in_channels=in_channels, + out_channels=in_channels, + kernel_size=dw_size, + stride=stride, + groups=in_channels) + + self.act = nn.ReLU() + + if use_se: + self.se = SEModule(in_channels) + + if self.split_pw: + pw_ratio = 0.5 + self.pw_conv_1 = ConvBNLayer(in_channels=in_channels, + kernel_size=1, + out_channels=int(out_channels * + pw_ratio), + stride=1) + self.pw_conv_2 = ConvBNLayer(in_channels=int(out_channels * + pw_ratio), + 
kernel_size=1, + out_channels=out_channels, + stride=1) + else: + self.pw_conv = ConvBNLayer(in_channels=in_channels, + kernel_size=1, + out_channels=out_channels, + stride=1) + + def forward(self, x): + if self.use_rep: + input_x = x + if self.is_repped: + x = self.act(self.dw_conv(x)) + else: + y = self.dw_conv_list[0](x) + for dw_conv in self.dw_conv_list[1:]: + y += dw_conv(x) + x = self.act(y) + else: + x = self.dw_conv(x) + + if self.use_se: + x = self.se(x) + if self.split_pw: + x = self.pw_conv_1(x) + x = self.pw_conv_2(x) + else: + x = self.pw_conv(x) + if self.use_shortcut: + x = x + input_x + return x + + def rep(self): + if self.use_rep: + self.is_repped = True + kernel, bias = self._get_equivalent_kernel_bias() + self.dw_conv.weight.set_value(kernel) + self.dw_conv.bias.set_value(bias) + + def _get_equivalent_kernel_bias(self): + kernel_sum = 0 + bias_sum = 0 + for dw_conv in self.dw_conv_list: + kernel, bias = self._fuse_bn_tensor(dw_conv) + kernel = self._pad_tensor(kernel, to_size=self.dw_size) + kernel_sum += kernel + bias_sum += bias + return kernel_sum, bias_sum + + def _fuse_bn_tensor(self, branch): + kernel = branch.conv.weight + running_mean = branch.bn._mean + running_var = branch.bn._variance + gamma = branch.bn.weight + beta = branch.bn.bias + eps = branch.bn._epsilon + std = (running_var + eps).sqrt() + t = (gamma / std).reshape((-1, 1, 1, 1)) + return kernel * t, beta - running_mean * gamma / std + + def _pad_tensor(self, tensor, to_size): + from_size = tensor.shape[-1] + if from_size == to_size: + return tensor + pad = (to_size - from_size) // 2 + return F.pad(tensor, [pad, pad, pad, pad]) + + +class PPTSM_v2_LCNet(nn.Layer): + + def __init__(self, + scale, + depths, + class_num=400, + dropout_prob=0, + num_seg=8, + use_temporal_att=False, + pretrained=None, + use_last_conv=True, + class_expand=1280): + super().__init__() + self.scale = scale + self.use_last_conv = use_last_conv + self.class_expand = class_expand + self.num_seg = num_seg + self.use_temporal_att = use_temporal_att + self.pretrained = pretrained + + self.stem = nn.Sequential(*[ + ConvBNLayer(in_channels=3, + kernel_size=3, + out_channels=make_divisible(32 * scale), + stride=2), + RepDepthwiseSeparable(in_channels=make_divisible(32 * scale), + out_channels=make_divisible(64 * scale), + stride=1, + dw_size=3) + ]) + + # stages + self.stages = nn.LayerList() + for depth_idx, k in enumerate(NET_CONFIG): + in_channels, kernel_size, split_pw, use_rep, use_se, use_shortcut = NET_CONFIG[ + k] + self.stages.append( + nn.Sequential(*[ + RepDepthwiseSeparable(in_channels=make_divisible( + (in_channels if i == 0 else in_channels * 2) * scale), + out_channels=make_divisible( + in_channels * 2 * scale), + stride=2 if i == 0 else 1, + dw_size=kernel_size, + split_pw=split_pw, + use_rep=use_rep, + use_se=use_se, + use_shortcut=use_shortcut) + for i in range(depths[depth_idx]) + ])) + + self.avg_pool = AdaptiveAvgPool2D(1) + + if self.use_last_conv: + self.last_conv = Conv2D(in_channels=make_divisible( + NET_CONFIG["stage4"][0] * 2 * scale), + out_channels=self.class_expand, + kernel_size=1, + stride=1, + padding=0, + bias_attr=False) + self.act = nn.ReLU() + self.dropout = Dropout(p=dropout_prob, mode="downscale_in_infer") + + self.flatten = nn.Flatten(start_axis=1, stop_axis=-1) + in_features = self.class_expand if self.use_last_conv else NET_CONFIG[ + "stage4"][0] * 2 * scale + self.fc = Linear(in_features, class_num) + if self.use_temporal_att: + self.global_attention = GlobalAttention(num_seg=self.num_seg) + 
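+        # GlobalAttention (when enabled) and the temporal shift are applied
+        # only in front of the third stage; see `forward` below.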
+ def init_weights(self): + """Initiate the parameters. + """ + if isinstance(self.pretrained, str) and self.pretrained.strip() != "": + load_ckpt(self, self.pretrained) + elif self.pretrained is None or self.pretrained.strip() == "": + for layer in self.sublayers(): + if isinstance(layer, nn.Conv2D): + weight_init_(layer, 'KaimingNormal') + elif isinstance(layer, nn.BatchNorm2D): + weight_init_(layer, 'Constant', value=1) + + def forward(self, x): + x = self.stem(x) + count = 0 + for stage in self.stages: + # only add temporal attention and tsm in stage3 for efficiency + if count == 2: + # add temporal attention + if self.use_temporal_att: + x = self.global_attention(x) + x = F.temporal_shift(x, self.num_seg, 1.0 / self.num_seg) + count += 1 + x = stage(x) + + x = self.avg_pool(x) + if self.use_last_conv: + x = self.last_conv(x) + x = self.act(x) + x = self.dropout(x) + + # Feature aggregation + x = paddle.reshape(x, [-1, self.num_seg, x.shape[1]]) + x = paddle.mean(x, axis=1) + x = paddle.reshape(x, shape=[-1, self.class_expand]) + + x = self.fc(x) + return x + + +@BACKBONES.register() +def PPTSM_v2(pretrained=None, use_ssld=False, **kwargs): + """ + PP-TSM_v2 model. + Args: + pretrained: str, means the path of the pretrained model. + Returns: + model: nn.Layer. + """ + model = PPTSM_v2_LCNet(pretrained=pretrained, + scale=1.0, + depths=[2, 2, 6, 2], + dropout_prob=0.2, + **kwargs) + return model diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnet.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnet.py new file mode 100644 index 0000000..2f07991 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnet.py @@ -0,0 +1,283 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import math + +import paddle +import paddle.nn as nn +from paddle.nn import (Conv2D, BatchNorm2D, Linear, Dropout, MaxPool2D, + AvgPool2D) +from paddle import ParamAttr +import paddle.nn.functional as F + +from ..registry import BACKBONES +from ..weight_init import weight_init_ +from ...utils import load_ckpt + + +class ConvBNLayer(nn.Layer): + """Conv2D and BatchNorm2D layer. + + Args: + in_channels (int): Number of channels for the input. + out_channels (int): Number of channels for the output. + kernel_size (int): Kernel size. + stride (int): Stride in the Conv2D layer. Default: 1. + groups (int): Groups in the Conv2D, Default: 1. + act (str): Indicate activation after BatchNorm2D layer. + name (str): the name of an instance of ConvBNLayer. + + Note: weight and bias initialization include initialize values and name the restored parameters, values initialization are explicit declared in the ```init_weights``` method. 
+ + """ + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + groups=1, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + self._conv = Conv2D(in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + + self._act = act + + self._batch_norm = BatchNorm2D(out_channels, + weight_attr=ParamAttr(name=bn_name + + "_scale"), + bias_attr=ParamAttr(bn_name + "_offset")) + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + if self._act: + y = getattr(paddle.nn.functional, self._act)(y) + return y + + +class BottleneckBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + name=None): + super(BottleneckBlock, self).__init__() + self.conv0 = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + act="relu", + name=name + "_branch2a") + self.conv1 = ConvBNLayer(in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + act="relu", + name=name + "_branch2b") + + self.conv2 = ConvBNLayer(in_channels=out_channels, + out_channels=out_channels * 4, + kernel_size=1, + act=None, + name=name + "_branch2c") + + if not shortcut: + self.short = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels * 4, + kernel_size=1, + stride=stride, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=conv2) + return F.relu(y) + + +class BasicBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + name=None): + super(BasicBlock, self).__init__() + self.stride = stride + self.conv0 = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels, + filter_size=3, + stride=stride, + act="relu", + name=name + "_branch2a") + self.conv1 = ConvBNLayer(in_channels=out_channels, + out_channels=out_channels, + filter_size=3, + act=None, + name=name + "_branch2b") + + if not shortcut: + self.short = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels, + filter_size=1, + stride=stride, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(short, conv1) + y = F.relu(y) + return y + + +@BACKBONES.register() +class ResNet(nn.Layer): + """ResNet backbone. + + Args: + depth (int): Depth of resnet model. + pretrained (str): pretrained model. Default: None. 
+ """ + def __init__(self, depth, pretrained=None): + super(ResNet, self).__init__() + self.pretrained = pretrained + self.layers = depth + + supported_layers = [18, 34, 50, 101, 152] + assert self.layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, self.layers) + + if self.layers == 18: + depth = [2, 2, 2, 2] + elif self.layers == 34 or self.layers == 50: + depth = [3, 4, 6, 3] + elif self.layers == 101: + depth = [3, 4, 23, 3] + elif self.layers == 152: + depth = [3, 8, 36, 3] + + in_channels = [64, 256, 512, 1024] + out_channels = [64, 128, 256, 512] + + self.conv = ConvBNLayer(in_channels=3, + out_channels=64, + kernel_size=7, + stride=2, + act="relu", + name="conv1") + self.pool2D_max = MaxPool2D(kernel_size=3, stride=2, padding=1) + + self.block_list = [] + if self.layers >= 50: + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + if self.layers in [101, 152] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + bottleneck_block = self.add_sublayer( + conv_name, + BottleneckBlock( + # NOTE: Be careful! Here is different from TSM model. + in_channels=in_channels[block] + if i == 0 else out_channels[block] * 4, + out_channels=out_channels[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + name=conv_name)) + + self.block_list.append(bottleneck_block) + shortcut = True + else: + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + conv_name = "res" + str(block + 2) + chr(97 + i) + basic_block = self.add_sublayer( + conv_name, + BasicBlock(in_channels=in_channels[block] + if i == 0 else out_channels[block], + out_channels=out_channels[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + name=conv_name)) + self.block_list.append(basic_block) + shortcut = True + + def init_weights(self): + """Initiate the parameters. + Note: + 1. when indicate pretrained loading path, will load it to initiate backbone. + 2. when not indicating pretrained loading path, will follow specific initialization initiate backbone. Always, Conv2D layer will be initiated by KaimingNormal function, and BatchNorm2d will be initiated by Constant function. + Please refer to https://www.paddlepaddle.org.cn/documentation/docs/en/develop/api/paddle/nn/initializer/kaiming/KaimingNormal_en.html + """ + #XXX: check bias!!! check pretrained!!! + + if isinstance(self.pretrained, str) and self.pretrained.strip() != "": + load_ckpt(self, self.pretrained) + elif self.pretrained is None or self.pretrained.strip() == "": + for layer in self.sublayers(): + if isinstance(layer, nn.Conv2D): + #XXX: no bias + weight_init_(layer, 'KaimingNormal') + elif isinstance(layer, nn.BatchNorm2D): + weight_init_(layer, 'Constant', value=1) + + def forward(self, inputs): + """Define how the backbone is going to run. 
+ + """ + #NOTE: Already merge axis 0(batches) and axis 1(channels) before extracting feature phase, + # please refer to paddlevideo/modeling/framework/recognizers/recognizer2d.py#L27 + #y = paddle.reshape( + # inputs, [-1, inputs.shape[2], inputs.shape[3], inputs.shape[4]]) + + y = self.conv(inputs) + y = self.pool2D_max(y) + for block in self.block_list: + y = block(y) + return y diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnet3d.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnet3d.py new file mode 100644 index 0000000..33edefe --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnet3d.py @@ -0,0 +1,641 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import warnings +import collections +from itertools import repeat + +import paddle +from paddle import nn + + +def _ntuple(n): + def parse(x): + if isinstance(x, collections.abc.Iterable): + return tuple(x) + return tuple(repeat(x, n)) + + return parse + + +_triple = _ntuple(3) + + +class ConvBNLayer(nn.Layer): + """A conv block that bundles conv/norm/activation layers. + + This block simplifies the usage of convolution layers, which are commonly + used with a norm layer (e.g., BatchNorm) and activation layer (e.g., ReLU). + It is based upon three build methods: `build_conv_layer()`, + `build_norm_layer()` and `build_activation_layer()`. + + Besides, we add some additional features in this module. + 1. Automatically set `bias` of the conv layer. + 2. Spectral norm is supported. + 3. More padding modes are supported. Before PyTorch 1.5, nn.Conv2d only + supports zero and circular padding, and we add "reflect" padding mode. + + Args: + in_channels (int): Number of channels in the input feature map. + Same as that in ``nn._ConvNd``. + out_channels (int): Number of channels produced by the convolution. + Same as that in ``nn._ConvNd``. + kernel_size (int | tuple[int]): Size of the convolving kernel. + Same as that in ``nn._ConvNd``. + stride (int | tuple[int]): Stride of the convolution. + Same as that in ``nn._ConvNd``. + padding (int | tuple[int]): Zero-padding added to both sides of + the input. Same as that in ``nn._ConvNd``. + dilation (int | tuple[int]): Spacing between kernel elements. + Same as that in ``nn._ConvNd``. + groups (int): Number of blocked connections from input channels to + output channels. Same as that in ``nn._ConvNd``. 
+ """ + + def __init__( + self, + in_channels, + out_channels, + kernel_size, + padding=0, + stride=1, + dilation=1, + groups=1, + act=None, + bias=None, + ): + super(ConvBNLayer, self).__init__() + + self._conv = nn.Conv3D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias_attr=bias) + + self._batch_norm = nn.BatchNorm3D(out_channels, momentum=0.1) + self.act = act + if act is not None: + self._act_op = nn.ReLU() + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + if self.act is not None: + y = self._act_op(y) + + return y + + +class Bottleneck3d(nn.Layer): + """Bottleneck 3d block for ResNet3D. + + Args: + inplanes (int): Number of channels for the input in first conv3d layer. + planes (int): Number of channels produced by some norm/conv3d layers. + spatial_stride (int): Spatial stride in the conv3d layer. Default: 1. + temporal_stride (int): Temporal stride in the conv3d layer. Default: 1. + dilation (int): Spacing between kernel elements. Default: 1. + downsample (nn.Module | None): Downsample layer. Default: None. + inflate (bool): Whether to inflate kernel. Default: True. + inflate_style (str): ``3x1x1`` or ``3x3x3``. which determines the + kernel sizes and padding strides for conv1 and conv2 in each block. + Default: '3x1x1'. + non_local (bool): Determine whether to apply non-local module in this + block. Default: False. + non_local_cfg (dict): Config for non-local module. Default: ``dict()``. + conv_cfg (dict): Config dict for convolution layer. + Default: ``dict(type='Conv3d')``. + norm_cfg (dict): Config for norm layers. required keys are ``type``, + Default: ``dict(type='BN3d')``. + act_cfg (dict): Config dict for activation layer. + Default: ``dict(type='ReLU')``. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. 
+ """ + expansion = 4 + + def __init__(self, + inplanes, + planes, + spatial_stride=1, + temporal_stride=1, + dilation=1, + downsample=None, + inflate=True, + inflate_style='3x1x1', + non_local=False, + non_local_cfg=dict(), + conv_cfg=dict(type='Conv3d'), + norm_cfg=dict(type='BN3d'), + act_cfg=dict(type='ReLU'), + with_cp=False): + super().__init__() + assert inflate_style in ['3x1x1', '3x3x3'] + + self.inplanes = inplanes + self.planes = planes + self.spatial_stride = spatial_stride + self.temporal_stride = temporal_stride + self.dilation = dilation + self.inflate = inflate + self.inflate_style = inflate_style + self.norm_cfg = norm_cfg + self.conv_cfg = conv_cfg + self.act_cfg = act_cfg + self.with_cp = with_cp + self.non_local = non_local + self.non_local_cfg = non_local_cfg + + self.conv1_stride_s = 1 + self.conv2_stride_s = spatial_stride + self.conv1_stride_t = 1 + self.conv2_stride_t = temporal_stride + + if self.inflate: + if inflate_style == '3x1x1': + conv1_kernel_size = (3, 1, 1) + conv1_padding = (1, 0, 0) + conv2_kernel_size = (1, 3, 3) + conv2_padding = (0, dilation, dilation) + else: + conv1_kernel_size = (1, 1, 1) + conv1_padding = (0, 0, 0) + conv2_kernel_size = (3, 3, 3) + conv2_padding = (1, dilation, dilation) + else: + conv1_kernel_size = (1, 1, 1) + conv1_padding = (0, 0, 0) + conv2_kernel_size = (1, 3, 3) + conv2_padding = (0, dilation, dilation) + self.conv1 = ConvBNLayer( + in_channels=inplanes, + out_channels=planes, + kernel_size=conv1_kernel_size, + stride=(self.conv1_stride_t, self.conv1_stride_s, + self.conv1_stride_s), + padding=conv1_padding, + bias=False, + act='relu') + + self.conv2 = ConvBNLayer( + in_channels=planes, + out_channels=planes, + kernel_size=conv2_kernel_size, + stride=(self.conv2_stride_t, self.conv2_stride_s, + self.conv2_stride_s), + padding=conv2_padding, + dilation=(1, dilation, dilation), + bias=False, + act='relu') + + self.conv3 = ConvBNLayer( + in_channels=planes, + out_channels=planes * self.expansion, + kernel_size=1, + bias=False, + act=None, + ) + + self.downsample = downsample + self.relu = nn.ReLU() + + def forward(self, x): + """Defines the computation performed at every call.""" + + def _inner_forward(x): + """Forward wrapper for utilizing checkpoint.""" + identity = x + + out = self.conv1(x) + out = self.conv2(out) + out = self.conv3(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out = out + identity + return out + + out = _inner_forward(x) + out = self.relu(out) + + if self.non_local: + out = self.non_local_block(out) + + return out + + +class ResNet3d(nn.Layer): + """ResNet 3d backbone. + + Args: + depth (int): Depth of resnet, from {18, 34, 50, 101, 152}. + pretrained (str | None): Name of pretrained model. + stage_blocks (tuple | None): Set number of stages for each res layer. + Default: None. + pretrained2d (bool): Whether to load pretrained 2D model. + Default: True. + in_channels (int): Channel num of input features. Default: 3. + base_channels (int): Channel num of stem output features. Default: 64. + out_indices (Sequence[int]): Indices of output feature. Default: (3, ). + num_stages (int): Resnet stages. Default: 4. + spatial_strides (Sequence[int]): + Spatial strides of residual blocks of each stage. + Default: ``(1, 2, 2, 2)``. + temporal_strides (Sequence[int]): + Temporal strides of residual blocks of each stage. + Default: ``(1, 1, 1, 1)``. + dilations (Sequence[int]): Dilation of each stage. + Default: ``(1, 1, 1, 1)``. 
+ conv1_kernel (Sequence[int]): Kernel size of the first conv layer. + Default: ``(3, 7, 7)``. + conv1_stride_s (int): Spatial stride of the first conv layer. + Default: 2. + conv1_stride_t (int): Temporal stride of the first conv layer. + Default: 1. + pool1_stride_s (int): Spatial stride of the first pooling layer. + Default: 2. + pool1_stride_t (int): Temporal stride of the first pooling layer. + Default: 1. + with_pool2 (bool): Whether to use pool2. Default: True. + inflate (Sequence[int]): Inflate Dims of each block. + Default: (1, 1, 1, 1). + inflate_style (str): ``3x1x1`` or ``3x3x3``. which determines the + kernel sizes and padding strides for conv1 and conv2 in each block. + Default: '3x1x1'. + conv_cfg (dict): Config for conv layers. required keys are ``type`` + Default: ``dict(type='Conv3d')``. + norm_cfg (dict): Config for norm layers. required keys are ``type`` and + ``requires_grad``. + Default: ``dict(type='BN3d', requires_grad=True)``. + act_cfg (dict): Config dict for activation layer. + Default: ``dict(type='ReLU', inplace=True)``. + norm_eval (bool): Whether to set BN layers to eval mode, namely, freeze + running stats (mean and var). Default: False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + non_local (Sequence[int]): Determine whether to apply non-local module + in the corresponding block of each stages. Default: (0, 0, 0, 0). + non_local_cfg (dict): Config for non-local module. Default: ``dict()``. + zero_init_residual (bool): + Whether to use zero initialization for residual block, + Default: True. + kwargs (dict, optional): Key arguments for "make_res_layer". + """ + + arch_settings = { + 50: (Bottleneck3d, (3, 4, 6, 3)), + 101: (Bottleneck3d, (3, 4, 23, 3)), + 152: (Bottleneck3d, (3, 8, 36, 3)) + } + + def __init__(self, + depth, + stage_blocks=None, + pretrained2d=True, + in_channels=3, + num_stages=4, + base_channels=64, + out_indices=(3, ), + spatial_strides=(1, 2, 2, 2), + temporal_strides=(1, 1, 1, 1), + dilations=(1, 1, 1, 1), + conv1_kernel=(3, 7, 7), + conv1_stride_s=2, + conv1_stride_t=1, + pool1_stride_s=2, + pool1_stride_t=1, + with_pool1=True, + with_pool2=True, + inflate=(1, 1, 1, 1), + inflate_style='3x1x1', + conv_cfg=dict(type='Conv3d'), + norm_cfg=dict(type='BN3d', requires_grad=True), + act_cfg=dict(type='ReLU', inplace=True), + norm_eval=False, + with_cp=False, + non_local=(0, 0, 0, 0), + non_local_cfg=dict(), + zero_init_residual=True, + **kwargs): + super().__init__() + if depth not in self.arch_settings: + raise KeyError(f'invalid depth {depth} for resnet') + self.depth = depth + self.pretrained2d = pretrained2d + self.in_channels = in_channels + self.base_channels = base_channels + self.num_stages = num_stages + assert 1 <= num_stages <= 4 + self.stage_blocks = stage_blocks + self.out_indices = out_indices + assert max(out_indices) < num_stages + self.spatial_strides = spatial_strides + self.temporal_strides = temporal_strides + self.dilations = dilations + assert len(spatial_strides) == len(temporal_strides) == len( + dilations) == num_stages + if self.stage_blocks is not None: + assert len(self.stage_blocks) == num_stages + + self.conv1_kernel = conv1_kernel + self.conv1_stride_s = conv1_stride_s + self.conv1_stride_t = conv1_stride_t + self.pool1_stride_s = pool1_stride_s + self.pool1_stride_t = pool1_stride_t + self.with_pool1 = with_pool1 + self.with_pool2 = with_pool2 + self.stage_inflations = _ntuple(num_stages)(inflate) + 
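+        # An int `non_local` is broadcast to every stage, same as `inflate` above.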
self.non_local_stages = _ntuple(num_stages)(non_local) + self.inflate_style = inflate_style + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.norm_eval = norm_eval + self.with_cp = with_cp + self.zero_init_residual = zero_init_residual + + self.block, stage_blocks = self.arch_settings[depth] + + if self.stage_blocks is None: + self.stage_blocks = stage_blocks[:num_stages] + + self.inplanes = self.base_channels + + self.non_local_cfg = non_local_cfg + + self._make_stem_layer() + + self.res_layers = [] + for i, num_blocks in enumerate(self.stage_blocks): + spatial_stride = spatial_strides[i] + temporal_stride = temporal_strides[i] + dilation = dilations[i] + planes = self.base_channels * 2**i + res_layer = self.make_res_layer( + self.block, + self.inplanes, + planes, + num_blocks, + spatial_stride=spatial_stride, + temporal_stride=temporal_stride, + dilation=dilation, + norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg, + act_cfg=self.act_cfg, + non_local=self.non_local_stages[i], + non_local_cfg=self.non_local_cfg, + inflate=self.stage_inflations[i], + inflate_style=self.inflate_style, + with_cp=with_cp, + **kwargs) + self.inplanes = planes * self.block.expansion + layer_name = f'layer{i + 1}' + self.add_sublayer(layer_name, res_layer) + self.res_layers.append(layer_name) + + self.feat_dim = self.block.expansion * self.base_channels * 2**( + len(self.stage_blocks) - 1) + + @staticmethod + def make_res_layer(block, + inplanes, + planes, + blocks, + spatial_stride=1, + temporal_stride=1, + dilation=1, + inflate=1, + inflate_style='3x1x1', + non_local=0, + non_local_cfg=dict(), + norm_cfg=None, + act_cfg=None, + conv_cfg=None, + with_cp=False, + **kwargs): + """Build residual layer for ResNet3D. + + Args: + block (nn.Module): Residual module to be built. + inplanes (int): Number of channels for the input feature + in each block. + planes (int): Number of channels for the output feature + in each block. + blocks (int): Number of residual blocks. + spatial_stride (int | Sequence[int]): Spatial strides in + residual and conv layers. Default: 1. + temporal_stride (int | Sequence[int]): Temporal strides in + residual and conv layers. Default: 1. + dilation (int): Spacing between kernel elements. Default: 1. + inflate (int | Sequence[int]): Determine whether to inflate + for each block. Default: 1. + inflate_style (str): ``3x1x1`` or ``3x3x3``. which determines + the kernel sizes and padding strides for conv1 and conv2 + in each block. Default: '3x1x1'. + non_local (int | Sequence[int]): Determine whether to apply + non-local module in the corresponding block of each stages. + Default: 0. + non_local_cfg (dict): Config for non-local module. + Default: ``dict()``. + conv_cfg (dict | None): Config for norm layers. Default: None. + norm_cfg (dict | None): Config for norm layers. Default: None. + act_cfg (dict | None): Config for activate layers. Default: None. + with_cp (bool | None): Use checkpoint or not. Using checkpoint + will save some memory while slowing down the training speed. + Default: False. + + Returns: + nn.Module: A residual layer for the given config. 
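+
+        Note:
+            An integer ``inflate`` or ``non_local`` is expanded to one entry per
+            block (see the first two statements of the implementation).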
+ """ + inflate = inflate if not isinstance(inflate, + int) else (inflate, ) * blocks + non_local = non_local if not isinstance(non_local, + int) else (non_local, ) * blocks + assert len(inflate) == blocks and len(non_local) == blocks + downsample = None + if spatial_stride != 1 or inplanes != planes * block.expansion: + downsample = ConvBNLayer( + in_channels=inplanes, + out_channels=planes * block.expansion, + kernel_size=1, + stride=(temporal_stride, spatial_stride, spatial_stride), + bias=False, + act=None) + + layers = [] + layers.append( + block( + inplanes, + planes, + spatial_stride=spatial_stride, + temporal_stride=temporal_stride, + dilation=dilation, + downsample=downsample, + inflate=(inflate[0] == 1), + inflate_style=inflate_style, + non_local=(non_local[0] == 1), + non_local_cfg=non_local_cfg, + norm_cfg=norm_cfg, + conv_cfg=conv_cfg, + act_cfg=act_cfg, + with_cp=with_cp, + **kwargs)) + inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append( + block( + inplanes, + planes, + spatial_stride=1, + temporal_stride=1, + dilation=dilation, + inflate=(inflate[i] == 1), + inflate_style=inflate_style, + non_local=(non_local[i] == 1), + non_local_cfg=non_local_cfg, + norm_cfg=norm_cfg, + conv_cfg=conv_cfg, + act_cfg=act_cfg, + with_cp=with_cp, + **kwargs)) + + return nn.Sequential(*layers) + + @staticmethod + def _inflate_conv_params(conv3d, state_dict_2d, module_name_2d, + inflated_param_names): + """Inflate a conv module from 2d to 3d. + + Args: + conv3d (nn.Module): The destination conv3d module. + state_dict_2d (OrderedDict): The state dict of pretrained 2d model. + module_name_2d (str): The name of corresponding conv module in the + 2d model. + inflated_param_names (list[str]): List of parameters that have been + inflated. + """ + weight_2d_name = module_name_2d + '.weight' + + conv2d_weight = state_dict_2d[weight_2d_name] + kernel_t = conv3d.weight.data.shape[2] + + new_weight = conv2d_weight.data.unsqueeze(2).expand_as( + conv3d.weight) / kernel_t + conv3d.weight.data.copy_(new_weight) + inflated_param_names.append(weight_2d_name) + + if getattr(conv3d, 'bias') is not None: + bias_2d_name = module_name_2d + '.bias' + conv3d.bias.data.copy_(state_dict_2d[bias_2d_name]) + inflated_param_names.append(bias_2d_name) + + @staticmethod + def _inflate_bn_params(bn3d, state_dict_2d, module_name_2d, + inflated_param_names): + """Inflate a norm module from 2d to 3d. + + Args: + bn3d (nn.Module): The destination bn3d module. + state_dict_2d (OrderedDict): The state dict of pretrained 2d model. + module_name_2d (str): The name of corresponding bn module in the + 2d model. + inflated_param_names (list[str]): List of parameters that have been + inflated. + """ + for param_name, param in bn3d.named_parameters(): + param_2d_name = f'{module_name_2d}.{param_name}' + param_2d = state_dict_2d[param_2d_name] + if param.data.shape != param_2d.shape: + warnings.warn(f'The parameter of {module_name_2d} is not' + 'loaded due to incompatible shapes. 
') + return + + param.data.copy_(param_2d) + inflated_param_names.append(param_2d_name) + + for param_name, param in bn3d.named_buffers(): + param_2d_name = f'{module_name_2d}.{param_name}' + # some buffers like num_batches_tracked may not exist in old + # checkpoints + if param_2d_name in state_dict_2d: + param_2d = state_dict_2d[param_2d_name] + param.data.copy_(param_2d) + inflated_param_names.append(param_2d_name) + + def _make_stem_layer(self): + """Construct the stem layers consists of a conv+norm+act module and a + pooling layer.""" + + self.conv1 = ConvBNLayer( + in_channels=self.in_channels, + out_channels=self.base_channels, + kernel_size=self.conv1_kernel, + stride=(self.conv1_stride_t, self.conv1_stride_s, + self.conv1_stride_s), + padding=tuple([(k - 1) // 2 for k in _triple(self.conv1_kernel)]), + bias=False, + act="relu") + + self.maxpool = nn.MaxPool3D( + kernel_size=(1, 3, 3), + stride=(self.pool1_stride_t, self.pool1_stride_s, + self.pool1_stride_s), + padding=(0, 1, 1)) + + self.pool2 = nn.MaxPool3D(kernel_size=(2, 1, 1), stride=(2, 1, 1)) + + @staticmethod + def _init_weights(self, pretrained=None): + pass + + def init_weights(self, pretrained=None): + self._init_weights(self, pretrained) + + def forward(self, x): + """Defines the computation performed at every call. + + Args: + x (torch.Tensor): The input data. + + Returns: + torch.Tensor: The feature of the input + samples extracted by the backbone. + """ + x = self.conv1(x) + if self.with_pool1: + x = self.maxpool(x) + outs = [] + for i, layer_name in enumerate(self.res_layers): + res_layer = getattr(self, layer_name) + x = res_layer(x) + if i == 0 and self.with_pool2: + x = self.pool2(x) + if i in self.out_indices: + outs.append(x) + if len(outs) == 1: + return outs[0] + + return tuple(outs) + + def train(self, mode=True): + """Set the optimization status when training.""" + super().train() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, paddle.nn._BatchNormBase): + m.eval() diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnet3d_slowonly.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnet3d_slowonly.py new file mode 100644 index 0000000..eb5b080 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnet3d_slowonly.py @@ -0,0 +1,214 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import warnings + +import paddle +import paddle.nn as nn + +from .resnet3d import ResNet3d, ConvBNLayer +from ..registry import BACKBONES + + +@BACKBONES.register() +class ResNet3dSlowOnly(ResNet3d): + """A pathway of Slowfast based on ResNet3d. + + Args: + *args (arguments): Arguments same as :class:``ResNet3d``. + channel_ratio (int): Reduce the channel number of fast pathway + by ``channel_ratio``, corresponding to ``beta`` in the paper. + Default: 8. + **kwargs (keyword arguments): Keywords arguments for ResNet3d. 
+ """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.inplanes = self.base_channels + + self.lateral_connections = [] + for i in range(len(self.stage_blocks)): + planes = self.base_channels * 2**i + self.inplanes = planes * self.block.expansion + + def make_res_layer(self, + block, + inplanes, + planes, + blocks, + spatial_stride=1, + temporal_stride=1, + dilation=1, + inflate=1, + inflate_style='3x1x1', + non_local=0, + non_local_cfg=dict(), + conv_cfg=None, + norm_cfg=None, + act_cfg=None, + with_cp=False): + """Build residual layer for Slowfast. + + Args: + block (nn.Module): Residual module to be built. + inplanes (int): Number of channels for the input + feature in each block. + planes (int): Number of channels for the output + feature in each block. + blocks (int): Number of residual blocks. + spatial_stride (int | Sequence[int]): Spatial strides + in residual and conv layers. Default: 1. + temporal_stride (int | Sequence[int]): Temporal strides in + residual and conv layers. Default: 1. + dilation (int): Spacing between kernel elements. Default: 1. + inflate (int | Sequence[int]): Determine whether to inflate + for each block. Default: 1. + inflate_style (str): ``3x1x1`` or ``3x3x3``. which determines + the kernel sizes and padding strides for conv1 and + conv2 in each block. Default: ``3x1x1``. + non_local (int | Sequence[int]): Determine whether to apply + non-local module in the corresponding block of each stages. + Default: 0. + non_local_cfg (dict): Config for non-local module. + Default: ``dict()``. + conv_cfg (dict | None): Config for conv layers. Default: None. + norm_cfg (dict | None): Config for norm layers. Default: None. + act_cfg (dict | None): Config for activate layers. Default: None. + with_cp (bool): Use checkpoint or not. Using checkpoint will save + some memory while slowing down the training speed. + Default: False. + + Returns: + nn.Module: A residual layer for the given config. + """ + inflate = inflate if not isinstance(inflate, + int) else (inflate, ) * blocks + non_local = non_local if not isinstance(non_local, + int) else (non_local, ) * blocks + assert len(inflate) == blocks and len(non_local) == blocks + + lateral_inplanes = 0 + if (spatial_stride != 1 + or (inplanes + lateral_inplanes) != planes * block.expansion): + downsample = ConvBNLayer( + in_channels=inplanes + lateral_inplanes, + out_channels=planes * block.expansion, + kernel_size=1, + stride=(temporal_stride, spatial_stride, spatial_stride), + bias=False, + act=None) + else: + downsample = None + + layers = [] + layers.append( + block( + inplanes + lateral_inplanes, + planes, + spatial_stride, + temporal_stride, + dilation, + downsample, + inflate=(inflate[0] == 1), + inflate_style=inflate_style, + non_local=(non_local[0] == 1), + non_local_cfg=non_local_cfg, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + with_cp=with_cp)) + inplanes = planes * block.expansion + + for i in range(1, blocks): + layers.append( + block( + inplanes, + planes, + 1, + 1, + dilation, + inflate=(inflate[i] == 1), + inflate_style=inflate_style, + non_local=(non_local[i] == 1), + non_local_cfg=non_local_cfg, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + with_cp=with_cp)) + + return nn.Sequential(*layers) + + def _inflate_conv_params(self, conv3d, state_dict_2d, module_name_2d, + inflated_param_names): + """Inflate a conv module from 2d to 3d. 
+
+        The differences of conv modules between 2d and 3d in Pathway
+        mainly lie in the inplanes due to lateral connections. To fit the
+        shapes of the lateral connection counterpart, it will expand
+        parameters by concatenating conv2d parameters with extra zero paddings.
+
+        Args:
+            conv3d (nn.Module): The destination conv3d module.
+            state_dict_2d (OrderedDict): The state dict of pretrained 2d model.
+            module_name_2d (str): The name of corresponding conv module in the
+                2d model.
+            inflated_param_names (list[str]): List of parameters that have been
+                inflated.
+        """
+        weight_2d_name = module_name_2d + '.weight'
+        conv2d_weight = state_dict_2d[weight_2d_name]
+        old_shape = conv2d_weight.shape
+        new_shape = conv3d.weight.data.shape
+        kernel_t = new_shape[2]
+
+        if new_shape[1] != old_shape[1]:
+            if new_shape[1] < old_shape[1]:
+                warnings.warn(f'The parameter of {module_name_2d} is not '
+                              'loaded due to incompatible shapes.')
+                return
+            # Inplanes may be different due to lateral connections
+            new_channels = new_shape[1] - old_shape[1]
+            pad_shape = old_shape
+            pad_shape = pad_shape[:1] + (new_channels, ) + pad_shape[2:]
+            # Expand parameters by concatenating extra zero channels
+            conv2d_weight = paddle.concat(
+                (conv2d_weight, paddle.zeros(pad_shape)), axis=1)
+
+        new_weight = conv2d_weight.data.unsqueeze(2).expand_as(
+            conv3d.weight) / kernel_t
+        conv3d.weight.data.copy_(new_weight)
+        inflated_param_names.append(weight_2d_name)
+
+        if getattr(conv3d, 'bias') is not None:
+            bias_2d_name = module_name_2d + '.bias'
+            conv3d.bias.data.copy_(state_dict_2d[bias_2d_name])
+            inflated_param_names.append(bias_2d_name)
+
+
+if __name__ == '__main__':
+    net = ResNet3dSlowOnly(
+        depth=50,
+        in_channels=17,
+        base_channels=32,
+        conv1_kernel=(1, 7, 7),
+        num_stages=3,
+        out_indices=[2],
+        stage_blocks=[3, 4, 6],
+        conv1_stride_s=1,
+        pool1_stride_s=1,
+        inflate=[0, 1, 1],
+        with_pool2=False,
+        spatial_strides=[2, 2, 2],
+        temporal_strides=[1, 1, 2],
+        dilations=[1, 1, 1])
+    pass
diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnet_slowfast.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnet_slowfast.py
new file mode 100644
index 0000000..a679159
--- /dev/null
+++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnet_slowfast.py
@@ -0,0 +1,795 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
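+
+# Editor's note (illustrative usage sketch, not part of the original file;
+# the batch size, frame counts and 224x224 crop below are assumptions,
+# while alpha=8 / beta=8 follow the inline hints further down):
+#
+#     backbone = ResNetSlowFast(alpha=8, beta=8)
+#     slow = paddle.rand([2, 3, 4, 224, 224])     # [N, C, T, H, W]
+#     fast = paddle.rand([2, 3, 32, 224, 224])    # alpha * T frames
+#     slow_feat, fast_feat = backbone([slow, fast])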
+ +import paddle +import paddle.nn.functional as F +from paddle.nn.initializer import KaimingNormal +from ..registry import BACKBONES +from paddlevideo.utils.multigrid import get_norm +import sys +import numpy as np +import paddle.distributed as dist + +# seed random seed +paddle.framework.seed(0) + + +# get init parameters for conv layer +def get_conv_init(fan_out): + return KaimingNormal(fan_in=fan_out) + + +def get_bn_param_attr(bn_weight=1.0, coeff=0.0): + param_attr = paddle.ParamAttr( + initializer=paddle.nn.initializer.Constant(bn_weight), + regularizer=paddle.regularizer.L2Decay(coeff)) + return param_attr + + +"""Video models.""" + + +class BottleneckTransform(paddle.nn.Layer): + """ + Bottleneck transformation: Tx1x1, 1x3x3, 1x1x1, where T is the size of + temporal kernel. + """ + def __init__(self, + dim_in, + dim_out, + temp_kernel_size, + stride, + dim_inner, + num_groups, + stride_1x1=False, + inplace_relu=True, + eps=1e-5, + dilation=1, + norm_module=paddle.nn.BatchNorm3D): + """ + Args: + dim_in (int): the channel dimensions of the input. + dim_out (int): the channel dimension of the output. + temp_kernel_size (int): the temporal kernel sizes of the middle + convolution in the bottleneck. + stride (int): the stride of the bottleneck. + dim_inner (int): the inner dimension of the block. + num_groups (int): number of groups for the convolution. num_groups=1 + is for standard ResNet like networks, and num_groups>1 is for + ResNeXt like networks. + stride_1x1 (bool): if True, apply stride to 1x1 conv, otherwise + apply stride to the 3x3 conv. + inplace_relu (bool): if True, calculate the relu on the original + input without allocating new memory. + eps (float): epsilon for batch norm. + dilation (int): size of dilation. + """ + super(BottleneckTransform, self).__init__() + self.temp_kernel_size = temp_kernel_size + self._inplace_relu = inplace_relu + self._eps = eps + self._stride_1x1 = stride_1x1 + self.norm_module = norm_module + self._construct(dim_in, dim_out, stride, dim_inner, num_groups, + dilation) + + def _construct(self, dim_in, dim_out, stride, dim_inner, num_groups, + dilation): + str1x1, str3x3 = (stride, 1) if self._stride_1x1 else (1, stride) + + fan = (dim_inner) * (self.temp_kernel_size * 1 * 1) + initializer_tmp = get_conv_init(fan) + + self.a = paddle.nn.Conv3D( + in_channels=dim_in, + out_channels=dim_inner, + kernel_size=[self.temp_kernel_size, 1, 1], + stride=[1, str1x1, str1x1], + padding=[int(self.temp_kernel_size // 2), 0, 0], + weight_attr=paddle.ParamAttr(initializer=initializer_tmp), + bias_attr=False) + self.a_bn = self.norm_module(num_features=dim_inner, + epsilon=self._eps, + weight_attr=get_bn_param_attr(), + bias_attr=get_bn_param_attr(bn_weight=0.0)) + + # 1x3x3, BN, ReLU. + fan = (dim_inner) * (1 * 3 * 3) + initializer_tmp = get_conv_init(fan) + + self.b = paddle.nn.Conv3D( + in_channels=dim_inner, + out_channels=dim_inner, + kernel_size=[1, 3, 3], + stride=[1, str3x3, str3x3], + padding=[0, dilation, dilation], + groups=num_groups, + dilation=[1, dilation, dilation], + weight_attr=paddle.ParamAttr(initializer=initializer_tmp), + bias_attr=False) + self.b_bn = self.norm_module(num_features=dim_inner, + epsilon=self._eps, + weight_attr=get_bn_param_attr(), + bias_attr=get_bn_param_attr(bn_weight=0.0)) + + # 1x1x1, BN. 
+ fan = (dim_out) * (1 * 1 * 1) + initializer_tmp = get_conv_init(fan) + + self.c = paddle.nn.Conv3D( + in_channels=dim_inner, + out_channels=dim_out, + kernel_size=[1, 1, 1], + stride=[1, 1, 1], + padding=[0, 0, 0], + weight_attr=paddle.ParamAttr(initializer=initializer_tmp), + bias_attr=False) + self.c_bn = self.norm_module( + num_features=dim_out, + epsilon=self._eps, + weight_attr=get_bn_param_attr(bn_weight=0.0), + bias_attr=get_bn_param_attr(bn_weight=0.0)) + + def forward(self, x): + # Branch2a. + x = self.a(x) + x = self.a_bn(x) + x = F.relu(x) + + # Branch2b. + x = self.b(x) + x = self.b_bn(x) + x = F.relu(x) + + # Branch2c + x = self.c(x) + x = self.c_bn(x) + return x + + +class ResBlock(paddle.nn.Layer): + """ + Residual block. + """ + def __init__(self, + dim_in, + dim_out, + temp_kernel_size, + stride, + dim_inner, + num_groups=1, + stride_1x1=False, + inplace_relu=True, + eps=1e-5, + dilation=1, + norm_module=paddle.nn.BatchNorm3D): + """ + ResBlock class constructs redisual blocks. More details can be found in: + Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun. + "Deep residual learning for image recognition." + https://arxiv.org/abs/1512.03385 + Args: + dim_in (int): the channel dimensions of the input. + dim_out (int): the channel dimension of the output. + temp_kernel_size (int): the temporal kernel sizes of the middle + convolution in the bottleneck. + stride (int): the stride of the bottleneck. + trans_func (string): transform function to be used to construct the + bottleneck. + dim_inner (int): the inner dimension of the block. + num_groups (int): number of groups for the convolution. num_groups=1 + is for standard ResNet like networks, and num_groups>1 is for + ResNeXt like networks. + stride_1x1 (bool): if True, apply stride to 1x1 conv, otherwise + apply stride to the 3x3 conv. + inplace_relu (bool): calculate the relu on the original input + without allocating new memory. + eps (float): epsilon for batch norm. + dilation (int): size of dilation. + """ + super(ResBlock, self).__init__() + self._inplace_relu = inplace_relu + self._eps = eps + self.norm_module = norm_module + self._construct( + dim_in, + dim_out, + temp_kernel_size, + stride, + dim_inner, + num_groups, + stride_1x1, + inplace_relu, + dilation, + ) + + def _construct( + self, + dim_in, + dim_out, + temp_kernel_size, + stride, + dim_inner, + num_groups, + stride_1x1, + inplace_relu, + dilation, + ): + # Use skip connection with projection if dim or res change. + if (dim_in != dim_out) or (stride != 1): + fan = (dim_out) * (1 * 1 * 1) + initializer_tmp = get_conv_init(fan) + self.branch1 = paddle.nn.Conv3D( + in_channels=dim_in, + out_channels=dim_out, + kernel_size=1, + stride=[1, stride, stride], + padding=0, + weight_attr=paddle.ParamAttr(initializer=initializer_tmp), + bias_attr=False, + dilation=1) + self.branch1_bn = self.norm_module( + num_features=dim_out, + epsilon=self._eps, + weight_attr=get_bn_param_attr(), + bias_attr=get_bn_param_attr(bn_weight=0.0)) + + self.branch2 = BottleneckTransform(dim_in, + dim_out, + temp_kernel_size, + stride, + dim_inner, + num_groups, + stride_1x1=stride_1x1, + inplace_relu=inplace_relu, + dilation=dilation, + norm_module=self.norm_module) + + def forward(self, x): + if hasattr(self, "branch1"): + x1 = self.branch1(x) + x1 = self.branch1_bn(x1) + x2 = self.branch2(x) + x = paddle.add(x=x1, y=x2) + else: + x2 = self.branch2(x) + x = paddle.add(x=x, y=x2) + + x = F.relu(x) + return x + + +class ResStage(paddle.nn.Layer): + """ + Stage of 3D ResNet. 
It expects to have one or more tensors as input for + multi-pathway (SlowFast) cases. More details can be found here: + + Christoph Feichtenhofer, Haoqi Fan, Jitendra Malik, and Kaiming He. + "Slowfast networks for video recognition." + https://arxiv.org/pdf/1812.03982.pdf + """ + def __init__(self, + dim_in, + dim_out, + stride, + temp_kernel_sizes, + num_blocks, + dim_inner, + num_groups, + num_block_temp_kernel, + dilation, + stride_1x1=False, + inplace_relu=True, + norm_module=paddle.nn.BatchNorm3D): + """ + The `__init__` method of any subclass should also contain these arguments. + ResStage builds p streams, where p can be greater or equal to one. + Args: + dim_in (list): list of p the channel dimensions of the input. + Different channel dimensions control the input dimension of + different pathways. + dim_out (list): list of p the channel dimensions of the output. + Different channel dimensions control the input dimension of + different pathways. + temp_kernel_sizes (list): list of the p temporal kernel sizes of the + convolution in the bottleneck. Different temp_kernel_sizes + control different pathway. + stride (list): list of the p strides of the bottleneck. Different + stride control different pathway. + num_blocks (list): list of p numbers of blocks for each of the + pathway. + dim_inner (list): list of the p inner channel dimensions of the + input. Different channel dimensions control the input dimension + of different pathways. + num_groups (list): list of number of p groups for the convolution. + num_groups=1 is for standard ResNet like networks, and + num_groups>1 is for ResNeXt like networks. + num_block_temp_kernel (list): extent the temp_kernel_sizes to + num_block_temp_kernel blocks, then fill temporal kernel size + of 1 for the rest of the layers. + dilation (list): size of dilation for each pathway. 
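+
+        Example (editor's sketch mirroring the ``s2`` stage assembled in
+        ``ResNetSlowFast._construct_network`` below; the numbers assume
+        width_per_group=64, beta=8 and fusion_conv_channel_ratio=2):
+
+            ResStage(dim_in=[80, 8], dim_out=[256, 32], dim_inner=[64, 8],
+                     temp_kernel_sizes=[[1], [3]], stride=[1, 1],
+                     num_blocks=[3, 3], num_groups=[1, 1],
+                     num_block_temp_kernel=[3, 3], dilation=[1, 1])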
+ """ + super(ResStage, self).__init__() + assert all((num_block_temp_kernel[i] <= num_blocks[i] + for i in range(len(temp_kernel_sizes)))) + self.num_blocks = num_blocks + self.temp_kernel_sizes = [ + (temp_kernel_sizes[i] * num_blocks[i])[:num_block_temp_kernel[i]] + + [1] * (num_blocks[i] - num_block_temp_kernel[i]) + for i in range(len(temp_kernel_sizes)) + ] + assert (len({ + len(dim_in), + len(dim_out), + len(temp_kernel_sizes), + len(stride), + len(num_blocks), + len(dim_inner), + len(num_groups), + len(num_block_temp_kernel), + }) == 1) + self.num_pathways = len(self.num_blocks) + self.norm_module = norm_module + self._construct( + dim_in, + dim_out, + stride, + dim_inner, + num_groups, + stride_1x1, + inplace_relu, + dilation, + ) + + def _construct( + self, + dim_in, + dim_out, + stride, + dim_inner, + num_groups, + stride_1x1, + inplace_relu, + dilation, + ): + + for pathway in range(self.num_pathways): + for i in range(self.num_blocks[pathway]): + res_block = ResBlock( + dim_in[pathway] if i == 0 else dim_out[pathway], + dim_out[pathway], + self.temp_kernel_sizes[pathway][i], + stride[pathway] if i == 0 else 1, + dim_inner[pathway], + num_groups[pathway], + stride_1x1=stride_1x1, + inplace_relu=inplace_relu, + dilation=dilation[pathway], + norm_module=self.norm_module) + self.add_sublayer("pathway{}_res{}".format(pathway, i), + res_block) + + def forward(self, inputs): + output = [] + for pathway in range(self.num_pathways): + x = inputs[pathway] + + for i in range(self.num_blocks[pathway]): + m = getattr(self, "pathway{}_res{}".format(pathway, i)) + x = m(x) + output.append(x) + + return output + + +class ResNetBasicStem(paddle.nn.Layer): + """ + ResNe(X)t 3D stem module. + Performs spatiotemporal Convolution, BN, and Relu following by a + spatiotemporal pooling. + """ + def __init__(self, + dim_in, + dim_out, + kernel, + stride, + padding, + eps=1e-5, + norm_module=paddle.nn.BatchNorm3D): + super(ResNetBasicStem, self).__init__() + self.kernel = kernel + self.stride = stride + self.padding = padding + self.eps = eps + self.norm_module = norm_module + self._construct_stem(dim_in, dim_out) + + def _construct_stem(self, dim_in, dim_out): + fan = (dim_out) * (self.kernel[0] * self.kernel[1] * self.kernel[2]) + initializer_tmp = get_conv_init(fan) + + self._conv = paddle.nn.Conv3D( + in_channels=dim_in, + out_channels=dim_out, + kernel_size=self.kernel, + stride=self.stride, + padding=self.padding, + weight_attr=paddle.ParamAttr(initializer=initializer_tmp), + bias_attr=False) + self._bn = self.norm_module(num_features=dim_out, + epsilon=self.eps, + weight_attr=get_bn_param_attr(), + bias_attr=get_bn_param_attr(bn_weight=0.0)) + + def forward(self, x): + x = self._conv(x) + x = self._bn(x) + x = F.relu(x) + + x = F.max_pool3d(x=x, + kernel_size=[1, 3, 3], + stride=[1, 2, 2], + padding=[0, 1, 1], + data_format="NCDHW") + return x + + +class VideoModelStem(paddle.nn.Layer): + """ + Video 3D stem module. Provides stem operations of Conv, BN, ReLU, MaxPool + on input data tensor for slow and fast pathways. + """ + def __init__(self, + dim_in, + dim_out, + kernel, + stride, + padding, + eps=1e-5, + norm_module=paddle.nn.BatchNorm3D): + """ + Args: + dim_in (list): the list of channel dimensions of the inputs. + dim_out (list): the output dimension of the convolution in the stem + layer. + kernel (list): the kernels' size of the convolutions in the stem + layers. Temporal kernel size, height kernel size, width kernel + size in order. 
+ stride (list): the stride sizes of the convolutions in the stem + layer. Temporal kernel stride, height kernel size, width kernel + size in order. + padding (list): the paddings' sizes of the convolutions in the stem + layer. Temporal padding size, height padding size, width padding + size in order. + eps (float): epsilon for batch norm. + """ + super(VideoModelStem, self).__init__() + + assert (len({ + len(dim_in), + len(dim_out), + len(kernel), + len(stride), + len(padding), + }) == 1), "Input pathway dimensions are not consistent." + self.num_pathways = len(dim_in) + self.kernel = kernel + self.stride = stride + self.padding = padding + self.eps = eps + self.norm_module = norm_module + self._construct_stem(dim_in, dim_out) + + def _construct_stem(self, dim_in, dim_out): + for pathway in range(len(dim_in)): + stem = ResNetBasicStem(dim_in[pathway], dim_out[pathway], + self.kernel[pathway], self.stride[pathway], + self.padding[pathway], self.eps, + self.norm_module) + self.add_sublayer("pathway{}_stem".format(pathway), stem) + + def forward(self, x): + assert (len(x) == self.num_pathways + ), "Input tensor does not contain {} pathway".format( + self.num_pathways) + + for pathway in range(len(x)): + m = getattr(self, "pathway{}_stem".format(pathway)) + x[pathway] = m(x[pathway]) + + return x + + +class FuseFastToSlow(paddle.nn.Layer): + """ + Fuses the information from the Fast pathway to the Slow pathway. Given the + tensors from Slow pathway and Fast pathway, fuse information from Fast to + Slow, then return the fused tensors from Slow and Fast pathway in order. + """ + def __init__(self, + dim_in, + fusion_conv_channel_ratio, + fusion_kernel, + alpha, + fuse_bn_relu=1, + eps=1e-5, + norm_module=paddle.nn.BatchNorm3D): + """ + Args: + dim_in (int): the channel dimension of the input. + fusion_conv_channel_ratio (int): channel ratio for the convolution + used to fuse from Fast pathway to Slow pathway. + fusion_kernel (int): kernel size of the convolution used to fuse + from Fast pathway to Slow pathway. + alpha (int): the frame rate ratio between the Fast and Slow pathway. + eps (float): epsilon for batch norm. + """ + super(FuseFastToSlow, self).__init__() + self.fuse_bn_relu = fuse_bn_relu + fan = (dim_in * fusion_conv_channel_ratio) * (fusion_kernel * 1 * 1) + initializer_tmp = get_conv_init(fan) + + self._conv_f2s = paddle.nn.Conv3D( + in_channels=dim_in, + out_channels=dim_in * fusion_conv_channel_ratio, + kernel_size=[fusion_kernel, 1, 1], + stride=[alpha, 1, 1], + padding=[fusion_kernel // 2, 0, 0], + weight_attr=paddle.ParamAttr(initializer=initializer_tmp), + bias_attr=False) + self._bn = norm_module(num_features=dim_in * fusion_conv_channel_ratio, + epsilon=eps, + weight_attr=get_bn_param_attr(), + bias_attr=get_bn_param_attr(bn_weight=0.0)) + + def forward(self, x): + x_s = x[0] + x_f = x[1] + fuse = self._conv_f2s(x_f) + # TODO: For AVA, set fuse_bn_relu=1, check mAP's improve. + if self.fuse_bn_relu: + fuse = self._bn(fuse) + fuse = F.relu(fuse) + x_s_fuse = paddle.concat(x=[x_s, fuse], axis=1, name=None) + + return [x_s_fuse, x_f] + + +@BACKBONES.register() +class ResNetSlowFast(paddle.nn.Layer): + """ + SlowFast model builder for SlowFast network. + + Christoph Feichtenhofer, Haoqi Fan, Jitendra Malik, and Kaiming He. + "Slowfast networks for video recognition." 
+ https://arxiv.org/pdf/1812.03982.pdf + """ + def __init__( + self, + alpha, + beta, + bn_norm_type="batchnorm", + bn_num_splits=1, + num_pathways=2, + depth=50, + num_groups=1, + input_channel_num=[3, 3], + width_per_group=64, + fusion_conv_channel_ratio=2, + fusion_kernel_sz=7, #5? + pool_size_ratio=[[1, 1, 1], [1, 1, 1]], + fuse_bn_relu = 1, + spatial_strides = [[1, 1], [2, 2], [2, 2], [2, 2]], + use_pool_af_s2 = 1, + ): + """ + Args: + cfg (CfgNode): model building configs, details are in the + comments of the config file. + """ + super(ResNetSlowFast, self).__init__() + + self.alpha = alpha #8 + self.beta = beta #8 + self.norm_module = get_norm(bn_norm_type, bn_num_splits) + self.num_pathways = num_pathways + self.depth = depth + self.num_groups = num_groups + self.input_channel_num = input_channel_num + self.width_per_group = width_per_group + self.fusion_conv_channel_ratio = fusion_conv_channel_ratio + self.fusion_kernel_sz = fusion_kernel_sz # NOTE: modify to 7 in 8*8, 5 in old implement + self.pool_size_ratio = pool_size_ratio + self.fuse_bn_relu = fuse_bn_relu + self.spatial_strides = spatial_strides + self.use_pool_af_s2 = use_pool_af_s2 + self._construct_network() + + def _construct_network(self): + """ + Builds a SlowFast model. + The first pathway is the Slow pathway + and the second pathway is the Fast pathway. + + Args: + cfg (CfgNode): model building configs, details are in the + comments of the config file. + """ + temp_kernel = [ + [[1], [5]], # conv1 temporal kernel for slow and fast pathway. + [[1], [3]], # res2 temporal kernel for slow and fast pathway. + [[1], [3]], # res3 temporal kernel for slow and fast pathway. + [[3], [3]], # res4 temporal kernel for slow and fast pathway. + [[3], [3]], + ] # res5 temporal kernel for slow and fast pathway. 
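+
+        # Editor's note: each entry above pairs [slow, fast] temporal kernel
+        # sizes; the slow pathway only becomes temporal (kernel size 3) from
+        # res4 onwards, while the fast pathway uses temporal kernels at every
+        # stage, following the SlowFast design.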
+ + self.s1 = VideoModelStem( + dim_in=self.input_channel_num, + dim_out=[self.width_per_group, self.width_per_group // self.beta], + kernel=[temp_kernel[0][0] + [7, 7], temp_kernel[0][1] + [7, 7]], + stride=[[1, 2, 2]] * 2, + padding=[ + [temp_kernel[0][0][0] // 2, 3, 3], + [temp_kernel[0][1][0] // 2, 3, 3], + ], + norm_module=self.norm_module) + self.s1_fuse = FuseFastToSlow( + dim_in=self.width_per_group // self.beta, + fusion_conv_channel_ratio=self.fusion_conv_channel_ratio, + fusion_kernel=self.fusion_kernel_sz, + alpha=self.alpha, + norm_module=self.norm_module, + fuse_bn_relu=self.fuse_bn_relu) + + # ResNet backbone + MODEL_STAGE_DEPTH = {50: (3, 4, 6, 3)} + (d2, d3, d4, d5) = MODEL_STAGE_DEPTH[self.depth] + + num_block_temp_kernel = [[3, 3], [4, 4], [6, 6], [3, 3]] + spatial_dilations = [[1, 1], [1, 1], [1, 1], [1, 1]] + spatial_strides = self.spatial_strides + #spatial_strides = [[1, 1], [2, 2], [2, 2], [2, 2]] + #spatial_strides = [[1, 1], [2, 2], [2, 2], [1, 1]] #TODO:check which value is FAIR's impliment + + out_dim_ratio = self.beta // self.fusion_conv_channel_ratio #4 + dim_inner = self.width_per_group * self.num_groups #64 + + self.s2 = ResStage(dim_in=[ + self.width_per_group + self.width_per_group // out_dim_ratio, + self.width_per_group // self.beta, + ], + dim_out=[ + self.width_per_group * 4, + self.width_per_group * 4 // self.beta, + ], + dim_inner=[dim_inner, dim_inner // self.beta], + temp_kernel_sizes=temp_kernel[1], + stride=spatial_strides[0], + num_blocks=[d2] * 2, + num_groups=[self.num_groups] * 2, + num_block_temp_kernel=num_block_temp_kernel[0], + dilation=spatial_dilations[0], + norm_module=self.norm_module) + + self.s2_fuse = FuseFastToSlow( + dim_in=self.width_per_group * 4 // self.beta, + fusion_conv_channel_ratio=self.fusion_conv_channel_ratio, + fusion_kernel=self.fusion_kernel_sz, + alpha=self.alpha, + norm_module=self.norm_module, + fuse_bn_relu=self.fuse_bn_relu, + ) + + self.s3 = ResStage( + dim_in=[ + self.width_per_group * 4 + + self.width_per_group * 4 // out_dim_ratio, + self.width_per_group * 4 // self.beta, + ], + dim_out=[ + self.width_per_group * 8, + self.width_per_group * 8 // self.beta, + ], + dim_inner=[dim_inner * 2, dim_inner * 2 // self.beta], + temp_kernel_sizes=temp_kernel[2], + stride=spatial_strides[1], + num_blocks=[d3] * 2, + num_groups=[self.num_groups] * 2, + num_block_temp_kernel=num_block_temp_kernel[1], + dilation=spatial_dilations[1], + norm_module=self.norm_module, + ) + + self.s3_fuse = FuseFastToSlow( + dim_in=self.width_per_group * 8 // self.beta, + fusion_conv_channel_ratio=self.fusion_conv_channel_ratio, + fusion_kernel=self.fusion_kernel_sz, + alpha=self.alpha, + norm_module=self.norm_module, + fuse_bn_relu=self.fuse_bn_relu, + ) + + self.s4 = ResStage( + dim_in=[ + self.width_per_group * 8 + + self.width_per_group * 8 // out_dim_ratio, + self.width_per_group * 8 // self.beta, + ], + dim_out=[ + self.width_per_group * 16, + self.width_per_group * 16 // self.beta, + ], + dim_inner=[dim_inner * 4, dim_inner * 4 // self.beta], + temp_kernel_sizes=temp_kernel[3], + stride=spatial_strides[2], + num_blocks=[d4] * 2, + num_groups=[self.num_groups] * 2, + num_block_temp_kernel=num_block_temp_kernel[2], + dilation=spatial_dilations[2], + norm_module=self.norm_module, + ) + + self.s4_fuse = FuseFastToSlow( + dim_in=self.width_per_group * 16 // self.beta, + fusion_conv_channel_ratio=self.fusion_conv_channel_ratio, + fusion_kernel=self.fusion_kernel_sz, + alpha=self.alpha, + norm_module=self.norm_module, + 
fuse_bn_relu=self.fuse_bn_relu, + ) + + self.s5 = ResStage( + dim_in=[ + self.width_per_group * 16 + + self.width_per_group * 16 // out_dim_ratio, + self.width_per_group * 16 // self.beta, + ], + dim_out=[ + self.width_per_group * 32, + self.width_per_group * 32 // self.beta, + ], + dim_inner=[dim_inner * 8, dim_inner * 8 // self.beta], + temp_kernel_sizes=temp_kernel[4], + stride=spatial_strides[3], + num_blocks=[d5] * 2, + num_groups=[self.num_groups] * 2, + num_block_temp_kernel=num_block_temp_kernel[3], + dilation=spatial_dilations[3], + norm_module=self.norm_module, + ) + + def init_weights(self): + pass + + def forward(self, x): + x = self.s1(x) #VideoModelStem + x = self.s1_fuse(x) #FuseFastToSlow + x = self.s2(x) #ResStage + x = self.s2_fuse(x) + + # TODO: For AVA, set use_pool_af_s2=1, check mAP's improve. + if self.use_pool_af_s2: + for pathway in range(self.num_pathways): + x[pathway] = F.max_pool3d(x=x[pathway], + kernel_size=self.pool_size_ratio[pathway], + stride=self.pool_size_ratio[pathway], + padding=[0, 0, 0], + data_format="NCDHW") + + x = self.s3(x) + x = self.s3_fuse(x) + x = self.s4(x) + x = self.s4_fuse(x) + x = self.s5(x) + return x diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py new file mode 100644 index 0000000..d348d45 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnet_slowfast_MRI.py @@ -0,0 +1,796 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn.functional as F +from paddle.nn.initializer import KaimingNormal +from ..registry import BACKBONES +from paddlevideo.utils.multigrid import get_norm +import sys +import numpy as np +import paddle.distributed as dist + +# seed random seed +paddle.framework.seed(0) + + +# get init parameters for conv layer +def get_conv_init(fan_out): + return KaimingNormal(fan_in=fan_out) + + +def get_bn_param_attr(bn_weight=1.0, coeff=0.0): + param_attr = paddle.ParamAttr( + initializer=paddle.nn.initializer.Constant(bn_weight), + regularizer=paddle.regularizer.L2Decay(coeff)) + return param_attr + + +"""Video models.""" + + +class BottleneckTransform(paddle.nn.Layer): + """ + Bottleneck transformation: Tx1x1, 1x3x3, 1x1x1, where T is the size of + temporal kernel. + """ + def __init__(self, + dim_in, + dim_out, + temp_kernel_size, + stride, + dim_inner, + num_groups, + stride_1x1=False, + inplace_relu=True, + eps=1e-5, + dilation=1, + norm_module=paddle.nn.BatchNorm3D): + """ + Args: + dim_in (int): the channel dimensions of the input. + dim_out (int): the channel dimension of the output. + temp_kernel_size (int): the temporal kernel sizes of the middle + convolution in the bottleneck. + stride (int): the stride of the bottleneck. + dim_inner (int): the inner dimension of the block. + num_groups (int): number of groups for the convolution. 
num_groups=1 + is for standard ResNet like networks, and num_groups>1 is for + ResNeXt like networks. + stride_1x1 (bool): if True, apply stride to 1x1 conv, otherwise + apply stride to the 3x3 conv. + inplace_relu (bool): if True, calculate the relu on the original + input without allocating new memory. + eps (float): epsilon for batch norm. + dilation (int): size of dilation. + """ + super(BottleneckTransform, self).__init__() + self.temp_kernel_size = temp_kernel_size + self._inplace_relu = inplace_relu + self._eps = eps + self._stride_1x1 = stride_1x1 + self.norm_module = norm_module + self._construct(dim_in, dim_out, stride, dim_inner, num_groups, + dilation) + + def _construct(self, dim_in, dim_out, stride, dim_inner, num_groups, + dilation): + str1x1, str3x3 = (stride, 1) if self._stride_1x1 else (1, stride) + + fan = (dim_inner) * (self.temp_kernel_size * 1 * 1) + initializer_tmp = get_conv_init(fan) + + self.a = paddle.nn.Conv3D( + in_channels=dim_in, + out_channels=dim_inner, + kernel_size=[self.temp_kernel_size, 1, 1], + stride=[1, str1x1, str1x1], + padding=[int(self.temp_kernel_size // 2), 0, 0], + weight_attr=paddle.ParamAttr(initializer=initializer_tmp), + bias_attr=False) + self.a_bn = self.norm_module(num_features=dim_inner, + epsilon=self._eps, + weight_attr=get_bn_param_attr(), + bias_attr=get_bn_param_attr(bn_weight=0.0)) + + # 1x3x3, BN, ReLU. + fan = (dim_inner) * (1 * 3 * 3) + initializer_tmp = get_conv_init(fan) + + self.b = paddle.nn.Conv3D( + in_channels=dim_inner, + out_channels=dim_inner, + kernel_size=[1, 3, 3], + stride=[1, str3x3, str3x3], + padding=[0, dilation, dilation], + groups=num_groups, + dilation=[1, dilation, dilation], + weight_attr=paddle.ParamAttr(initializer=initializer_tmp), + bias_attr=False) + self.b_bn = self.norm_module(num_features=dim_inner, + epsilon=self._eps, + weight_attr=get_bn_param_attr(), + bias_attr=get_bn_param_attr(bn_weight=0.0)) + + # 1x1x1, BN. + fan = (dim_out) * (1 * 1 * 1) + initializer_tmp = get_conv_init(fan) + + self.c = paddle.nn.Conv3D( + in_channels=dim_inner, + out_channels=dim_out, + kernel_size=[1, 1, 1], + stride=[1, 1, 1], + padding=[0, 0, 0], + weight_attr=paddle.ParamAttr(initializer=initializer_tmp), + bias_attr=False) + self.c_bn = self.norm_module( + num_features=dim_out, + epsilon=self._eps, + weight_attr=get_bn_param_attr(bn_weight=0.0), + bias_attr=get_bn_param_attr(bn_weight=0.0)) + + def forward(self, x): + # Branch2a. + x = self.a(x) + x = self.a_bn(x) + x = F.relu(x) + + # Branch2b. + x = self.b(x) + x = self.b_bn(x) + x = F.relu(x) + + # Branch2c + x = self.c(x) + x = self.c_bn(x) + return x + + +class ResBlock(paddle.nn.Layer): + """ + Residual block. + """ + def __init__(self, + dim_in, + dim_out, + temp_kernel_size, + stride, + dim_inner, + num_groups=1, + stride_1x1=False, + inplace_relu=True, + eps=1e-5, + dilation=1, + norm_module=paddle.nn.BatchNorm3D): + """ + ResBlock class constructs redisual blocks. More details can be found in: + Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun. + "Deep residual learning for image recognition." + https://arxiv.org/abs/1512.03385 + Args: + dim_in (int): the channel dimensions of the input. + dim_out (int): the channel dimension of the output. + temp_kernel_size (int): the temporal kernel sizes of the middle + convolution in the bottleneck. + stride (int): the stride of the bottleneck. + trans_func (string): transform function to be used to construct the + bottleneck. + dim_inner (int): the inner dimension of the block. 
+ num_groups (int): number of groups for the convolution. num_groups=1 + is for standard ResNet like networks, and num_groups>1 is for + ResNeXt like networks. + stride_1x1 (bool): if True, apply stride to 1x1 conv, otherwise + apply stride to the 3x3 conv. + inplace_relu (bool): calculate the relu on the original input + without allocating new memory. + eps (float): epsilon for batch norm. + dilation (int): size of dilation. + """ + super(ResBlock, self).__init__() + self._inplace_relu = inplace_relu + self._eps = eps + self.norm_module = norm_module + self._construct( + dim_in, + dim_out, + temp_kernel_size, + stride, + dim_inner, + num_groups, + stride_1x1, + inplace_relu, + dilation, + ) + + def _construct( + self, + dim_in, + dim_out, + temp_kernel_size, + stride, + dim_inner, + num_groups, + stride_1x1, + inplace_relu, + dilation, + ): + # Use skip connection with projection if dim or res change. + if (dim_in != dim_out) or (stride != 1): + fan = (dim_out) * (1 * 1 * 1) + initializer_tmp = get_conv_init(fan) + self.branch1 = paddle.nn.Conv3D( + in_channels=dim_in, + out_channels=dim_out, + kernel_size=1, + stride=[1, stride, stride], + padding=0, + weight_attr=paddle.ParamAttr(initializer=initializer_tmp), + bias_attr=False, + dilation=1) + self.branch1_bn = self.norm_module( + num_features=dim_out, + epsilon=self._eps, + weight_attr=get_bn_param_attr(), + bias_attr=get_bn_param_attr(bn_weight=0.0)) + + self.branch2 = BottleneckTransform(dim_in, + dim_out, + temp_kernel_size, + stride, + dim_inner, + num_groups, + stride_1x1=stride_1x1, + inplace_relu=inplace_relu, + dilation=dilation, + norm_module=self.norm_module) + + def forward(self, x): + if hasattr(self, "branch1"): + x1 = self.branch1(x) + x1 = self.branch1_bn(x1) + x2 = self.branch2(x) + x = paddle.add(x=x1, y=x2) + else: + x2 = self.branch2(x) + x = paddle.add(x=x, y=x2) + + x = F.relu(x) + return x + + +class ResStage(paddle.nn.Layer): + """ + Stage of 3D ResNet. It expects to have one or more tensors as input for + multi-pathway (SlowFast) cases. More details can be found here: + + Christoph Feichtenhofer, Haoqi Fan, Jitendra Malik, and Kaiming He. + "Slowfast networks for video recognition." + https://arxiv.org/pdf/1812.03982.pdf + """ + def __init__(self, + dim_in, + dim_out, + stride, + temp_kernel_sizes, + num_blocks, + dim_inner, + num_groups, + num_block_temp_kernel, + dilation, + stride_1x1=False, + inplace_relu=True, + norm_module=paddle.nn.BatchNorm3D): + """ + The `__init__` method of any subclass should also contain these arguments. + ResStage builds p streams, where p can be greater or equal to one. + Args: + dim_in (list): list of p the channel dimensions of the input. + Different channel dimensions control the input dimension of + different pathways. + dim_out (list): list of p the channel dimensions of the output. + Different channel dimensions control the input dimension of + different pathways. + temp_kernel_sizes (list): list of the p temporal kernel sizes of the + convolution in the bottleneck. Different temp_kernel_sizes + control different pathway. + stride (list): list of the p strides of the bottleneck. Different + stride control different pathway. + num_blocks (list): list of p numbers of blocks for each of the + pathway. + dim_inner (list): list of the p inner channel dimensions of the + input. Different channel dimensions control the input dimension + of different pathways. + num_groups (list): list of number of p groups for the convolution. 
+ num_groups=1 is for standard ResNet like networks, and + num_groups>1 is for ResNeXt like networks. + num_block_temp_kernel (list): extent the temp_kernel_sizes to + num_block_temp_kernel blocks, then fill temporal kernel size + of 1 for the rest of the layers. + dilation (list): size of dilation for each pathway. + """ + super(ResStage, self).__init__() + assert all((num_block_temp_kernel[i] <= num_blocks[i] + for i in range(len(temp_kernel_sizes)))) + self.num_blocks = num_blocks + self.temp_kernel_sizes = [ + (temp_kernel_sizes[i] * num_blocks[i])[:num_block_temp_kernel[i]] + + [1] * (num_blocks[i] - num_block_temp_kernel[i]) + for i in range(len(temp_kernel_sizes)) + ] + assert (len({ + len(dim_in), + len(dim_out), + len(temp_kernel_sizes), + len(stride), + len(num_blocks), + len(dim_inner), + len(num_groups), + len(num_block_temp_kernel), + }) == 1) + self.num_pathways = len(self.num_blocks) + self.norm_module = norm_module + self._construct( + dim_in, + dim_out, + stride, + dim_inner, + num_groups, + stride_1x1, + inplace_relu, + dilation, + ) + + def _construct( + self, + dim_in, + dim_out, + stride, + dim_inner, + num_groups, + stride_1x1, + inplace_relu, + dilation, + ): + + for pathway in range(self.num_pathways): + for i in range(self.num_blocks[pathway]): + res_block = ResBlock( + dim_in[pathway] if i == 0 else dim_out[pathway], + dim_out[pathway], + self.temp_kernel_sizes[pathway][i], + stride[pathway] if i == 0 else 1, + dim_inner[pathway], + num_groups[pathway], + stride_1x1=stride_1x1, + inplace_relu=inplace_relu, + dilation=dilation[pathway], + norm_module=self.norm_module) + self.add_sublayer("pathway{}_res{}".format(pathway, i), + res_block) + + def forward(self, inputs): + output = [] + for pathway in range(self.num_pathways): + x = inputs[pathway] + + for i in range(self.num_blocks[pathway]): + m = getattr(self, "pathway{}_res{}".format(pathway, i)) + x = m(x) + output.append(x) + + return output + + +class ResNetBasicStem(paddle.nn.Layer): + """ + ResNe(X)t 3D stem module. + Performs spatiotemporal Convolution, BN, and Relu following by a + spatiotemporal pooling. + """ + def __init__(self, + dim_in, + dim_out, + kernel, + stride, + padding, + eps=1e-5, + norm_module=paddle.nn.BatchNorm3D): + super(ResNetBasicStem, self).__init__() + self.kernel = kernel + self.stride = stride + self.padding = padding + self.eps = eps + self.norm_module = norm_module + self._construct_stem(dim_in, dim_out) + + def _construct_stem(self, dim_in, dim_out): + fan = (dim_out) * (self.kernel[0] * self.kernel[1] * self.kernel[2]) + initializer_tmp = get_conv_init(fan) + + self._conv = paddle.nn.Conv3D( + in_channels=dim_in, + out_channels=dim_out, + kernel_size=self.kernel, + stride=self.stride, + padding=self.padding, + weight_attr=paddle.ParamAttr(initializer=initializer_tmp), + bias_attr=False) + self._bn = self.norm_module(num_features=dim_out, + epsilon=self.eps, + weight_attr=get_bn_param_attr(), + bias_attr=get_bn_param_attr(bn_weight=0.0)) + + def forward(self, x): + x = self._conv(x) + x = self._bn(x) + x = F.relu(x) + + x = F.max_pool3d(x=x, + kernel_size=[1, 3, 3], + stride=[1, 2, 2], + padding=[0, 1, 1], + data_format="NCDHW") + return x + + +class VideoModelStem(paddle.nn.Layer): + """ + Video 3D stem module. Provides stem operations of Conv, BN, ReLU, MaxPool + on input data tensor for slow and fast pathways. 
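+
+    Example (editor's sketch based on the ``s1`` stem assembled in
+    ``ResNetSlowFast_MRI._construct_network`` below, assuming
+    width_per_group=64, beta=8 and single-channel inputs,
+    i.e. input_channel_num=[1, 1]):
+
+        VideoModelStem(dim_in=[1, 1], dim_out=[64, 8],
+                       kernel=[[1, 7, 7], [5, 7, 7]],
+                       stride=[[1, 2, 2]] * 2,
+                       padding=[[0, 3, 3], [2, 3, 3]])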
+ """ + def __init__(self, + dim_in, + dim_out, + kernel, + stride, + padding, + eps=1e-5, + norm_module=paddle.nn.BatchNorm3D): + """ + Args: + dim_in (list): the list of channel dimensions of the inputs. + dim_out (list): the output dimension of the convolution in the stem + layer. + kernel (list): the kernels' size of the convolutions in the stem + layers. Temporal kernel size, height kernel size, width kernel + size in order. + stride (list): the stride sizes of the convolutions in the stem + layer. Temporal kernel stride, height kernel size, width kernel + size in order. + padding (list): the paddings' sizes of the convolutions in the stem + layer. Temporal padding size, height padding size, width padding + size in order. + eps (float): epsilon for batch norm. + """ + super(VideoModelStem, self).__init__() + + assert (len({ + len(dim_in), + len(dim_out), + len(kernel), + len(stride), + len(padding), + }) == 1), "Input pathway dimensions are not consistent." + self.num_pathways = len(dim_in) + self.kernel = kernel + self.stride = stride + self.padding = padding + self.eps = eps + self.norm_module = norm_module + self._construct_stem(dim_in, dim_out) + + def _construct_stem(self, dim_in, dim_out): + for pathway in range(len(dim_in)): + stem = ResNetBasicStem(dim_in[pathway], dim_out[pathway], + self.kernel[pathway], self.stride[pathway], + self.padding[pathway], self.eps, + self.norm_module) + self.add_sublayer("pathway{}_stem".format(pathway), stem) + + def forward(self, x): + assert (len(x) == self.num_pathways + ), "Input tensor does not contain {} pathway".format( + self.num_pathways) + + for pathway in range(len(x)): + m = getattr(self, "pathway{}_stem".format(pathway)) + x[pathway] = m(x[pathway]) + + return x + + +class FuseFastToSlow(paddle.nn.Layer): + """ + Fuses the information from the Fast pathway to the Slow pathway. Given the + tensors from Slow pathway and Fast pathway, fuse information from Fast to + Slow, then return the fused tensors from Slow and Fast pathway in order. + """ + def __init__(self, + dim_in, + fusion_conv_channel_ratio, + fusion_kernel, + alpha, + fuse_bn_relu=1, + eps=1e-5, + norm_module=paddle.nn.BatchNorm3D): + """ + Args: + dim_in (int): the channel dimension of the input. + fusion_conv_channel_ratio (int): channel ratio for the convolution + used to fuse from Fast pathway to Slow pathway. + fusion_kernel (int): kernel size of the convolution used to fuse + from Fast pathway to Slow pathway. + alpha (int): the frame rate ratio between the Fast and Slow pathway. + eps (float): epsilon for batch norm. + """ + super(FuseFastToSlow, self).__init__() + self.fuse_bn_relu = fuse_bn_relu + fan = (dim_in * fusion_conv_channel_ratio) * (fusion_kernel * 1 * 1) + initializer_tmp = get_conv_init(fan) + + self._conv_f2s = paddle.nn.Conv3D( + in_channels=dim_in, + out_channels=dim_in * fusion_conv_channel_ratio, + kernel_size=[fusion_kernel, 1, 1], + stride=[alpha, 1, 1], + padding=[fusion_kernel // 2, 0, 0], + weight_attr=paddle.ParamAttr(initializer=initializer_tmp), + bias_attr=False) + self._bn = norm_module(num_features=dim_in * fusion_conv_channel_ratio, + epsilon=eps, + weight_attr=get_bn_param_attr(), + bias_attr=get_bn_param_attr(bn_weight=0.0)) + + def forward(self, x): + x_s = x[0] + x_f = x[1] + fuse = self._conv_f2s(x_f) + # TODO: For AVA, set fuse_bn_relu=1, check mAP's improve. 
+ if self.fuse_bn_relu: + fuse = self._bn(fuse) + fuse = F.relu(fuse) + x_s_fuse = paddle.concat(x=[x_s, fuse], axis=1, name=None) + + return [x_s_fuse, x_f] + + +@BACKBONES.register() +class ResNetSlowFast_MRI(paddle.nn.Layer): + """ + SlowFast model builder for SlowFast network. + + Christoph Feichtenhofer, Haoqi Fan, Jitendra Malik, and Kaiming He. + "Slowfast networks for video recognition." + https://arxiv.org/pdf/1812.03982.pdf + """ + def __init__( + self, + alpha, + beta, + bn_norm_type="batchnorm", + bn_num_splits=1, + num_pathways=2, + depth=50, + num_groups=1, + input_channel_num=[1, 1], + width_per_group=64, + fusion_conv_channel_ratio=2, + fusion_kernel_sz=7, #5? + pool_size_ratio=[[1, 1, 1], [1, 1, 1]], + fuse_bn_relu=1, + spatial_strides=[[1, 1], [2, 2], [2, 2], [2, 2]], + use_pool_af_s2=1, + ): + """ + Args: + cfg (CfgNode): model building configs, details are in the + comments of the config file. + """ + super(ResNetSlowFast_MRI, self).__init__() + + self.alpha = alpha #8 + self.beta = beta #8 + self.norm_module = get_norm(bn_norm_type, bn_num_splits) + self.num_pathways = num_pathways + self.depth = depth + self.num_groups = num_groups + self.input_channel_num = input_channel_num + self.width_per_group = width_per_group + self.fusion_conv_channel_ratio = fusion_conv_channel_ratio + self.fusion_kernel_sz = fusion_kernel_sz # NOTE: modify to 7 in 8*8, 5 in old implement + self.pool_size_ratio = pool_size_ratio + self.fuse_bn_relu = fuse_bn_relu + self.spatial_strides = spatial_strides + self.use_pool_af_s2 = use_pool_af_s2 + self._construct_network() + + def _construct_network(self): + """ + Builds a SlowFast model. + The first pathway is the Slow pathway + and the second pathway is the Fast pathway. + + Args: + cfg (CfgNode): model building configs, details are in the + comments of the config file. + """ + temp_kernel = [ + [[1], [5]], # conv1 temporal kernel for slow and fast pathway. + [[1], [3]], # res2 temporal kernel for slow and fast pathway. + [[1], [3]], # res3 temporal kernel for slow and fast pathway. + [[3], [3]], # res4 temporal kernel for slow and fast pathway. + [[3], [3]], + ] # res5 temporal kernel for slow and fast pathway. 
+ + self.s1 = VideoModelStem( + dim_in=self.input_channel_num, + dim_out=[self.width_per_group, self.width_per_group // self.beta], + kernel=[temp_kernel[0][0] + [7, 7], temp_kernel[0][1] + [7, 7]], + stride=[[1, 2, 2]] * 2, + padding=[ + [temp_kernel[0][0][0] // 2, 3, 3], + [temp_kernel[0][1][0] // 2, 3, 3], + ], + norm_module=self.norm_module) + self.s1_fuse = FuseFastToSlow( + dim_in=self.width_per_group // self.beta, + fusion_conv_channel_ratio=self.fusion_conv_channel_ratio, + fusion_kernel=self.fusion_kernel_sz, + alpha=self.alpha, + norm_module=self.norm_module, + fuse_bn_relu=self.fuse_bn_relu) + + # ResNet backbone + MODEL_STAGE_DEPTH = {50: (3, 4, 6, 3)} + (d2, d3, d4, d5) = MODEL_STAGE_DEPTH[self.depth] + + num_block_temp_kernel = [[3, 3], [4, 4], [6, 6], [3, 3]] + spatial_dilations = [[1, 1], [1, 1], [1, 1], [1, 1]] + spatial_strides = self.spatial_strides + #spatial_strides = [[1, 1], [2, 2], [2, 2], [2, 2]] + #spatial_strides = [[1, 1], [2, 2], [2, 2], [1, 1]] #TODO:check which value is FAIR's impliment + + out_dim_ratio = self.beta // self.fusion_conv_channel_ratio #4 + dim_inner = self.width_per_group * self.num_groups #64 + + self.s2 = ResStage(dim_in=[ + self.width_per_group + self.width_per_group // out_dim_ratio, + self.width_per_group // self.beta, + ], + dim_out=[ + self.width_per_group * 4, + self.width_per_group * 4 // self.beta, + ], + dim_inner=[dim_inner, dim_inner // self.beta], + temp_kernel_sizes=temp_kernel[1], + stride=spatial_strides[0], + num_blocks=[d2] * 2, + num_groups=[self.num_groups] * 2, + num_block_temp_kernel=num_block_temp_kernel[0], + dilation=spatial_dilations[0], + norm_module=self.norm_module) + + self.s2_fuse = FuseFastToSlow( + dim_in=self.width_per_group * 4 // self.beta, + fusion_conv_channel_ratio=self.fusion_conv_channel_ratio, + fusion_kernel=self.fusion_kernel_sz, + alpha=self.alpha, + norm_module=self.norm_module, + fuse_bn_relu=self.fuse_bn_relu, + ) + + self.s3 = ResStage( + dim_in=[ + self.width_per_group * 4 + + self.width_per_group * 4 // out_dim_ratio, + self.width_per_group * 4 // self.beta, + ], + dim_out=[ + self.width_per_group * 8, + self.width_per_group * 8 // self.beta, + ], + dim_inner=[dim_inner * 2, dim_inner * 2 // self.beta], + temp_kernel_sizes=temp_kernel[2], + stride=spatial_strides[1], + num_blocks=[d3] * 2, + num_groups=[self.num_groups] * 2, + num_block_temp_kernel=num_block_temp_kernel[1], + dilation=spatial_dilations[1], + norm_module=self.norm_module, + ) + + self.s3_fuse = FuseFastToSlow( + dim_in=self.width_per_group * 8 // self.beta, + fusion_conv_channel_ratio=self.fusion_conv_channel_ratio, + fusion_kernel=self.fusion_kernel_sz, + alpha=self.alpha, + norm_module=self.norm_module, + fuse_bn_relu=self.fuse_bn_relu, + ) + + self.s4 = ResStage( + dim_in=[ + self.width_per_group * 8 + + self.width_per_group * 8 // out_dim_ratio, + self.width_per_group * 8 // self.beta, + ], + dim_out=[ + self.width_per_group * 16, + self.width_per_group * 16 // self.beta, + ], + dim_inner=[dim_inner * 4, dim_inner * 4 // self.beta], + temp_kernel_sizes=temp_kernel[3], + stride=spatial_strides[2], + num_blocks=[d4] * 2, + num_groups=[self.num_groups] * 2, + num_block_temp_kernel=num_block_temp_kernel[2], + dilation=spatial_dilations[2], + norm_module=self.norm_module, + ) + + self.s4_fuse = FuseFastToSlow( + dim_in=self.width_per_group * 16 // self.beta, + fusion_conv_channel_ratio=self.fusion_conv_channel_ratio, + fusion_kernel=self.fusion_kernel_sz, + alpha=self.alpha, + norm_module=self.norm_module, + 
fuse_bn_relu=self.fuse_bn_relu, + ) + + self.s5 = ResStage( + dim_in=[ + self.width_per_group * 16 + + self.width_per_group * 16 // out_dim_ratio, + self.width_per_group * 16 // self.beta, + ], + dim_out=[ + self.width_per_group * 32, + self.width_per_group * 32 // self.beta, + ], + dim_inner=[dim_inner * 8, dim_inner * 8 // self.beta], + temp_kernel_sizes=temp_kernel[4], + stride=spatial_strides[3], + num_blocks=[d5] * 2, + num_groups=[self.num_groups] * 2, + num_block_temp_kernel=num_block_temp_kernel[3], + dilation=spatial_dilations[3], + norm_module=self.norm_module, + ) + + def init_weights(self): + pass + + def forward(self, x): + x = self.s1(x) #VideoModelStem + x = self.s1_fuse(x) #FuseFastToSlow + x = self.s2(x) #ResStage + x = self.s2_fuse(x) + + # TODO: For AVA, set use_pool_af_s2=1, check mAP's improve. + if self.use_pool_af_s2: + for pathway in range(self.num_pathways): + x[pathway] = F.max_pool3d( + x=x[pathway], + kernel_size=self.pool_size_ratio[pathway], + stride=self.pool_size_ratio[pathway], + padding=[0, 0, 0], + data_format="NCDHW") + + x = self.s3(x) + x = self.s3_fuse(x) + x = self.s4(x) + x = self.s4_fuse(x) + x = self.s5(x) + return x diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnet_tsm.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnet_tsm.py new file mode 100644 index 0000000..70788ec --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnet_tsm.py @@ -0,0 +1,353 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +from paddle.nn import (Conv2D, BatchNorm2D, Linear, Dropout, MaxPool2D, + AvgPool2D) +from paddle import ParamAttr +import paddle.nn.functional as F +from paddle.regularizer import L2Decay +from ..registry import BACKBONES +from ..weight_init import weight_init_ +from ...utils import load_ckpt + + +class ConvBNLayer(nn.Layer): + """Conv2D and BatchNorm2D layer. + + Args: + in_channels (int): Number of channels for the input. + out_channels (int): Number of channels for the output. + kernel_size (int): Kernel size. + stride (int): Stride in the Conv2D layer. Default: 1. + groups (int): Groups in the Conv2D, Default: 1. + act (str): Indicate activation after BatchNorm2D layer. + name (str): the name of an instance of ConvBNLayer. + Note: weight and bias initialization include initialize values and name the restored parameters, values initialization are explicit declared in the ```init_weights``` method. 
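+
+    Example (editor's sketch; it mirrors the stem convolution created in
+    ``ResNetTSM`` below):
+
+        conv1 = ConvBNLayer(in_channels=3, out_channels=64, kernel_size=7,
+                            stride=2, act="relu", name="conv1")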
+ + """ + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + groups=1, + act=None, + name=None, + data_format="NCHW"): + super(ConvBNLayer, self).__init__() + self._conv = Conv2D(in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False, + data_format=data_format) + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + + self._act = act + + self._batch_norm = BatchNorm2D( + out_channels, + weight_attr=ParamAttr(name=bn_name + "_scale", + regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(name=bn_name + "_offset", + regularizer=L2Decay(0.0)), + data_format=data_format) + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + if self._act: + y = getattr(paddle.nn.functional, self._act)(y) + return y + + +class BottleneckBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + num_seg=8, + name=None, + data_format="NCHW"): + super(BottleneckBlock, self).__init__() + self.data_format = data_format + self.conv0 = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + act="relu", + name=name + "_branch2a", + data_format=data_format) + self.conv1 = ConvBNLayer(in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + act="relu", + name=name + "_branch2b", + data_format=data_format) + + self.conv2 = ConvBNLayer(in_channels=out_channels, + out_channels=out_channels * 4, + kernel_size=1, + act=None, + name=name + "_branch2c", + data_format=data_format) + + if not shortcut: + self.short = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels * 4, + kernel_size=1, + stride=stride, + name=name + "_branch1", + data_format=data_format) + + self.shortcut = shortcut + self.num_seg = num_seg + + def forward(self, inputs): + if paddle.is_compiled_with_custom_device('npu'): + x = inputs + seg_num = self.num_seg + shift_ratio = 1.0 / self.num_seg + + shape = x.shape #[N*T, C, H, W] + reshape_x = x.reshape( + (-1, seg_num, shape[1], shape[2], shape[3])) #[N, T, C, H, W] + pad_x = F.pad(reshape_x, [ + 0, + 0, + 1, + 1, + 0, + 0, + 0, + 0, + 0, + 0, + ]) #[N, T+2, C, H, W] + c1 = int(shape[1] * shift_ratio) + c2 = int(shape[1] * 2 * shift_ratio) + slice1 = pad_x[:, :seg_num, :c1, :, :] + slice2 = pad_x[:, 2:seg_num + 2, c1:c2, :, :] + slice3 = pad_x[:, 1:seg_num + 1, c2:, :, :] + concat_x = paddle.concat([slice1, slice2, slice3], + axis=2) #[N, T, C, H, W] + shifts = concat_x.reshape(shape) + else: + shifts = F.temporal_shift(inputs, + self.num_seg, + 1.0 / self.num_seg, + data_format=self.data_format) + + y = self.conv0(shifts) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=conv2) + return F.relu(y) + + +class BasicBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + name=None, + data_format="NCHW"): + super(BasicBlock, self).__init__() + self.stride = stride + self.conv0 = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + filter_size=3, + stride=stride, + act="relu", + name=name + "_branch2a", + data_format=data_format, + ) + self.conv1 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels, + filter_size=3, + act=None, + name=name + "_branch2b", + data_format=data_format, + ) 
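+
+        # Editor's note: ``ConvBNLayer`` in this file takes ``kernel_size``,
+        # not ``filter_size``, so the ``filter_size=...`` keywords used in
+        # this BasicBlock (and in ``self.short`` below) look like a typo that
+        # would raise a TypeError if the 18/34-layer variants were built;
+        # ``kernel_size`` is presumably intended. The same code path in
+        # ``ResNetTSM`` also indexes ``in_channels[block]`` although
+        # ``in_channels`` is an int there.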
+ + if not shortcut: + self.short = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + filter_size=1, + stride=stride, + name=name + "_branch1", + data_format=data_format, + ) + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(short, conv1) + y = F.relu(y) + return y + + +@BACKBONES.register() +class ResNetTSM(nn.Layer): + """ResNet TSM backbone. + + Args: + depth (int): Depth of resnet model. + pretrained (str): pretrained model. Default: None. + """ + def __init__(self, depth, num_seg=8, data_format="NCHW", pretrained=None): + super(ResNetTSM, self).__init__() + self.pretrained = pretrained + self.layers = depth + self.num_seg = num_seg + self.data_format = data_format + + supported_layers = [18, 34, 50, 101, 152] + assert self.layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, self.layers) + + if self.layers == 18: + depth = [2, 2, 2, 2] + elif self.layers == 34 or self.layers == 50: + depth = [3, 4, 6, 3] + elif self.layers == 101: + depth = [3, 4, 23, 3] + elif self.layers == 152: + depth = [3, 8, 36, 3] + + in_channels = 64 + out_channels = [64, 128, 256, 512] + + self.conv = ConvBNLayer(in_channels=3, + out_channels=64, + kernel_size=7, + stride=2, + act="relu", + name="conv1", + data_format=self.data_format) + self.pool2D_max = MaxPool2D( + kernel_size=3, + stride=2, + padding=1, + data_format=self.data_format, + ) + + self.block_list = [] + if self.layers >= 50: + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + if self.layers in [101, 152] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + bottleneck_block = self.add_sublayer( + conv_name, + BottleneckBlock( + in_channels=in_channels + if i == 0 else out_channels[block] * 4, + out_channels=out_channels[block], + stride=2 if i == 0 and block != 0 else 1, + num_seg=self.num_seg, + shortcut=shortcut, + name=conv_name, + data_format=self.data_format)) + in_channels = out_channels[block] * 4 + self.block_list.append(bottleneck_block) + shortcut = True + else: + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + conv_name = "res" + str(block + 2) + chr(97 + i) + basic_block = self.add_sublayer( + conv_name, + BasicBlock( + in_channels=in_channels[block] + if i == 0 else out_channels[block], + out_channels=out_channels[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + name=conv_name, + data_format=self.data_format, + )) + self.block_list.append(basic_block) + shortcut = True + + def init_weights(self): + """Initiate the parameters. + Note: + 1. when indicate pretrained loading path, will load it to initiate backbone. + 2. when not indicating pretrained loading path, will follow specific initialization initiate backbone. Always, Conv2D layer will be initiated by KaimingNormal function, and BatchNorm2d will be initiated by Constant function. + Please refer to https://www.paddlepaddle.org.cn/documentation/docs/en/develop/api/paddle/nn/initializer/kaiming/KaimingNormal_en.html + """ + #XXX: check bias!!! check pretrained!!! 
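Aside (illustrative, not part of the patch): a minimal forward pass for the backbone defined above. The import path is assumed from the file layout added in this diff; the shapes are examples only.

import paddle
from paddlevideo.modeling.backbones.resnet_tsm import ResNetTSM   # path assumed from this patch

num_seg = 8
backbone = ResNetTSM(depth=50, num_seg=num_seg)
backbone.init_weights()                       # random init, since pretrained is None

# The recognizer merges the batch and segment axes before calling the backbone,
# so the expected input is [N*T, 3, H, W].
frames = paddle.rand([2 * num_seg, 3, 224, 224])
feat = backbone(frames)
print(feat.shape)                             # [16, 2048, 7, 7] for depth=50 at 224x224 input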
+ + if isinstance(self.pretrained, str) and self.pretrained.strip() != "": + load_ckpt(self, self.pretrained) + elif self.pretrained is None or self.pretrained.strip() == "": + for layer in self.sublayers(): + if isinstance(layer, nn.Conv2D): + #XXX: no bias + weight_init_(layer, 'KaimingNormal') + elif isinstance(layer, nn.BatchNorm2D): + weight_init_(layer, 'Constant', value=1) + + def forward(self, inputs): + """Define how the backbone is going to run. + + """ + #NOTE: (deprecated design) Already merge axis 0(batches) and axis 1(clips) before extracting feature phase, + # please refer to paddlevideo/modeling/framework/recognizers/recognizer2d.py#L27 + #y = paddle.reshape( + # inputs, [-1, inputs.shape[2], inputs.shape[3], inputs.shape[4]]) + + #NOTE: As paddlepaddle to_static method need a "pure" model to trim. It means from + # 1. the phase of generating data[images, label] from dataloader + # to + # 2. last layer of a model, always is FC layer + + y = self.conv(inputs) + y = self.pool2D_max(y) + for block in self.block_list: + y = block(y) + return y diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnet_tsm_MRI.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnet_tsm_MRI.py new file mode 100644 index 0000000..e814f0f --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnet_tsm_MRI.py @@ -0,0 +1,327 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import math + +import sys +import paddle +import paddle.nn as nn +from paddle.nn import (Conv2D, BatchNorm2D, Linear, Dropout, MaxPool2D, + AvgPool2D) +from paddle import ParamAttr +import paddle.nn.functional as F + +from ..registry import BACKBONES +from ..weight_init import weight_init_ +from ...utils.save_load import load_ckpt +from paddle.regularizer import L2Decay + + +class ConvBNLayer(nn.Layer): + """Conv2D and BatchNorm2D layer. + + Args: + in_channels (int): Number of channels for the input. + out_channels (int): Number of channels for the output. + kernel_size (int): Kernel size. + stride (int): Stride in the Conv2D layer. Default: 1. + groups (int): Groups in the Conv2D, Default: 1. + is_tweaks_mode (bool): switch for tweaks. Default: False. + act (str): Indicate activation after BatchNorm2D layer. + name (str): the name of an instance of ConvBNLayer. + + Note: weight and bias initialization include initialize values and name the restored parameters, values initialization are explicit declared in the ```init_weights``` method. + + """ + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + groups=1, + is_tweaks_mode=False, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + self.is_tweaks_mode = is_tweaks_mode + #ResNet-D 1/2:add a 2×2 average pooling layer with a stride of 2 before the convolution, + # whose stride is changed to 1, works well in practice. 
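Aside (illustrative, not part of the patch): the "ResNet-D" comment above refers to replacing the strided 1x1 shortcut convolution, which discards three quarters of its input, with a 2x2 average pool followed by a stride-1 1x1 convolution. A minimal sketch with example shapes:

import paddle
import paddle.nn as nn

x = paddle.rand([1, 64, 56, 56])

plain = nn.Conv2D(64, 256, kernel_size=1, stride=2)                   # original strided shortcut
tweak = nn.Sequential(nn.AvgPool2D(kernel_size=2, stride=2, ceil_mode=True),
                      nn.Conv2D(64, 256, kernel_size=1, stride=1))    # ResNet-D shortcut

print(plain(x).shape, tweak(x).shape)         # both [1, 256, 28, 28], but the pooled path sees every pixel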
+ self._pool2d_avg = AvgPool2D(kernel_size=2, + stride=2, + padding=0, + ceil_mode=True) + + self._conv = Conv2D(in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + + self._act = act + + self._batch_norm = BatchNorm2D( + out_channels, + weight_attr=ParamAttr(name=bn_name + "_scale", + regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(bn_name + "_offset", regularizer=L2Decay(0.0))) + + def forward(self, inputs): + if self.is_tweaks_mode: + inputs = self._pool2d_avg(inputs) + y = self._conv(inputs) + y = self._batch_norm(y) + if self._act: + y = getattr(paddle.nn.functional, self._act)(y) + return y + + +class BottleneckBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + if_first=False, + num_seg=8, + name=None): + super(BottleneckBlock, self).__init__() + self.conv0 = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + act="leaky_relu", + name=name + "_branch2a") + self.conv1 = ConvBNLayer(in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + act="leaky_relu", + name=name + "_branch2b") + + self.conv2 = ConvBNLayer(in_channels=out_channels, + out_channels=out_channels * 4, + kernel_size=1, + act=None, + name=name + "_branch2c") + + if not shortcut: + self.short = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels * 4, + kernel_size=1, + stride= + 1, #ResNet-D 2/2:add a 2×2 average pooling layer with a stride of 2 before the convolution, + # whose stride is changed to 1, works well in practice. + is_tweaks_mode=False if if_first else True, + name=name + "_branch1") + + self.shortcut = shortcut + self.num_seg = num_seg + + def forward(self, inputs): + shifts = F.temporal_shift(inputs, self.num_seg, 1.0 / self.num_seg) + y = self.conv0(shifts) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=conv2) + return F.leaky_relu(y) + + +class BasicBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + name=None): + super(BasicBlock, self).__init__() + self.stride = stride + self.conv0 = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels, + filter_size=3, + stride=stride, + act="leaky_relu", + name=name + "_branch2a") + self.conv1 = ConvBNLayer(in_channels=out_channels, + out_channels=out_channels, + filter_size=3, + act=None, + name=name + "_branch2b") + + if not shortcut: + self.short = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels, + filter_size=1, + stride=stride, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(short, conv1) + y = F.leaky_relu(y) + return y + + +@BACKBONES.register() +class ResNetTSM_MRI(nn.Layer): + """ResNet TSM backbone. + + Args: + depth (int): Depth of resnet model. + pretrained (str): pretrained model. Default: None. 
+ """ + def __init__(self, depth, num_seg=8, pretrained=None, in_channels=1): + super(ResNetTSM_MRI, self).__init__() + self.pretrained = pretrained + self.layers = depth + self.num_seg = num_seg + self.in_channels = in_channels + + supported_layers = [18, 34, 50, 101, 152] + assert self.layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, self.layers) + + if self.layers == 18: + depth = [2, 2, 2, 2] + elif self.layers == 34 or self.layers == 50: + depth = [3, 4, 6, 3] + elif self.layers == 101: + depth = [3, 4, 23, 3] + elif self.layers == 152: + depth = [3, 8, 36, 3] + + in_channels = 64 + out_channels = [64, 128, 256, 512] + + #ResNet-C: use three 3x3 conv, replace, one 7x7 conv + self.conv1_1 = ConvBNLayer(in_channels=self.in_channels, + out_channels=32, + kernel_size=3, + stride=2, + act='leaky_relu', + name="conv1_1") + self.conv1_2 = ConvBNLayer(in_channels=32, + out_channels=32, + kernel_size=3, + stride=1, + act='leaky_relu', + name="conv1_2") + self.conv1_3 = ConvBNLayer(in_channels=32, + out_channels=64, + kernel_size=3, + stride=1, + act='leaky_relu', + name="conv1_3") + self.pool2D_max = MaxPool2D(kernel_size=3, stride=2, padding=1) + + self.block_list = [] + if self.layers >= 50: + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + if self.layers in [101, 152] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + bottleneck_block = self.add_sublayer( + 'bb_%d_%d' % + (block, i), #same with PaddleClas, for loading pretrain + BottleneckBlock( + in_channels=in_channels + if i == 0 else out_channels[block] * 4, + out_channels=out_channels[block], + stride=2 if i == 0 and block != 0 else 1, + num_seg=self.num_seg, + shortcut=shortcut, + if_first=block == i == 0, + name=conv_name)) + in_channels = out_channels[block] * 4 + self.block_list.append(bottleneck_block) + shortcut = True + else: + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + conv_name = "res" + str(block + 2) + chr(97 + i) + basic_block = self.add_sublayer( + conv_name, + BasicBlock(in_channels=in_channels[block] + if i == 0 else out_channels[block], + out_channels=out_channels[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + name=conv_name)) + self.block_list.append(basic_block) + shortcut = True + + def init_weights(self): + """Initiate the parameters. + Note: + 1. when indicate pretrained loading path, will load it to initiate backbone. + 2. when not indicating pretrained loading path, will follow specific initialization initiate backbone. Always, Conv2D layer will be initiated by KaimingNormal function, and BatchNorm2d will be initiated by Constant function. + Please refer to https://www.paddlepaddle.org.cn/documentation/docs/en/develop/api/paddle/nn/initializer/kaiming/KaimingNormal_en.html + """ + #XXX: check bias!!! check pretrained!!! + + if isinstance(self.pretrained, str) and self.pretrained.strip() != "": + load_ckpt(self, self.pretrained) + elif self.pretrained is None or self.pretrained.strip() == "": + for layer in self.sublayers(): + if isinstance(layer, nn.Conv2D): + #XXX: no bias + weight_init_(layer, 'KaimingNormal') + elif isinstance(layer, nn.BatchNorm2D): + weight_init_(layer, 'Constant', value=1) + + def forward(self, inputs): + """Define how the backbone is going to run. 
+ + """ + #NOTE: Already merge axis 0(batches) and axis 1(channels) before extracting feature phase, + # please refer to paddlevideo/modeling/framework/recognizers/recognizer2d.py#L27 + #y = paddle.reshape( + # inputs, [-1, inputs.shape[2], inputs.shape[3], inputs.shape[4]]) + + ####ResNet-C: use three 3x3 conv, replace, one 7x7 conv + y = self.conv1_1(inputs) + y = self.conv1_2(y) + y = self.conv1_3(y) + + y = self.pool2D_max(y) + for block in self.block_list: + y = block(y) + return y diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnet_tsn_MRI.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnet_tsn_MRI.py new file mode 100644 index 0000000..439a0ef --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnet_tsn_MRI.py @@ -0,0 +1,331 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.regularizer import L2Decay +from paddle.nn import Conv2D, BatchNorm +from paddle.nn import MaxPool2D, AvgPool2D + +from ..registry import BACKBONES +from ..weight_init import weight_init_ +from ...utils import load_ckpt + +__all__ = ["ResNetTSN_MRI"] + + +class ConvBNLayer(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + groups=1, + is_tweaks_mode=False, + act=None, + lr_mult=1.0, + name=None): + super(ConvBNLayer, self).__init__() + self.is_tweaks_mode = is_tweaks_mode + self._pool2d_avg = AvgPool2D(kernel_size=2, + stride=2, + padding=0, + ceil_mode=True) + self._conv = Conv2D(in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights", + learning_rate=lr_mult), + bias_attr=False) + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + self._batch_norm = BatchNorm( + out_channels, + act=act, + param_attr=ParamAttr(name=bn_name + '_scale', + learning_rate=lr_mult, + regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(bn_name + '_offset', + learning_rate=lr_mult, + regularizer=L2Decay(0.0)), + moving_mean_name=bn_name + '_mean', + moving_variance_name=bn_name + '_variance') + + def forward(self, inputs): + if self.is_tweaks_mode: + inputs = self._pool2d_avg(inputs) + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class BottleneckBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + if_first=False, + lr_mult=1.0, + name=None): + super(BottleneckBlock, self).__init__() + + self.conv0 = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + act='relu', + lr_mult=lr_mult, + name=name + "_branch2a") + self.conv1 = ConvBNLayer(in_channels=out_channels, + 
out_channels=out_channels, + kernel_size=3, + stride=stride, + act='relu', + lr_mult=lr_mult, + name=name + "_branch2b") + self.conv2 = ConvBNLayer(in_channels=out_channels, + out_channels=out_channels * 4, + kernel_size=1, + act=None, + lr_mult=lr_mult, + name=name + "_branch2c") + + if not shortcut: + self.short = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels * 4, + kernel_size=1, + stride=1, + is_tweaks_mode=False if if_first else True, + lr_mult=lr_mult, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=conv2) + y = F.relu(y) + return y + + +class BasicBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + if_first=False, + lr_mult=1.0, + name=None): + super(BasicBlock, self).__init__() + self.stride = stride + self.conv0 = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + act='relu', + lr_mult=lr_mult, + name=name + "_branch2a") + self.conv1 = ConvBNLayer(in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + act=None, + lr_mult=lr_mult, + name=name + "_branch2b") + + if not shortcut: + self.short = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + stride=1, + is_tweaks_mode=False if if_first else True, + lr_mult=lr_mult, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=conv1) + y = F.relu(y) + return y + + +@BACKBONES.register() +class ResNetTSN_MRI(nn.Layer): + """ResNetTweaksTSN backbone. + + Args: + depth (int): Depth of resnet model. + pretrained (str): pretrained model. Default: None. 
+ """ + def __init__(self, + layers=50, + pretrained=None, + lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0], + in_channels=1): + super(ResNetTSN_MRI, self).__init__() + + self.pretrained = pretrained + self.layers = layers + supported_layers = [18, 34, 50, 101, 152, 200] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, layers) + + self.lr_mult_list = lr_mult_list + self.in_channels = in_channels + assert isinstance( + self.lr_mult_list, + (list, tuple + )), "lr_mult_list should be in (list, tuple) but got {}".format( + type(self.lr_mult_list)) + assert len( + self.lr_mult_list + ) == 5, "lr_mult_list length should should be 5 but got {}".format( + len(self.lr_mult_list)) + + if layers == 18: + depth = [2, 2, 2, 2] + elif layers == 34 or layers == 50: + depth = [3, 4, 6, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + elif layers == 200: + depth = [3, 12, 48, 3] + num_channels = [64, 256, 512, 1024 + ] if layers >= 50 else [64, 64, 128, 256] + num_filters = [64, 128, 256, 512] + + self.conv1_1 = ConvBNLayer(in_channels=self.in_channels, + out_channels=32, + kernel_size=3, + stride=2, + act='relu', + lr_mult=self.lr_mult_list[0], + name="conv1_1") + self.conv1_2 = ConvBNLayer(in_channels=32, + out_channels=32, + kernel_size=3, + stride=1, + act='relu', + lr_mult=self.lr_mult_list[0], + name="conv1_2") + self.conv1_3 = ConvBNLayer(in_channels=32, + out_channels=64, + kernel_size=3, + stride=1, + act='relu', + lr_mult=self.lr_mult_list[0], + name="conv1_3") + self.pool2d_max = MaxPool2D(kernel_size=3, stride=2, padding=1) + + self.block_list = [] + if layers >= 50: + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + if layers in [101, 152, 200] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + bottleneck_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BottleneckBlock( + in_channels=num_channels[block] + if i == 0 else num_filters[block] * 4, + out_channels=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + if_first=block == i == 0, + lr_mult=self.lr_mult_list[block + 1], + name=conv_name)) + self.block_list.append(bottleneck_block) + shortcut = True + else: + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + conv_name = "res" + str(block + 2) + chr(97 + i) + basic_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BasicBlock(in_channels=num_channels[block] + if i == 0 else num_filters[block], + out_channels=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + if_first=block == i == 0, + name=conv_name, + lr_mult=self.lr_mult_list[block + 1])) + self.block_list.append(basic_block) + shortcut = True + + def init_weights(self): + """Initiate the parameters. + Note: + 1. when indicate pretrained loading path, will load it to initiate backbone. + 2. when not indicating pretrained loading path, will follow specific initialization initiate backbone. Always, Conv2D layer will be + initiated by KaimingNormal function, and BatchNorm2d will be initiated by Constant function. + Please refer to https://www.paddlepaddle.org.cn/documentation/docs/en/develop/api/paddle/nn/initializer/kaiming/KaimingNormal_en.html + """ + # XXX: check bias!!! check pretrained!!! 
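Aside (illustrative, not part of the patch): this variant defaults to in_channels=1 for single-channel (e.g. MRI) frames. A minimal forward pass, with the import path assumed from this diff and example shapes:

import paddle
from paddlevideo.modeling.backbones.resnet_tsn_MRI import ResNetTSN_MRI   # path assumed

backbone = ResNetTSN_MRI(layers=50, in_channels=1)
backbone.init_weights()

frames = paddle.rand([8, 1, 224, 224])        # 8 grayscale frames flattened into the batch axis
feat = backbone(frames)
print(feat.shape)                             # [8, 2048, 7, 7]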
+ + if isinstance(self.pretrained, str) and self.pretrained.strip() != "": + load_ckpt(self, self.pretrained) + elif self.pretrained is None or self.pretrained.strip() == "": + for layer in self.sublayers(): + if isinstance(layer, nn.Conv2D): + # XXX: no bias + weight_init_(layer, 'KaimingNormal') + elif isinstance(layer, nn.BatchNorm2D): + weight_init_(layer, 'Constant', value=1) + + def forward(self, inputs): + + y = self.conv1_1(inputs) + y = self.conv1_2(y) + y = self.conv1_3(y) + y = self.pool2d_max(y) + for block in self.block_list: + y = block(y) + return y diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py new file mode 100644 index 0000000..089da4e --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnet_tweaks_tsm.py @@ -0,0 +1,362 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import math + +import sys +import paddle +import paddle.nn as nn +from paddle.nn import (Conv2D, BatchNorm2D, Linear, Dropout, MaxPool2D, + AvgPool2D) +from paddle import ParamAttr +import paddle.nn.functional as F +from paddle.regularizer import L2Decay + +from ..registry import BACKBONES +from ..weight_init import weight_init_ +from ...utils.save_load import load_ckpt + +# Download URL of pretrained model +# { +# "ResNet50_vd": +# "wget https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_vd_ssld_v2_pretrained.pdparams", +# "ResNet101_vd": +# "https://videotag.bj.bcebos.com/PaddleVideo-release2.2/ResNet101_vd_ssld_pretrained.pdparams", +# "ResNet18_vd": +# "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet18_vd_pretrained.pdparams", +# "ResNet34_vd": +# "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNet34_vd_ssld_pretrained.pdparams", +# "ResNet152_vd": +# "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet152_vd_pretrained.pdparams", +# "ResNet200_vd": +# "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet200_vd_pretrained.pdparams", +# } + + +class ConvBNLayer(nn.Layer): + """Conv2D and BatchNorm2D layer. + Args: + in_channels (int): Number of channels for the input. + out_channels (int): Number of channels for the output. + kernel_size (int): Kernel size. + stride (int): Stride in the Conv2D layer. Default: 1. + groups (int): Groups in the Conv2D, Default: 1. + is_tweaks_mode (bool): switch for tweaks. Default: False. + act (str): Indicate activation after BatchNorm2D layer. + name (str): the name of an instance of ConvBNLayer. 
+ """ + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + groups=1, + is_tweaks_mode=False, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + self.is_tweaks_mode = is_tweaks_mode + #ResNet-D 1/2:add a 2×2 average pooling layer with a stride of 2 before the convolution, + # whose stride is changed to 1, works well in practice. + self._pool2d_avg = AvgPool2D(kernel_size=2, + stride=2, + padding=0, + ceil_mode=True) + + self._conv = Conv2D(in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + + self._act = act + + self._batch_norm = BatchNorm2D( + out_channels, + weight_attr=ParamAttr(name=bn_name + "_scale", + regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(bn_name + "_offset", regularizer=L2Decay(0.0))) + + def forward(self, inputs): + if self.is_tweaks_mode: + inputs = self._pool2d_avg(inputs) + y = self._conv(inputs) + y = self._batch_norm(y) + if self._act: + y = getattr(paddle.nn.functional, self._act)(y) + return y + + +class BottleneckBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + if_first=False, + num_seg=8, + name=None): + super(BottleneckBlock, self).__init__() + self.conv0 = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + act="leaky_relu", + name=name + "_branch2a") + self.conv1 = ConvBNLayer(in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + act="leaky_relu", + name=name + "_branch2b") + + self.conv2 = ConvBNLayer(in_channels=out_channels, + out_channels=out_channels * 4, + kernel_size=1, + act=None, + name=name + "_branch2c") + + if not shortcut: + self.short = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels * 4, + kernel_size=1, + stride=1, + is_tweaks_mode=False if if_first else True, + name=name + "_branch1") + + self.shortcut = shortcut + self.num_seg = num_seg + + def forward(self, inputs): + if paddle.is_compiled_with_custom_device('npu'): + x = inputs + seg_num = self.num_seg + shift_ratio = 1.0 / self.num_seg + + shape = x.shape #[N*T, C, H, W] + reshape_x = x.reshape( + (-1, seg_num, shape[1], shape[2], shape[3])) #[N, T, C, H, W] + pad_x = F.pad(reshape_x, [ + 0, + 0, + 1, + 1, + 0, + 0, + 0, + 0, + 0, + 0, + ]) #[N, T+2, C, H, W] + c1 = int(shape[1] * shift_ratio) + c2 = int(shape[1] * 2 * shift_ratio) + slice1 = pad_x[:, :seg_num, :c1, :, :] + slice2 = pad_x[:, 2:seg_num + 2, c1:c2, :, :] + slice3 = pad_x[:, 1:seg_num + 1, c2:, :, :] + concat_x = paddle.concat([slice1, slice2, slice3], + axis=2) #[N, T, C, H, W] + shifts = concat_x.reshape(shape) + else: + shifts = F.temporal_shift(inputs, self.num_seg, 1.0 / self.num_seg) + + y = self.conv0(shifts) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=conv2) + return F.leaky_relu(y) + + +class BasicBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + num_seg=8, + name=None): + super(BasicBlock, self).__init__() + self.stride = stride + self.num_seg = num_seg + self.conv0 = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + act="leaky_relu", + name=name + "_branch2a") + self.conv1 = 
ConvBNLayer(in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + act=None, + name=name + "_branch2b") + + if not shortcut: + self.short = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + stride=stride, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + # add temporal shift module + shifts = F.temporal_shift(inputs, self.num_seg, 1.0 / self.num_seg) + y = self.conv0(shifts) + conv1 = self.conv1(y) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(short, conv1) + y = F.leaky_relu(y) + return y + + +@BACKBONES.register() +class ResNetTweaksTSM(nn.Layer): + """ResNet TSM backbone. + + Args: + depth (int): Depth of resnet model. + pretrained (str): pretrained model. Default: None. + """ + def __init__(self, depth, num_seg=8, pretrained=None): + super(ResNetTweaksTSM, self).__init__() + self.pretrained = pretrained + self.layers = depth + self.num_seg = num_seg + + supported_layers = [18, 34, 50, 101, 152] + assert self.layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, self.layers) + + if self.layers == 18: + depth = [2, 2, 2, 2] + elif self.layers == 34 or self.layers == 50: + depth = [3, 4, 6, 3] + elif self.layers == 101: + depth = [3, 4, 23, 3] + elif self.layers == 152: + depth = [3, 8, 36, 3] + + in_channels = 64 + out_channels = [64, 128, 256, 512] + + #ResNet-C: use three 3x3 conv, replace, one 7x7 conv + self.conv1_1 = ConvBNLayer(in_channels=3, + out_channels=32, + kernel_size=3, + stride=2, + act='leaky_relu', + name="conv1_1") + self.conv1_2 = ConvBNLayer(in_channels=32, + out_channels=32, + kernel_size=3, + stride=1, + act='leaky_relu', + name="conv1_2") + self.conv1_3 = ConvBNLayer(in_channels=32, + out_channels=64, + kernel_size=3, + stride=1, + act='leaky_relu', + name="conv1_3") + self.pool2D_max = MaxPool2D(kernel_size=3, stride=2, padding=1) + + self.block_list = [] + if self.layers >= 50: + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + if self.layers in [101, 152] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + bottleneck_block = self.add_sublayer( + 'bb_%d_%d' % + (block, i), #same with PaddleClas, for loading pretrain + BottleneckBlock( + in_channels=in_channels + if i == 0 else out_channels[block] * 4, + out_channels=out_channels[block], + stride=2 if i == 0 and block != 0 else 1, + num_seg=self.num_seg, + shortcut=shortcut, + if_first=block == i == 0, + name=conv_name)) + in_channels = out_channels[block] * 4 + self.block_list.append(bottleneck_block) + shortcut = True + else: + in_channels = [64, 64, 128, 256] + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + conv_name = "res" + str(block + 2) + chr(97 + i) + basic_block = self.add_sublayer( + conv_name, + BasicBlock(in_channels=in_channels[block] + if i == 0 else out_channels[block], + out_channels=out_channels[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + num_seg=self.num_seg, + name=conv_name)) + self.block_list.append(basic_block) + shortcut = True + + def init_weights(self): + """Initiate the parameters. + Note: + 1. when indicate pretrained loading path, will load it to initiate backbone. + 2. 
when not indicating pretrained loading path, will follow specific initialization initiate backbone. Always, Conv2D layer will be initiated by KaimingNormal function, and BatchNorm2d will be initiated by Constant function. + Please refer to https://www.paddlepaddle.org.cn/documentation/docs/en/develop/api/paddle/nn/initializer/kaiming/KaimingNormal_en.html + """ + if isinstance(self.pretrained, str) and self.pretrained.strip() != "": + load_ckpt(self, self.pretrained) + elif self.pretrained is None or self.pretrained.strip() == "": + for layer in self.sublayers(): + if isinstance(layer, nn.Conv2D): + # no bias + weight_init_(layer, 'KaimingNormal') + elif isinstance(layer, nn.BatchNorm2D): + weight_init_(layer, 'Constant', value=1) + + def forward(self, inputs): + """Define how the backbone is going to run. + """ + y = self.conv1_1(inputs) + y = self.conv1_2(y) + y = self.conv1_3(y) + + y = self.pool2D_max(y) + for block in self.block_list: + y = block(y) + return y diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py new file mode 100644 index 0000000..36b3307 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnet_tweaks_tsn.py @@ -0,0 +1,328 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
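Aside (illustrative, not part of the patch): a usage sketch for ResNetTweaksTSM above. The 'bb_%d_%d' sublayer naming follows the inline comment about matching PaddleClas ResNet-vd checkpoints; the import path is assumed from this diff.

import paddle
from paddlevideo.modeling.backbones.resnet_tweaks_tsm import ResNetTweaksTSM   # path assumed

backbone = ResNetTweaksTSM(depth=50, num_seg=8)
backbone.init_weights()
feat = backbone(paddle.rand([8, 3, 224, 224]))    # one clip: N=1, T=num_seg=8
print(feat.shape)                                 # [8, 2048, 7, 7]

# Residual stages are registered as 'bb_<stage>_<block>' (see the comment above),
# which keeps parameter names aligned with PaddleClas ResNet50_vd weights.
print(list(backbone.state_dict().keys())[:3])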
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.regularizer import L2Decay +from paddle.nn import Conv2D, BatchNorm +from paddle.nn import MaxPool2D, AvgPool2D + +from ..registry import BACKBONES +from ..weight_init import weight_init_ +from ...utils import load_ckpt + +__all__ = ["ResNetTweaksTSN"] + + +class ConvBNLayer(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + groups=1, + is_tweaks_mode=False, + act=None, + lr_mult=1.0, + name=None): + super(ConvBNLayer, self).__init__() + self.is_tweaks_mode = is_tweaks_mode + self._pool2d_avg = AvgPool2D(kernel_size=2, + stride=2, + padding=0, + ceil_mode=True) + self._conv = Conv2D(in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights", + learning_rate=lr_mult), + bias_attr=False) + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + self._batch_norm = BatchNorm( + out_channels, + act=act, + param_attr=ParamAttr(name=bn_name + '_scale', + learning_rate=lr_mult, + regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(bn_name + '_offset', + learning_rate=lr_mult, + regularizer=L2Decay(0.0)), + moving_mean_name=bn_name + '_mean', + moving_variance_name=bn_name + '_variance') + + def forward(self, inputs): + if self.is_tweaks_mode: + inputs = self._pool2d_avg(inputs) + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class BottleneckBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + if_first=False, + lr_mult=1.0, + name=None): + super(BottleneckBlock, self).__init__() + + self.conv0 = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + act='relu', + lr_mult=lr_mult, + name=name + "_branch2a") + self.conv1 = ConvBNLayer(in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + act='relu', + lr_mult=lr_mult, + name=name + "_branch2b") + self.conv2 = ConvBNLayer(in_channels=out_channels, + out_channels=out_channels * 4, + kernel_size=1, + act=None, + lr_mult=lr_mult, + name=name + "_branch2c") + + if not shortcut: + self.short = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels * 4, + kernel_size=1, + stride=1, + is_tweaks_mode=False if if_first else True, + lr_mult=lr_mult, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=conv2) + y = F.relu(y) + return y + + +class BasicBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + if_first=False, + lr_mult=1.0, + name=None): + super(BasicBlock, self).__init__() + self.stride = stride + self.conv0 = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + act='relu', + lr_mult=lr_mult, + name=name + "_branch2a") + self.conv1 = ConvBNLayer(in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + act=None, + lr_mult=lr_mult, + name=name + "_branch2b") + + if not shortcut: + self.short = ConvBNLayer(in_channels=in_channels, + out_channels=out_channels, + 
kernel_size=1, + stride=1, + is_tweaks_mode=False if if_first else True, + lr_mult=lr_mult, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=conv1) + y = F.relu(y) + return y + + +@BACKBONES.register() +class ResNetTweaksTSN(nn.Layer): + """ResNetTweaksTSN backbone. + + Args: + depth (int): Depth of resnet model. + pretrained (str): pretrained model. Default: None. + """ + def __init__(self, + layers=50, + pretrained=None, + lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0]): + super(ResNetTweaksTSN, self).__init__() + + self.pretrained = pretrained + self.layers = layers + supported_layers = [18, 34, 50, 101, 152, 200] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, layers) + + self.lr_mult_list = lr_mult_list + assert isinstance( + self.lr_mult_list, + (list, tuple + )), "lr_mult_list should be in (list, tuple) but got {}".format( + type(self.lr_mult_list)) + assert len( + self.lr_mult_list + ) == 5, "lr_mult_list length should should be 5 but got {}".format( + len(self.lr_mult_list)) + + if layers == 18: + depth = [2, 2, 2, 2] + elif layers == 34 or layers == 50: + depth = [3, 4, 6, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + elif layers == 200: + depth = [3, 12, 48, 3] + num_channels = [64, 256, 512, 1024 + ] if layers >= 50 else [64, 64, 128, 256] + num_filters = [64, 128, 256, 512] + + self.conv1_1 = ConvBNLayer(in_channels=3, + out_channels=32, + kernel_size=3, + stride=2, + act='relu', + lr_mult=self.lr_mult_list[0], + name="conv1_1") + self.conv1_2 = ConvBNLayer(in_channels=32, + out_channels=32, + kernel_size=3, + stride=1, + act='relu', + lr_mult=self.lr_mult_list[0], + name="conv1_2") + self.conv1_3 = ConvBNLayer(in_channels=32, + out_channels=64, + kernel_size=3, + stride=1, + act='relu', + lr_mult=self.lr_mult_list[0], + name="conv1_3") + self.pool2d_max = MaxPool2D(kernel_size=3, stride=2, padding=1) + + self.block_list = [] + if layers >= 50: + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + if layers in [101, 152, 200] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + bottleneck_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BottleneckBlock( + in_channels=num_channels[block] + if i == 0 else num_filters[block] * 4, + out_channels=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + if_first=block == i == 0, + lr_mult=self.lr_mult_list[block + 1], + name=conv_name)) + self.block_list.append(bottleneck_block) + shortcut = True + else: + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + conv_name = "res" + str(block + 2) + chr(97 + i) + basic_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BasicBlock(in_channels=num_channels[block] + if i == 0 else num_filters[block], + out_channels=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + if_first=block == i == 0, + name=conv_name, + lr_mult=self.lr_mult_list[block + 1])) + self.block_list.append(basic_block) + shortcut = True + + def init_weights(self): + """Initiate the parameters. + Note: + 1. 
when indicate pretrained loading path, will load it to initiate backbone. + 2. when not indicating pretrained loading path, will follow specific initialization initiate backbone. Always, Conv2D layer will be + initiated by KaimingNormal function, and BatchNorm2d will be initiated by Constant function. + Please refer to https://www.paddlepaddle.org.cn/documentation/docs/en/develop/api/paddle/nn/initializer/kaiming/KaimingNormal_en.html + """ + # XXX: check bias!!! check pretrained!!! + + if isinstance(self.pretrained, str) and self.pretrained.strip() != "": + load_ckpt(self, self.pretrained) + elif self.pretrained is None or self.pretrained.strip() == "": + for layer in self.sublayers(): + if isinstance(layer, nn.Conv2D): + # XXX: no bias + weight_init_(layer, 'KaimingNormal') + elif isinstance(layer, nn.BatchNorm2D): + weight_init_(layer, 'Constant', value=1) + + def forward(self, inputs): + y = self.conv1_1(inputs) + y = self.conv1_2(y) + y = self.conv1_3(y) + y = self.pool2d_max(y) + for block in self.block_list: + y = block(y) + return y diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnext101.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnext101.py new file mode 100644 index 0000000..deca671 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/resnext101.py @@ -0,0 +1,191 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
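Aside (illustrative, not part of the patch): lr_mult_list holds five per-stage learning-rate multipliers (stem plus four residual stages); lowering the early entries slows down, or effectively freezes, those layers during fine-tuning. A minimal sketch with an assumed import path:

import paddle
from paddlevideo.modeling.backbones.resnet_tweaks_tsn import ResNetTweaksTSN   # path assumed

# Train later stages faster than the stem and early stages.
backbone = ResNetTweaksTSN(layers=50, lr_mult_list=[0.1, 0.1, 0.5, 1.0, 1.0])
backbone.init_weights()

feat = backbone(paddle.rand([4, 3, 224, 224]))    # TSN: plain per-frame 2D features, no temporal shift
print(feat.shape)                                  # [4, 2048, 7, 7]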
+ +from paddle import ParamAttr +from paddle import fluid +import paddle.nn as nn +from paddle.nn import Conv3D, BatchNorm3D +from functools import partial + + +class ConvBNLayer(nn.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + padding=0, + dilation=1, + groups=1, + padding_mode='zeros', + weight_attr=None, + bias_attr=None, + name=None, + data_format="NCDHW"): + super(ConvBNLayer, self).__init__() + self._conv = Conv3D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + padding_mode=padding_mode, + weight_attr=ParamAttr(initializer=nn.initializer.KaimingNormal( + fan_in=num_filters * filter_size * filter_size), name=name+'_weights'), + bias_attr=bias_attr, + data_format=data_format) + bn_name = "bn_" + name + self._batch_norm = BatchNorm3D( + num_filters, + momentum=0.9, + epsilon=1e-05, + weight_attr=ParamAttr(initializer=nn.initializer.Constant( + 1.), name=bn_name + '_scale'), + bias_attr=ParamAttr(initializer=nn.initializer.Constant( + 0.), name=bn_name + '_offset'), + data_format=data_format) + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +def _downsample_basic_block(self, x, planes, stride): + out = fluid.layers.pool3d( + x, pool_size=1, pool_stride=stride, pool_type='avg') + shape = out.shape + zero_pads = fluid.layers.zeros([shape[0], planes - shape[1], shape[2], shape[3], shape[4]], + dtype='float32') + out = fluid.layers.concat([out, zero_pads], axis=1) + + +class BottleneckBlock(nn.Layer): + expansion = 2 + + def __init__(self, inplanes, planes, cardinality, stride=1, downsample=None, name=None): + super(BottleneckBlock, self).__init__() + + mid_planes = cardinality * int(planes / 32) + self.conv0 = ConvBNLayer( + inplanes, mid_planes, filter_size=1, bias_attr=False, name=name+'_branch2a') + self.conv1 = ConvBNLayer(mid_planes, mid_planes, filter_size=3, stride=stride, + padding=1, groups=cardinality, bias_attr=False, name=name+'_branch2b') + self.conv2 = ConvBNLayer(mid_planes, planes * self.expansion, + filter_size=1, bias_attr=False, name=name+'_branch2c') + self.downsample = downsample + self.stride = stride + self.relu = nn.ReLU() + + def forward(self, x): + residual = x + + out = self.conv0(x) + out = self.relu(out) + out = self.conv1(out) + out = self.relu(out) + out = self.conv2(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class ResNeXt(nn.Layer): + def __init__(self, + block, + layers, + shortcut_type='B', + cardinality=32): + self.inplanes = 64 + super(ResNeXt, self).__init__() + self.conv = ConvBNLayer( + 3, + 64, + filter_size=7, + stride=(1, 2, 2), + padding=(3, 3, 3), + bias_attr=False, + name="res_conv1" + ) + self.relu = nn.ReLU() + self.maxpool = nn.MaxPool3D(kernel_size=(3, 3, 3), stride=2, padding=1) + self.layer1 = self._make_layer(block, 128, layers[0], shortcut_type, + cardinality, stride=1, name='layer1') + + self.layer2 = self._make_layer( + block, 256, layers[1], shortcut_type, cardinality, stride=2, name='layer2') + + self.layer3 = self._make_layer( + block, 512, layers[2], shortcut_type, cardinality, stride=2, name='layer3') + + self.layer4 = self._make_layer( + block, 1024, layers[3], shortcut_type, cardinality, stride=2, name='layer4') + self.avgpool = nn.AvgPool3D((2, 1, 1), stride=1, exclusive=False) + + def _make_layer(self, + block, + planes, + blocks, + 
shortcut_type, + cardinality, + stride=1, + name=None): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + if shortcut_type == 'A': + downsample = partial(self._downsample_basic_block, + planes=planes * block.expansion, + stride=stride) + else: + downsample = ConvBNLayer( + self.inplanes, + planes * block.expansion, + 1, + stride=stride, + bias_attr=False, + name=name+'downsample' + ) + layers = [] + layers.append( + block(self.inplanes, planes, cardinality, stride, downsample, name=name+'_downsample')) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(self.inplanes, planes, + cardinality, name=name+'_res_block'+str(i))) + + return nn.Sequential(*layers) + + def forward(self, x): + x = self.conv(x) + x = self.relu(x) + x = self.maxpool(x) + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + return x + + +def ResNext101(): + """Constructs a ResNext-101 model. + """ + model = ResNeXt(BottleneckBlock, [3, 4, 23, 3]) + return model diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/stgcn.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/stgcn.py new file mode 100644 index 0000000..40d9d0d --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/stgcn.py @@ -0,0 +1,343 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import numpy as np +from ..registry import BACKBONES +from ..weight_init import weight_init_ + + +def zero(x): + return 0 + + +def iden(x): + return x + + +def einsum(x, A): + """paddle.einsum will be implemented in release/2.2. + """ + x = x.transpose((0, 2, 3, 1, 4)) + n, c, t, k, v = x.shape + k2, v2, w = A.shape + assert (k == k2 and v == v2), "Args of einsum not match!" 
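Aside (illustrative, not part of the patch): the matmul-based einsum helper in this file implements the 'nkctv,kvw->nctw' contraction used by the graph convolution below. A quick numerical check against numpy, with example sizes:

import numpy as np
import paddle
from paddlevideo.modeling.backbones.stgcn import einsum    # helper defined in this file; path assumed

n, k, c, t, v, w = 2, 3, 4, 5, 25, 25
x = paddle.rand([n, k, c, t, v])
A = paddle.rand([k, v, w])

y = einsum(x, A)                                            # -> [n, c, t, w]
ref = np.einsum('nkctv,kvw->nctw', x.numpy(), A.numpy())
print(np.allclose(y.numpy(), ref, atol=1e-5))               # expected: True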
+ x = x.reshape((n, c, t, k * v)) + A = A.reshape((k * v, w)) + y = paddle.matmul(x, A) + return y + + +def get_hop_distance(num_node, edge, max_hop=1): + A = np.zeros((num_node, num_node)) + for i, j in edge: + A[j, i] = 1 + A[i, j] = 1 + + # compute hop steps + hop_dis = np.zeros((num_node, num_node)) + np.inf + transfer_mat = [np.linalg.matrix_power(A, d) for d in range(max_hop + 1)] + arrive_mat = (np.stack(transfer_mat) > 0) + for d in range(max_hop, -1, -1): + hop_dis[arrive_mat[d]] = d + return hop_dis + + +def normalize_digraph(A): + Dl = np.sum(A, 0) + num_node = A.shape[0] + Dn = np.zeros((num_node, num_node)) + for i in range(num_node): + if Dl[i] > 0: + Dn[i, i] = Dl[i]**(-1) + AD = np.dot(A, Dn) + return AD + + +class Graph(): + + def __init__(self, + layout='openpose', + strategy='uniform', + max_hop=1, + dilation=1): + self.max_hop = max_hop + self.dilation = dilation + + self.get_edge(layout) + self.hop_dis = get_hop_distance(self.num_node, + self.edge, + max_hop=max_hop) + self.get_adjacency(strategy) + + def __str__(self): + return self.A + + def get_edge(self, layout): + # edge is a list of [child, parent] paris + + if layout == 'fsd10': + self.num_node = 25 + self_link = [(i, i) for i in range(self.num_node)] + neighbor_link = [(1, 8), (0, 1), (15, 0), (17, 15), (16, 0), + (18, 16), (5, 1), (6, 5), (7, 6), (2, 1), (3, 2), + (4, 3), (9, 8), (10, 9), (11, 10), (24, 11), + (22, 11), (23, 22), (12, 8), (13, 12), (14, 13), + (21, 14), (19, 14), (20, 19)] + self.edge = self_link + neighbor_link + self.center = 8 + elif layout == 'ntu-rgb+d': + self.num_node = 25 + self_link = [(i, i) for i in range(self.num_node)] + neighbor_1base = [(1, 2), (2, 21), (3, 21), (4, 3), (5, 21), (6, 5), + (7, 6), (8, 7), (9, 21), (10, 9), (11, 10), + (12, 11), (13, 1), (14, 13), (15, 14), (16, 15), + (17, 1), (18, 17), (19, 18), (20, 19), (22, 23), + (23, 8), (24, 25), (25, 12)] + neighbor_link = [(i - 1, j - 1) for (i, j) in neighbor_1base] + self.edge = self_link + neighbor_link + self.center = 21 - 1 + elif layout == 'coco_keypoint': + self.num_node = 17 + self_link = [(i, i) for i in range(self.num_node)] + neighbor_1base = [(0, 1), (0, 2), (1, 3), (2, 4), (3, 5), (4, 6), + (5, 7), (6, 8), (7, 9), (8, 10), (5, 11), (6, 12), + (11, 13), (12, 14), (13, 15), (14, 16), (11, 12)] + neighbor_link = [(i, j) for (i, j) in neighbor_1base] + self.edge = self_link + neighbor_link + self.center = 11 + else: + raise ValueError("Do Not Exist This Layout.") + + def get_adjacency(self, strategy): + valid_hop = range(0, self.max_hop + 1, self.dilation) + adjacency = np.zeros((self.num_node, self.num_node)) + for hop in valid_hop: + adjacency[self.hop_dis == hop] = 1 + normalize_adjacency = normalize_digraph(adjacency) + + if strategy == 'spatial': + A = [] + for hop in valid_hop: + a_root = np.zeros((self.num_node, self.num_node)) + a_close = np.zeros((self.num_node, self.num_node)) + a_further = np.zeros((self.num_node, self.num_node)) + for i in range(self.num_node): + for j in range(self.num_node): + if self.hop_dis[j, i] == hop: + if self.hop_dis[j, self.center] == self.hop_dis[ + i, self.center]: + a_root[j, i] = normalize_adjacency[j, i] + elif self.hop_dis[j, self.center] > self.hop_dis[ + i, self.center]: + a_close[j, i] = normalize_adjacency[j, i] + else: + a_further[j, i] = normalize_adjacency[j, i] + if hop == 0: + A.append(a_root) + else: + A.append(a_root + a_close) + A.append(a_further) + A = np.stack(A) + self.A = A + else: + raise ValueError("Do Not Exist This Strategy") + + +class 
ConvTemporalGraphical(nn.Layer): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + t_kernel_size=1, + t_stride=1, + t_padding=0, + t_dilation=1): + super().__init__() + + self.kernel_size = kernel_size + self.conv = nn.Conv2D(in_channels, + out_channels * kernel_size, + kernel_size=(t_kernel_size, 1), + padding=(t_padding, 0), + stride=(t_stride, 1), + dilation=(t_dilation, 1)) + + def forward(self, x, A): + assert A.shape[0] == self.kernel_size + + x = self.conv(x) + n, kc, t, v = x.shape + x = x.reshape((n, self.kernel_size, kc // self.kernel_size, t, v)) + x = einsum(x, A) + + return x, A + + +class st_gcn_block(nn.Layer): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + dropout=0, + residual=True): + super(st_gcn_block, self).__init__() + + assert len(kernel_size) == 2 + assert kernel_size[0] % 2 == 1 + padding = ((kernel_size[0] - 1) // 2, 0) + + self.gcn = ConvTemporalGraphical(in_channels, out_channels, + kernel_size[1]) + + self.tcn = nn.Sequential( + nn.BatchNorm2D(out_channels), + nn.ReLU(), + nn.Conv2D( + out_channels, + out_channels, + (kernel_size[0], 1), + (stride, 1), + padding, + ), + nn.BatchNorm2D(out_channels), + nn.Dropout(dropout), + ) + + if not residual: + self.residual = zero + + elif (in_channels == out_channels) and (stride == 1): + self.residual = iden + + else: + self.residual = nn.Sequential( + nn.Conv2D(in_channels, + out_channels, + kernel_size=1, + stride=(stride, 1)), + nn.BatchNorm2D(out_channels), + ) + + self.relu = nn.ReLU() + + def forward(self, x, A): + res = self.residual(x) + x, A = self.gcn(x, A) + x = self.tcn(x) + res + return self.relu(x), A + + +@BACKBONES.register() +class STGCN(nn.Layer): + """ + ST-GCN model from: + `"Spatial Temporal Graph Convolutional Networks for Skeleton-Based Action Recognition" `_ + Args: + in_channels: int, channels of vertex coordinate. 2 for (x,y), 3 for (x,y,z). Default 2. + edge_importance_weighting: bool, whether to use edge attention. Default True. + data_bn: bool, whether to use data BatchNorm. Default True. 
+ """ + + def __init__(self, + in_channels=2, + edge_importance_weighting=True, + data_bn=True, + layout='fsd10', + strategy='spatial', + **kwargs): + super(STGCN, self).__init__() + self.data_bn = data_bn + # load graph + self.graph = Graph( + layout=layout, + strategy=strategy, + ) + A = paddle.to_tensor(self.graph.A, dtype='float32') + self.register_buffer('A', A) + + # build networks + spatial_kernel_size = A.shape[0] + temporal_kernel_size = 9 + kernel_size = (temporal_kernel_size, spatial_kernel_size) + self.data_bn = nn.BatchNorm1D(in_channels * + A.shape[1]) if self.data_bn else iden + kwargs0 = {k: v for k, v in kwargs.items() if k != 'dropout'} + self.st_gcn_networks = nn.LayerList(( + st_gcn_block(in_channels, + 64, + kernel_size, + 1, + residual=False, + **kwargs0), + st_gcn_block(64, 64, kernel_size, 1, **kwargs), + st_gcn_block(64, 64, kernel_size, 1, **kwargs), + st_gcn_block(64, 64, kernel_size, 1, **kwargs), + st_gcn_block(64, 128, kernel_size, 2, **kwargs), + st_gcn_block(128, 128, kernel_size, 1, **kwargs), + st_gcn_block(128, 128, kernel_size, 1, **kwargs), + st_gcn_block(128, 256, kernel_size, 2, **kwargs), + st_gcn_block(256, 256, kernel_size, 1, **kwargs), + st_gcn_block(256, 256, kernel_size, 1, **kwargs), + )) + + # initialize parameters for edge importance weighting + if edge_importance_weighting: + self.edge_importance = nn.ParameterList([ + self.create_parameter( + shape=self.A.shape, + default_initializer=nn.initializer.Constant(1)) + for i in self.st_gcn_networks + ]) + else: + self.edge_importance = [1] * len(self.st_gcn_networks) + + self.pool = nn.AdaptiveAvgPool2D(output_size=(1, 1)) + + def init_weights(self): + """Initiate the parameters. + """ + for layer in self.sublayers(): + if isinstance(layer, nn.Conv2D): + weight_init_(layer, 'Normal', mean=0.0, std=0.02) + elif isinstance(layer, nn.BatchNorm2D): + weight_init_(layer, 'Normal', mean=1.0, std=0.02) + elif isinstance(layer, nn.BatchNorm1D): + weight_init_(layer, 'Normal', mean=1.0, std=0.02) + + def forward(self, x): + # data normalization + N, C, T, V, M = x.shape + x = x.transpose((0, 4, 3, 1, 2)) # N, M, V, C, T + x = x.reshape((N * M, V * C, T)) + if self.data_bn: + x.stop_gradient = False + x = self.data_bn(x) + x = x.reshape((N, M, V, C, T)) + x = x.transpose((0, 1, 3, 4, 2)) # N, M, C, T, V + x = x.reshape((N * M, C, T, V)) + + # forward + for gcn, importance in zip(self.st_gcn_networks, self.edge_importance): + x, _ = gcn(x, paddle.multiply(self.A, importance)) + + x = self.pool(x) # NM,C,T,V --> NM,C,1,1 + C = x.shape[1] + x = paddle.reshape(x, (N, M, C, 1, 1)).mean(axis=1) # N,C,1,1 + return x diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/swin_transformer.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/swin_transformer.py new file mode 100644 index 0000000..aaed217 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/swin_transformer.py @@ -0,0 +1,742 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
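Aside (illustrative, not part of the patch): a minimal forward pass for the ST-GCN backbone above, using the 'fsd10' layout (25 joints, 2D keypoints, one person per sample). The import path and shapes are assumptions for illustration.

import paddle
from paddlevideo.modeling.backbones.stgcn import STGCN     # path assumed from this patch

model = STGCN(in_channels=2, layout='fsd10', strategy='spatial')
model.init_weights()

N, C, T, V, M = 4, 2, 64, 25, 1            # batch, coords (x, y), frames, joints, persons
skeleton = paddle.rand([N, C, T, V, M])
feat = model(skeleton)
print(feat.shape)                           # [4, 256, 1, 1] after global pooling and person averaging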
+# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import lru_cache, reduce +from operator import mul + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn.initializer import Constant + +from ...utils import load_ckpt +from ..registry import BACKBONES +from ..weight_init import trunc_normal_ + +zeros_ = Constant(value=0.) +ones_ = Constant(value=1.) + + +def drop_path(x, drop_prob=0., training=False): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + # issuecomment-532968956 ... + See discussion: https://github.com/tensorflow/tpu/issues/494 + """ + if drop_prob == 0. or not training: + return x + keep_prob = paddle.to_tensor(1 - drop_prob) + shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1) + random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype) + random_tensor = paddle.floor(random_tensor) # binarize + output = x.divide(keep_prob) * random_tensor + + return output + + +class DropPath(nn.Layer): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + """ + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + +class Mlp(nn.Layer): + """ Multilayer perceptron.""" + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +def window_partition(x, window_size): + """window_partition + Args: + x (Tensor): x.shape = [B, D, H, W, C] + window_size (tuple[int]): window_size + + Returns: + Tensor: (B*num_windows, window_size*window_size, C) + """ + B, D, H, W, C = x.shape + x = x.reshape([ + B, D // window_size[0], window_size[0], H // window_size[1], + window_size[1], W // window_size[2], window_size[2], C + ]) + windows = x.transpose([0, 1, 3, 5, 2, 4, 6, + 7]).reshape([-1, reduce(mul, window_size), C]) + return windows + + +class Identity(nn.Layer): + def __init__(self): + super(Identity, self).__init__() + + def forward(self, input): + return input + + +def window_reverse(windows, window_size, B, D, H, W): + """ + Args: + windows: (B*num_windows, window_size, window_size, C) + window_size (tuple[int]): Window size + H (int): Height of image + W (int): Width of image + + Returns: + x: (B, D, H, W, C) + """ + x = windows.reshape([ + B, D // window_size[0], H // window_size[1], W // window_size[2], + window_size[0], window_size[1], window_size[2], -1 + ]) + x = x.transpose([0, 1, 4, 2, 5, 3, 6, 7]).reshape([B, D, H, W, -1]) + return x + + +def get_window_size(x_size, window_size, shift_size=None): + use_window_size = list(window_size) + if shift_size is not None: + use_shift_size = list(shift_size) + for i in range(len(x_size)): + if x_size[i] <= window_size[i]: + use_window_size[i] = x_size[i] + if shift_size is not None: + use_shift_size[i] = 0 + + if shift_size is None: + return 
tuple(use_window_size) + else: + return tuple(use_window_size), tuple(use_shift_size) + + +class WindowAttention3D(nn.Layer): + """ Window based multi-head self attention (W-MSA) module with relative position bias. + It supports both of shifted and non-shifted window. + Args: + dim (int): Number of input channels. + window_size (tuple[int]): The temporal length, height and width of the window. + num_heads (int): Number of attention heads. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set + attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 + proj_drop (float, optional): Dropout ratio of output. Default: 0.0 + """ + def __init__(self, + dim, + window_size, + num_heads, + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0.): + + super().__init__() + self.dim = dim + self.window_size = window_size # Wd, Wh, Ww + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + # define a parameter table of relative position bias + self.relative_position_bias_table = self.create_parameter( + shape=((2 * window_size[0] - 1) * (2 * window_size[1] - 1) * + (2 * window_size[2] - 1), num_heads), + default_initializer=zeros_, + ) # 2*Wd-1 * 2*Wh-1 * 2*Ww-1, nH + self.add_parameter("relative_position_bias_table", + self.relative_position_bias_table) + # get pair-wise relative position index for each token inside the window + coords_d = paddle.arange(self.window_size[0]) + coords_h = paddle.arange(self.window_size[1]) + coords_w = paddle.arange(self.window_size[2]) + coords = paddle.stack(paddle.meshgrid(coords_d, coords_h, + coords_w)) # 3, Wd, Wh, Ww + coords_flatten = paddle.flatten(coords, 1) # 3, Wd*Wh*Ww + + relative_coords = coords_flatten.unsqueeze( + axis=2) - coords_flatten.unsqueeze(axis=1) # 3, Wd*Wh*Ww, Wd*Wh*Ww + + # relative_coords = coords_flatten.unsqueeze(2) - coords_flatten.unsqueeze(1) # 3, Wd*Wh*Ww, Wd*Wh*Ww + relative_coords = relative_coords.transpose([1, 2, 0 + ]) # Wd*Wh*Ww, Wd*Wh*Ww, 3 + relative_coords[:, :, + 0] += self.window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += self.window_size[1] - 1 + relative_coords[:, :, 2] += self.window_size[2] - 1 + + relative_coords[:, :, 0] *= (2 * self.window_size[1] - + 1) * (2 * self.window_size[2] - 1) + relative_coords[:, :, 1] *= (2 * self.window_size[2] - 1) + relative_position_index = relative_coords.sum( + axis=-1) # Wd*Wh*Ww, Wd*Wh*Ww + self.register_buffer("relative_position_index", relative_position_index) + + self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + trunc_normal_(self.relative_position_bias_table, std=0.02) + self.softmax = nn.Softmax(axis=-1) + + def forward(self, x, mask=None): + """ Forward function. 
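+        Self-attention is computed within each 3D window; the learned relative
+        position bias (looked up through ``relative_position_index``) is added to
+        the attention logits before the softmax, and ``mask`` (if given) is added
+        on top for the shifted-window case.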
+ Args: + x: input features with shape of (num_windows*B, N, C) + mask: (0/-inf) mask with shape of (num_windows, N, N) or None + """ + B_, N, C = x.shape + qkv = self.qkv(x).reshape( + [B_, N, 3, self.num_heads, + C // self.num_heads]).transpose([2, 0, 3, 1, 4]) + q, k, v = qkv[0], qkv[1], qkv[2] # B_, nH, N, C + + q = q * self.scale + attn = q @ k.transpose([0, 1, 3, 2]) + + relative_position_bias = self.relative_position_bias_table[ + self.relative_position_index[:N, :N].reshape([-1])].reshape( + [N, N, -1]) # Wd*Wh*Ww,Wd*Wh*Ww,nH + relative_position_bias = relative_position_bias.transpose( + [2, 0, 1]) # nH, Wd*Wh*Ww, Wd*Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) # B_, nH, N, N + + if mask is not None: + nW = mask.shape[0] + attn = attn.reshape([B_ // nW, nW, self.num_heads, N, N + ]) + mask.unsqueeze(1).unsqueeze(0) + attn = attn.reshape([-1, self.num_heads, N, N]) + attn = self.softmax(attn) + else: + attn = self.softmax(attn) + + attn = self.attn_drop(attn) + + x = (attn @ v).transpose([0, 2, 1, 3]).reshape([B_, N, C]) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class SwinTransformerBlock3D(nn.Layer): + """ Swin Transformer Block. + + Args: + dim (int): Number of input channels. + num_heads (int): Number of attention heads. + window_size (tuple[int]): Window size. + shift_size (tuple[int]): Shift size for SW-MSA. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float, optional): Stochastic depth rate. Default: 0.0 + act_layer (nn.Layer, optional): Activation layer. Default: nn.GELU + norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm + """ + def __init__(self, + dim, + num_heads, + window_size=(2, 7, 7), + shift_size=(0, 0, 0), + mlp_ratio=4., + qkv_bias=True, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + use_checkpoint=False): + super().__init__() + self.dim = dim + self.num_heads = num_heads + self.window_size = window_size + self.shift_size = shift_size + self.mlp_ratio = mlp_ratio + # self.use_checkpoint=use_checkpoint + + assert 0 <= self.shift_size[0] < self.window_size[ + 0], "shift_size must in 0-window_size" + assert 0 <= self.shift_size[1] < self.window_size[ + 1], "shift_size must in 0-window_size" + assert 0 <= self.shift_size[2] < self.window_size[ + 2], "shift_size must in 0-window_size" + + self.norm1 = norm_layer(dim) + self.attn = WindowAttention3D(dim, + window_size=self.window_size, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop) + + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + + def forward_part1(self, x, mask_matrix): + B = paddle.shape(x)[0] + _, D, H, W, C = x.shape + window_size, shift_size = get_window_size((D, H, W), self.window_size, + self.shift_size) + + x = self.norm1(x) + # pad feature maps to multiples of window size + pad_l = pad_t = pad_d0 = 0 + pad_d1 = (window_size[0] - D % window_size[0]) % window_size[0] + pad_b = (window_size[1] - H % window_size[1]) % window_size[1] + pad_r = (window_size[2] - W % window_size[2]) % window_size[2] + x = F.pad(x, (pad_l, pad_r, pad_t, pad_b, pad_d0, pad_d1), + data_format='NDHWC') + _, Dp, Hp, Wp, _ = x.shape + # cyclic shift + if any(i > 0 for i in shift_size): + shifted_x = paddle.roll(x, + shifts=(-shift_size[0], -shift_size[1], + -shift_size[2]), + axis=(1, 2, 3)) + attn_mask = mask_matrix + else: + shifted_x = x + attn_mask = None + # partition windows + x_windows = window_partition(shifted_x, + window_size) # B*nW, Wd*Wh*Ww, C + # W-MSA/SW-MSA + attn_windows = self.attn(x_windows, mask=attn_mask) # B*nW, Wd*Wh*Ww, C + # merge windows + attn_windows = attn_windows.reshape([-1, *(window_size + (C, ))]) + shifted_x = window_reverse(attn_windows, window_size, B, Dp, Hp, + Wp) # B D' H' W' C + # reverse cyclic shift + if any(i > 0 for i in shift_size): + x = paddle.roll(shifted_x, + shifts=(shift_size[0], shift_size[1], + shift_size[2]), + axis=(1, 2, 3)) + else: + x = shifted_x + + if pad_d1 > 0 or pad_r > 0 or pad_b > 0: + x = x[:, :D, :H, :W, :] + return x + + def forward_part2(self, x): + return self.drop_path(self.mlp(self.norm2(x))) + + def forward(self, x, mask_matrix): + """ Forward function. + + Args: + x: Input feature, tensor size (B, D, H, W, C). + mask_matrix: Attention mask for cyclic shift. + """ + + shortcut = x + x = self.forward_part1(x, mask_matrix) + x = shortcut + self.drop_path(x) + x = x + self.forward_part2(x) + + return x + + +class PatchMerging(nn.Layer): + """ Patch Merging Layer + + Args: + dim (int): Number of input channels. + norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm + """ + def __init__(self, dim, norm_layer=nn.LayerNorm): + super().__init__() + self.dim = dim + self.reduction = nn.Linear(4 * dim, 2 * dim, bias_attr=False) + self.norm = norm_layer(4 * dim) + + def forward(self, x): + """ Forward function. + + Args: + x: Input feature, tensor size (B, D, H, W, C). 
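+        Returns:
+            Tensor of shape (B, D, H/2, W/2, 2*C): the four spatial neighbours are
+            concatenated along the channel axis (4*C) and projected back to 2*C
+            (H and W are zero-padded first when odd).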
+ """ + B, D, H, W, C = x.shape + + # padding + pad_input = (H % 2 == 1) or (W % 2 == 1) + if pad_input: + x = F.pad(x, (0, W % 2, 0, H % 2, 0, 0), data_format='NDHWC') + + x0 = x[:, :, 0::2, 0::2, :] # B D H/2 W/2 C + x1 = x[:, :, 1::2, 0::2, :] # B D H/2 W/2 C + x2 = x[:, :, 0::2, 1::2, :] # B D H/2 W/2 C + x3 = x[:, :, 1::2, 1::2, :] # B D H/2 W/2 C + x = paddle.concat([x0, x1, x2, x3], -1) # B D H/2 W/2 4*C + + x = self.norm(x) + x = self.reduction(x) + + return x + + +# cache each stage results +@lru_cache() +def compute_mask(D, H, W, window_size, shift_size): + img_mask = paddle.zeros((1, D, H, W, 1)) # 1 Dp Hp Wp 1 + cnt = 0 + for d in slice(-window_size[0]), slice(-window_size[0], + -shift_size[0]), slice( + -shift_size[0], None): + for h in slice(-window_size[1]), slice(-window_size[1], + -shift_size[1]), slice( + -shift_size[1], None): + for w in slice(-window_size[2]), slice(-window_size[2], + -shift_size[2]), slice( + -shift_size[2], None): + img_mask[:, d, h, w, :] = cnt + cnt += 1 + mask_windows = window_partition(img_mask, + window_size) # nW, ws[0]*ws[1]*ws[2], 1 + mask_windows = mask_windows.squeeze(-1) # nW, ws[0]*ws[1]*ws[2] + attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) + # attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0)) + huns = -100.0 * paddle.ones_like(attn_mask) + attn_mask = huns * (attn_mask != 0).astype("float32") + return attn_mask + + +class BasicLayer(nn.Layer): + """ A basic Swin Transformer layer for one stage. + + Args: + dim (int): Number of feature channels + depth (int): Depths of this stage. + num_heads (int): Number of attention head. + window_size (tuple[int]): Local window size. Default: (1,7,7). + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 + norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm + downsample (nn.Layer | None, optional): Downsample layer at the end of the layer. Default: None + """ + def __init__(self, + dim, + depth, + num_heads, + window_size=(1, 7, 7), + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + norm_layer=nn.LayerNorm, + downsample=None, + use_checkpoint=False): + super().__init__() + self.window_size = window_size + self.shift_size = tuple(i // 2 for i in window_size) + self.depth = depth + self.use_checkpoint = use_checkpoint + + # build blocks + self.blocks = nn.LayerList([ + SwinTransformerBlock3D( + dim=dim, + num_heads=num_heads, + window_size=window_size, + shift_size=(0, 0, 0) if (i % 2 == 0) else self.shift_size, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop, + attn_drop=attn_drop, + drop_path=drop_path[i] + if isinstance(drop_path, list) else drop_path, + norm_layer=norm_layer, + use_checkpoint=use_checkpoint, + ) for i in range(depth) + ]) + + self.downsample = downsample + if self.downsample is not None: + self.downsample = downsample(dim=dim, norm_layer=norm_layer) + + def forward(self, x): + """ Forward function. + + Args: + x: Input feature, tensor size (B, C, D, H, W). 
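+        Returns:
+            Tensor in the same NCDHW layout after the stacked SwinTransformerBlock3D
+            blocks; when ``downsample`` is set, PatchMerging halves H and W and
+            doubles the channel count.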
+ """ + # calculate attention mask for SW-MSA + B = paddle.shape(x)[0] + _, C, D, H, W = x.shape + window_size, shift_size = get_window_size((D, H, W), self.window_size, + self.shift_size) + # x = rearrange(x, 'b c d h w -> b d h w c') + x = x.transpose([0, 2, 3, 4, 1]) + Dp = int(np.ceil(D / window_size[0])) * window_size[0] + Hp = int(np.ceil(H / window_size[1])) * window_size[1] + Wp = int(np.ceil(W / window_size[2])) * window_size[2] + attn_mask = compute_mask(Dp, Hp, Wp, window_size, shift_size) + for blk in self.blocks: + x = blk(x, attn_mask) + x = x.reshape([B, D, H, W, C]) + + if self.downsample is not None: + x = self.downsample(x) + x = x.transpose([0, 4, 1, 2, 3]) + return x + + +class PatchEmbed3D(nn.Layer): + """ Video to Patch Embedding. + + Args: + patch_size (int): Patch token size. Default: (2,4,4). + in_chans (int): Number of input video channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + norm_layer (nn.Layer, optional): Normalization layer. Default: None + """ + def __init__(self, + patch_size=(2, 4, 4), + in_chans=3, + embed_dim=96, + norm_layer=None): + super().__init__() + self.patch_size = patch_size + + self.in_chans = in_chans + self.embed_dim = embed_dim + + self.proj = nn.Conv3D(in_chans, + embed_dim, + kernel_size=patch_size, + stride=patch_size) + if norm_layer is not None: + self.norm = norm_layer(embed_dim) + else: + self.norm = None + + def forward(self, x): + _, _, D, H, W = x.shape + if W % self.patch_size[2] != 0: + x = F.pad( + x, (0, self.patch_size[2] - W % self.patch_size[2], 0, 0, 0, 0), + data_format='NCDHW') + if H % self.patch_size[1] != 0: + x = F.pad( + x, (0, 0, 0, self.patch_size[1] - H % self.patch_size[1], 0, 0), + data_format='NCDHW') + if D % self.patch_size[0] != 0: + x = F.pad( + x, (0, 0, 0, 0, 0, self.patch_size[0] - D % self.patch_size[0]), + data_format='NCDHW') + + x = self.proj(x) # B C D Wh Ww + if self.norm is not None: + D, Wh, Ww = x.shape[2], x.shape[3], x.shape[4] + x = x.flatten(2).transpose([0, 2, 1]) + x = self.norm(x) + x = x.transpose([0, 2, 1]).reshape([-1, self.embed_dim, D, Wh, Ww]) + + return x + + +@BACKBONES.register() +class SwinTransformer3D(nn.Layer): + """ Swin Transformer backbone. + A Paddle impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` - + https://arxiv.org/pdf/2103.14030 + + Args: + patch_size (int | tuple(int)): Patch size. Default: (4,4,4). + in_chans (int): Number of input image channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + depths (tuple[int]): Depths of each Swin Transformer stage. + num_heads (tuple[int]): Number of attention head of each stage. + window_size (int): Window size. Default: 7. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. + qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: Truee + qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. + drop_rate (float): Dropout rate. + attn_drop_rate (float): Attention dropout rate. Default: 0. + drop_path_rate (float): Stochastic depth rate. Default: 0.2. + norm_layer: Normalization layer. Default: nn.LayerNorm. + patch_norm (bool): If True, add normalization after patch embedding. Default: False. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. 
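+        pretrained (str | None): Path to pretrained weights loaded by ``init_weights``;
+            None or an empty string skips loading. Default: None.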
+ """ + def __init__(self, + pretrained=None, + patch_size=(4, 4, 4), + in_chans=3, + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=(2, 7, 7), + mlp_ratio=4., + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.2, + norm_layer=nn.LayerNorm, + patch_norm=False, + frozen_stages=-1, + use_checkpoint=False): + super().__init__() + + self.pretrained = pretrained + self.num_layers = len(depths) + self.embed_dim = embed_dim + self.patch_norm = patch_norm + self.frozen_stages = frozen_stages + self.window_size = window_size + self.patch_size = patch_size + + # split image into non-overlapping patches + self.patch_embed = PatchEmbed3D( + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim, + norm_layer=norm_layer if self.patch_norm else None) + + self.pos_drop = nn.Dropout(p=drop_rate) + + # stochastic depth + dpr = [ + x.item() for x in paddle.linspace(0, drop_path_rate, sum(depths)) + ] # stochastic depth decay rule + + # build layers + self.layers = nn.LayerList() + for i_layer in range(self.num_layers): + layer = BasicLayer( + dim=int(embed_dim * 2**i_layer), + depth=depths[i_layer], + num_heads=num_heads[i_layer], + window_size=window_size, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])], + norm_layer=norm_layer, + downsample=PatchMerging + if i_layer < self.num_layers - 1 else None, + use_checkpoint=use_checkpoint) + self.layers.append(layer) + + self.num_features = int(embed_dim * 2**(self.num_layers - 1)) + + # add a norm layer for each output + self.norm = norm_layer(self.num_features) + + self._freeze_stages() + + def _freeze_stages(self): + if self.frozen_stages >= 0: + self.patch_embed.eval() + for param in self.patch_embed.parameters(): + param.stop_gradient = True + + if self.frozen_stages >= 1: + self.pos_drop.eval() + for i in range(0, self.frozen_stages): + m = self.layers[i] + m.eval() + for param in m.parameters(): + param.stop_gradient = True + + def _init_fn(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=0.02) + if m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + zeros_(m.bias) + ones_(m.weight) + + def init_weights(self): + """Initialize the weights in backbone. + + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. 
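+        Note:
+            The checkpoint path is read from ``self.pretrained`` (set in ``__init__``)
+            rather than passed as an argument to this method.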
+ """ + """First init model's weight""" + + self.apply(self._init_fn) + """Second, if provide pretrained ckpt, load it""" + if isinstance( + self.pretrained, str + ) and self.pretrained.strip() != "": # load pretrained weights + load_ckpt(self, self.pretrained) + elif self.pretrained is None or self.pretrained.strip() == "": + pass + else: + raise NotImplementedError + + def forward(self, x): + """Forward function.""" + x = self.patch_embed(x) + x = self.pos_drop(x) + + for layer in self.layers: + x = layer(x) + + x = x.transpose([0, 2, 3, 4, 1]) + x = self.norm(x) + x = x.transpose([0, 4, 1, 2, 3]) + return x + + def train(self, mode=True): + """Convert the model into training mode while keep layers freezed.""" + super(SwinTransformer3D, self).train(mode) + self._freeze_stages() diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/toshift_vit.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/toshift_vit.py new file mode 100644 index 0000000..a481996 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/toshift_vit.py @@ -0,0 +1,413 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections.abc import Callable + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn.initializer import Constant + +from ...utils import load_ckpt +from ..registry import BACKBONES +from ..weight_init import trunc_normal_ + +__all__ = ['VisionTransformer'] + +zeros_ = Constant(value=0.) +ones_ = Constant(value=1.) + + +def to_2tuple(x): + return tuple([x] * 2) + + +def drop_path(x, drop_prob=0., training=False): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + # issuecomment-532968956 ... + See discussion: https://github.com/tensorflow/tpu/issues/494 + """ + if drop_prob == 0. or not training: + return x + keep_prob = paddle.to_tensor(1 - drop_prob) + shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1) + random_tensor = keep_prob + paddle.rand(shape).astype(x.dtype) + random_tensor = paddle.floor(random_tensor) # binarize + output = x.divide(keep_prob) * random_tensor + + return output + + +class DropPath(nn.Layer): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 
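+    This is a thin module wrapper around ``drop_path`` that forwards ``self.training``,
+    so paths are only dropped while training.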
+ """ + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + +class Identity(nn.Layer): + def __init__(self): + super(Identity, self).__init__() + + def forward(self, input): + return input + + +class Mlp(nn.Layer): + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.0): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(nn.Layer): + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0.0, + proj_drop=0.0): + super().__init__() + + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + self.attn_drop = nn.Dropout(attn_drop) + + def forward(self, x): + N, C = x.shape[1:] + qkv = self.qkv(x).reshape( + (-1, N, 3, self.num_heads, C // self.num_heads)).transpose( + (2, 0, 3, 1, 4)) + q, k, v = qkv[0], qkv[1], qkv[2] + + attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale + attn = nn.functional.softmax(attn, axis=-1) + attn = self.attn_drop(attn) + + x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((-1, N, C)) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Layer): + def __init__(self, + dim, + num_heads, + mlp_ratio=4.0, + qkv_bias=False, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + drop_path=0.1, + act_layer=nn.GELU, + norm_layer='nn.LayerNorm', + epsilon=1e-5, + num_segments = 8, + fold_div = 4): + #attention_type='divided_space_time', + super().__init__() + self.n_seg = num_segments #ckk + self.foldP_div = fold_div #ckk + #self.attention_type = attention_type + if isinstance(norm_layer, str): + self.norm1 = eval(norm_layer)(dim, epsilon=epsilon) + elif isinstance(norm_layer, Callable): + self.norm1 = norm_layer(dim, epsilon=epsilon) + else: + raise TypeError( + "The norm_layer must be str or paddle.nn.layer.Layer class") + + self.attn = Attention(dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop) + + # Temporal Attention Parameters + ''' + if self.attention_type == 'divided_space_time': + if isinstance(norm_layer, str): + self.temporal_norm1 = eval(norm_layer)(dim, epsilon=epsilon) + elif isinstance(norm_layer, Callable): + self.temporal_norm1 = norm_layer(dim, epsilon=epsilon) + else: + raise TypeError( + "The norm_layer must be str or paddle.nn.layer.Layer class") + self.temporal_attn = Attention(dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop) + self.temporal_fc = nn.Linear(dim, dim) + ''' + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else Identity() + if isinstance(norm_layer, str): + self.norm2 = eval(norm_layer)(dim, epsilon=epsilon) + elif isinstance(norm_layer, Callable): + self.norm2 = norm_layer(dim, epsilon=epsilon) + else: + raise TypeError( + "The norm_layer must be str or paddle.nn.layer.Layer class") + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + # token_shift + def shuift_tk(self, x): + t = self.n_seg + bt, n, c = x.shape + b = bt // t + x = x.reshape([b, t, n, c]) #B T N C + + fold = c // self.foldP_div + out = paddle.zeros_like(x) + out.stop_gradient = True + # print("#### fold ", fold) + # print(out.shape) + # print(x[:, 1:, 0, :fold].unsqueeze(2).shape) + # print(out[:, :-1, 0:1, :fold].shape) + # exit(0) + out[:, :-1, 0, :fold] = x[:, 1:, 0, :fold] # shift left + out[:, 1:, 0, fold:2*fold] = x[:,:-1:, 0, fold:2*fold] + + out[:, :, 1:, :2*fold] = x[:, :, 1:, :2*fold] + out[:, :, :, 2*fold:] = x[:, :, :, 2*fold:] + + return out.reshape([bt, n, c]) + + def forward(self, x): + x = self.shuift_tk(x) + x = x + self.drop_path(self.attn(self.norm1(x))) + x = self.shuift_tk(x) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class PatchEmbed(nn.Layer): + """ Image to Patch Embedding + """ + def __init__(self, + img_size=224, + patch_size=16, + in_channels=3, + embed_dim=768): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // + patch_size[0]) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + + self.proj = nn.Conv2D(in_channels, + embed_dim, + kernel_size=patch_size, + stride=patch_size) + + def forward(self, x): + B, C, T, H, W = x.shape + + assert H == self.img_size[0] and W == self.img_size[1], \ + f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." 
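+        # Fold the temporal dimension into the batch before the 2D patch projection:
+        # (B, C, T, H, W) -> (B*T, C, H, W), so every frame is embedded independently.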
+ x = x.transpose((0, 2, 1, 3, 4)) + x = x.reshape([-1, C, H, W]) + x = self.proj(x) + W = x.shape[-1] + x = x.flatten(2).transpose((0, 2, 1)) + return x, T, W + + +@BACKBONES.register() +class TokenShiftVisionTransformer(nn.Layer): + """ Vision Transformer with support for patch input + """ + def __init__(self, + pretrained=None, + img_size=224, + patch_size=16, + in_channels=3, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=False, + qk_scale=None, + drop_rate=0, + attn_drop_rate=0., + drop_path_rate=0.1, + norm_layer='nn.LayerNorm', + epsilon=1e-5, + num_seg=8, + attention_type='divided_space_time', + **args): + + super().__init__() + self.pretrained = pretrained + self.num_seg = num_seg + self.attention_type = attention_type + self.num_features = self.embed_dim = embed_dim + + self.patch_embed = PatchEmbed(img_size=img_size, + patch_size=patch_size, + in_channels=in_channels, + embed_dim=embed_dim) + num_patches = self.patch_embed.num_patches + + # Positional Embeddings + self.cls_token = self.create_parameter(shape=(1, 1, embed_dim), + default_initializer=zeros_) + self.pos_embed = self.create_parameter(shape=(1, num_patches + 1, + embed_dim), + default_initializer=zeros_) + self.pos_drop = nn.Dropout(p=drop_rate) + + if self.attention_type != 'space_only': + self.time_embed = self.create_parameter(shape=(1, num_seg, + embed_dim), + default_initializer=zeros_) + self.time_drop = nn.Dropout(p=drop_rate) + + self.add_parameter("pos_embed", self.pos_embed) + self.add_parameter("cls_token", self.cls_token) + + dpr = np.linspace(0, drop_path_rate, depth) + + self.blocks = nn.LayerList([ + Block(dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + epsilon=epsilon, + num_segments= self.num_seg + ) for i in range(depth) + #attention_type=self.attention_type + ]) + + self.norm = eval(norm_layer)(embed_dim, epsilon=epsilon) + + def init_weights(self): + """First init model's weight""" + trunc_normal_(self.pos_embed, std=0.02) + trunc_normal_(self.cls_token, std=0.02) + self.apply(self._init_fn) + + if self.attention_type == 'divided_space_time': + i = 0 + for m in self.blocks.sublayers(include_self=True): + m_str = str(m) + if 'Block' in m_str: + if i > 0: + zeros_(m.temporal_fc.weight) + zeros_(m.temporal_fc.bias) + i += 1 + + """Second, if provide pretrained ckpt, load it""" + + if isinstance( + self.pretrained, str + ) and self.pretrained.strip() != "": # load pretrained weights + load_ckpt(self, + self.pretrained, + num_patches=self.patch_embed.num_patches, + num_seg=self.num_seg, + attention_type=self.attention_type) + + def _init_fn(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + ones_(m.weight) + zeros_(m.bias) + + def forward_features(self, x): + # B = x.shape[0] + B = paddle.shape(x)[0] + x, T, W = self.patch_embed(x) # [BT,nH*nW,F] + cls_tokens = self.cls_token.expand((B * T, -1, -1)) # [1,1,F]->[BT,1,F] + x = paddle.concat((cls_tokens, x), axis=1) + pos_interp = (x.shape[1] != self.pos_embed.shape[1]) + if pos_interp: + pos_embed = self.pos_embed + cls_pos_embed = pos_embed[0, 0, :].unsqueeze(0).unsqueeze(1) + other_pos_embed = pos_embed[0, 1:, :].unsqueeze(0).transpose( + (0, 2, 1)) + P = int(other_pos_embed.shape[2]**0.5) + H = x.shape[1] // W + other_pos_embed = other_pos_embed.reshape([1, x.shape[2], P, P]) + 
new_pos_embed = F.interpolate(other_pos_embed, + size=(H, W), + mode='nearest') + new_pos_embed = new_pos_embed.flatten(2) + new_pos_embed = new_pos_embed.transpose((0, 2, 1)) + new_pos_embed = paddle.concat((cls_pos_embed, new_pos_embed), + axis=1) + x = x + new_pos_embed + else: + x = x + self.pos_embed + + x = self.pos_drop(x) + + # Attention blocks + for blk in self.blocks: + x = blk(x) + + + x = self.norm(x) + return x[:, 0] # [B, embed_dim] -> [B*T, embed_dim] + + def forward(self, x): + x = self.forward_features(x) + return x \ No newline at end of file diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/transnetv2.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/transnetv2.py new file mode 100644 index 0000000..60603e2 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/transnetv2.py @@ -0,0 +1,582 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as functional +import random +from paddle import ParamAttr + +from ..registry import BACKBONES + + +class OctConv3D(nn.Layer): + def __init__(self, in_filters, filters, kernel_size=3, dilation_rate=(1, 1, 1), alpha=0.25, + use_bias=True, kernel_initializer=nn.initializer.KaimingNormal()): + super(OctConv3D, self).__init__() + + self.low_channels = int(filters * alpha) + self.high_channels = filters - self.low_channels + + self.high_to_high = nn.Conv3D(in_filters, self.high_channels, kernel_size=kernel_size, + dilation=dilation_rate, padding=(dilation_rate[0], 1, 1), + weight_attr=ParamAttr(initializer=kernel_initializer), + bias_attr=ParamAttr( + initializer=nn.initializer.Constant(value=0.)) if use_bias else use_bias) + self.high_to_low = nn.Conv3D(self.high_channels, self.low_channels, kernel_size=kernel_size, + dilation=dilation_rate, padding=(dilation_rate[0], 1, 1), + weight_attr=ParamAttr(initializer=kernel_initializer), + bias_attr=False) + self.low_to_high = nn.Conv3D(in_filters, self.high_channels, kernel_size=kernel_size, + dilation=dilation_rate, padding=(dilation_rate[0], 1, 1), + weight_attr=ParamAttr(initializer=kernel_initializer), + bias_attr=False) + self.low_to_low = nn.Conv3D(self.high_channels, self.low_channels, kernel_size=kernel_size, + dilation=dilation_rate, padding=(dilation_rate[0], 1, 1), + weight_attr=ParamAttr(initializer=kernel_initializer), + bias_attr=ParamAttr( + initializer=nn.initializer.Constant(value=0.)) if use_bias else use_bias) + self.upsampler = nn.Upsample(size=(1, 2, 2), data_format='NCDHW') + self.downsampler = nn.AvgPool3D(kernel_size=(1, 2, 2), stride=(1, 2, 2), padding=(0, 1, 1)) + + @staticmethod + def pad_to(tensor, target_shape): + shape = tensor.shape + padding = [[0, tar - curr] for curr, tar in zip(shape, target_shape)] + return functional.pad(tensor, padding, "CONSTANT", data_format='NCDHW') + + @staticmethod + def crop_to(tensor, target_width, target_height): + 
return tensor[:, :, :target_height, :target_width] + + def forward(self, inputs): + low_inputs, high_inputs = inputs + + high_to_high = self.high_to_high(high_inputs) + high_to_low = self.high_to_low(self.downsampler(high_inputs)) + + low_to_high = self.upsampler(self.low_to_high(low_inputs)) + low_to_low = self.low_to_low(low_inputs) + + high_output = high_to_high[:, :, :, :low_to_high.shape[3], :low_to_high.shape[4]] + low_to_high + low_output = low_to_low + high_to_low[:, :, :, :low_to_low.shape[3], :low_to_low.shape[4]] + + return low_output, high_output + + +class Conv3DConfigurable(nn.Layer): + def __init__(self, + in_filters, + filters, + dilation_rate, + separable=True, + octave=False, + use_bias=True): + super(Conv3DConfigurable, self).__init__() + assert not (separable and octave) + + if separable: + conv1 = nn.Conv3D(in_filters, 2 * filters, kernel_size=(1, 3, 3), + dilation=(1, 1, 1), padding=(0, 1, 1), + weight_attr=ParamAttr(initializer=nn.initializer.KaimingNormal()), + bias_attr=False) + conv2 = nn.Conv3D(2 * filters, filters, kernel_size=(3, 1, 1), + dilation=(dilation_rate, 1, 1), padding=(dilation_rate, 0, 0), + weight_attr=ParamAttr(initializer=nn.initializer.KaimingNormal()), + bias_attr=ParamAttr( + initializer=nn.initializer.Constant(value=0.)) if use_bias else use_bias) + self.layers = nn.LayerList([conv1, conv2]) + elif octave: + conv = OctConv3D(in_filters, filters, kernel_size=3, dilation_rate=(dilation_rate, 1, 1), + use_bias=use_bias, + kernel_initializer=nn.initializer.KaimingNormal()) + self.layers = [conv] + else: + conv = nn.Conv3D(in_filters, filters, kernel_size=3, + dilation=(dilation_rate, 1, 1), padding=(dilation_rate, 1, 1), + weight_attr=ParamAttr(initializer=nn.initializer.KaimingNormal()), + bias_attr=ParamAttr( + initializer=nn.initializer.Constant(value=0.)) if use_bias else use_bias) + self.layers = nn.LayerList([conv]) + + def forward(self, inputs): + x = inputs + for layer in self.layers: + x = layer(x) + return x + + +class DilatedDCNNV2(nn.Layer): + def __init__(self, + in_filters, + filters, + batch_norm=True, + activation=None, + octave_conv=False): + super(DilatedDCNNV2, self).__init__() + assert not (octave_conv and batch_norm) + + self.Conv3D_1 = Conv3DConfigurable(in_filters, filters, 1, use_bias=not batch_norm, octave=octave_conv) + self.Conv3D_2 = Conv3DConfigurable(in_filters, filters, 2, use_bias=not batch_norm, octave=octave_conv) + self.Conv3D_4 = Conv3DConfigurable(in_filters, filters, 4, use_bias=not batch_norm, octave=octave_conv) + self.Conv3D_8 = Conv3DConfigurable(in_filters, filters, 8, use_bias=not batch_norm, octave=octave_conv) + self.octave = octave_conv + + self.bn = nn.BatchNorm3D(filters * 4, momentum=0.99, epsilon=1e-03, + weight_attr=ParamAttr(initializer=nn.initializer.Constant(value=1.)), + bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.)) + ) if batch_norm else None + self.activation = activation + + def forward(self, inputs): + conv1 = self.Conv3D_1(inputs) + conv2 = self.Conv3D_2(inputs) + conv3 = self.Conv3D_4(inputs) + conv4 = self.Conv3D_8(inputs) + + # shape of convi[j]/convi is [B, 3, T, H, W], concat in channel dimension + if self.octave: + x = [paddle.concat([conv1[0], conv2[0], conv3[0], conv4[0]], axis=1), + paddle.concat([conv1[1], conv2[1], conv3[1], conv4[1]], axis=1)] + else: + x = paddle.concat([conv1, conv2, conv3, conv4], axis=1) + + if self.bn is not None: + x = self.bn(x) + + if self.activation is not None: + if self.octave: + x = [self.activation(x[0]), 
self.activation(x[1])] + else: + x = self.activation(x) + return x + + +class StackedDDCNNV2(nn.Layer): + def __init__(self, + in_filters, + n_blocks, + filters, + shortcut=True, + use_octave_conv=False, + pool_type="avg", + stochastic_depth_drop_prob=0.0): + super(StackedDDCNNV2, self).__init__() + assert pool_type == "max" or pool_type == "avg" + if use_octave_conv and pool_type == "max": + print("WARN: Octave convolution was designed with average pooling, not max pooling.") + + self.shortcut = shortcut + self.DDCNN = nn.LayerList([ + DilatedDCNNV2(in_filters if i == 1 else filters * 4, filters, octave_conv=use_octave_conv, + activation=functional.relu if i != n_blocks else None) for i in range(1, n_blocks + 1) + ]) + self.pool = nn.MaxPool3D(kernel_size=(1, 2, 2)) if pool_type == "max" else nn.AvgPool3D(kernel_size=(1, 2, 2)) + self.octave = use_octave_conv + self.stochastic_depth_drop_prob = stochastic_depth_drop_prob + + def forward(self, inputs): + x = inputs + shortcut = None + + if self.octave: + x = [self.pool(x), x] + for block in self.DDCNN: + x = block(x) + if shortcut is None: + shortcut = x + # shape of x[i] is [B, 3, T, H, W], concat in channel dimension + if self.octave: + x = paddle.concat([x[0], self.pool(x[1])], axis=1) + + x = functional.relu(x) + + if self.shortcut is not None: + if self.stochastic_depth_drop_prob != 0.: + if self.training: + if random.random() < self.stochastic_depth_drop_prob: + x = shortcut + else: + x = x + shortcut + else: + x = (1 - self.stochastic_depth_drop_prob) * x + shortcut + else: + x += shortcut + + if not self.octave: + x = self.pool(x) + return x + + +class ResNetBlock(nn.Layer): + def __init__(self, in_filters, filters, strides=(1, 1)): + super(ResNetBlock, self).__init__() + + self.conv1 = nn.Conv2D(in_filters, filters, kernel_size=(3, 3), stride=strides, padding=(1, 1), + weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()), + bias_attr=False) + self.bn1 = nn.BatchNorm2D(filters, + weight_attr=ParamAttr(initializer=nn.initializer.Constant(value=1.)), + bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.))) + + self.conv2 = nn.Conv2D(filters, filters, kernel_size=(3, 3), padding=(1, 1), + weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()), + bias_attr=False) + self.bn2 = nn.BatchNorm2D(filters, + weight_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.)), + bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.))) + + def forward(self, inputs): + x = self.conv1(inputs) + x = self.bn1(x) + x = functional.relu(x) + + x = self.conv2(x) + x = self.bn2(x) + + shortcut = inputs + x += shortcut + + return functional.relu(x) + + +class ResNetFeatures(nn.Layer): + def __init__(self, in_filters=3, + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]): + super(ResNetFeatures, self).__init__() + self.conv1 = nn.Conv2D(in_channels=in_filters, out_channels=64, kernel_size=(7, 7), + stride=(2, 2), padding=(3, 3), + weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()), + bias_attr=False) + self.bn1 = nn.BatchNorm2D(num_features=64, momentum=0.99, epsilon=1e-03, + weight_attr=ParamAttr(initializer=nn.initializer.Constant(value=1.)), + bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.)) + ) + self.max_pool = nn.MaxPool2D(kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) + + self.layer2a = ResNetBlock(64, 64) + self.layer2b = ResNetBlock(64, 64) + + self.mean = paddle.to_tensor(mean) + self.std = paddle.to_tensor(std) + + def forward(self, inputs): + shape = 
inputs.shape + x = paddle.reshape(inputs, [shape[0] * shape[2], shape[1], shape[3], shape[4]]) + x = (x - self.mean) / self.std + + x = self.conv1(x) + x = self.bn1(x) + x = functional.relu(x) + x = self.max_pool(x) + x = self.layer2a(x) + x = self.layer2b(x) + + new_shape = x.shape + x = paddle.reshape(x, [shape[0], new_shape[1], shape[2], new_shape[2], new_shape[3]]) + return x + + +class FrameSimilarity(nn.Layer): + def __init__(self, + in_filters, + similarity_dim=128, + lookup_window=101, + output_dim=128, + stop_gradient=False, + use_bias=False): + super(FrameSimilarity, self).__init__() + self.projection = nn.Linear(in_filters, similarity_dim, + weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()), + bias_attr=use_bias) + self.fc = nn.Linear(lookup_window, output_dim, + weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()), + bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.))) + + self.lookup_window = lookup_window + self.stop_gradient = stop_gradient + assert lookup_window % 2 == 1, "`lookup_window` must be odd integer" + + def forward(self, inputs): + x = paddle.concat([paddle.mean(x, axis=[3, 4]) for x in inputs], axis=1) + x = paddle.transpose(x, (0, 2, 1)) + + if self.stop_gradient: + x = x.stop_gradient + + x = self.projection(x) + x = functional.normalize(x, p=2, axis=2) + batch_size = paddle.slice(x.shape, starts=[0], ends=[1], axes=[0]) if x.shape[0] == -1 else x.shape[0] + time_window = x.shape[1] + similarities = paddle.bmm(x, x.transpose([0, 2, 1])) # [batch_size, time_window, time_window] + + similarities_padded = functional.pad(similarities, + [(self.lookup_window - 1) // 2, (self.lookup_window - 1) // 2], + data_format='NCL') + + batch_indices = paddle.arange(0, batch_size).reshape([batch_size, 1, 1]) + batch_indices = paddle.tile(batch_indices, [1, time_window, self.lookup_window]) + time_indices = paddle.arange(0, time_window).reshape([1, time_window, 1]) + time_indices = paddle.tile(time_indices, [batch_size, 1, self.lookup_window]) + lookup_indices = paddle.arange(0, self.lookup_window).reshape([1, 1, self.lookup_window]) + lookup_indices = paddle.tile(lookup_indices, [batch_size, time_window, 1]) + time_indices + indices = paddle.stack([batch_indices, time_indices, lookup_indices], -1) + similarities = paddle.gather_nd(similarities_padded, indices) + return functional.relu(self.fc(similarities)) + + +class ConvexCombinationRegularization(nn.Layer): + def __init__(self, in_filters, filters=32, delta_scale=10., loss_weight=0.01): + super(ConvexCombinationRegularization, self).__init__() + + self.projection = nn.Conv3D(in_filters, filters, kernel_size=1, dilation=1, padding=(0, 0, 0), + weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()), + bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.))) + self.features = nn.Conv3D((filters * 3), filters * 2, + kernel_size=(3, 3, 3), dilation=1, padding=(1, 1, 1), + weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()), + bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.))) + self.dense = nn.Linear(64, 1, weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()), bias_attr=True) + self.loss = nn.SmoothL1Loss(reduction='none') + self.delta_scale = delta_scale + self.loss_weight = loss_weight + + def forward(self, image_inputs, feature_inputs): + x = feature_inputs + x = self.projection(x) + x = functional.relu(x) + batch_size = x.shape[0] + window_size = x.shape[2] + first_frame = paddle.tile(x[:, :, :1], [1, 1, window_size, 
1, 1]) + last_frame = paddle.tile(x[:, :, -1:], [1, 1, window_size, 1, 1]) + x = paddle.concat([x, first_frame, last_frame], 1) + x = self.features(x) + x = functional.relu(x) + x = paddle.mean(x, axis=[3, 4]) + x = paddle.transpose(x, (0, 2, 1)) + alpha = self.dense(x) + alpha = paddle.transpose(alpha, (0, 2, 1)) + + first_img = paddle.tile(image_inputs[:, :, :1], [1, 1, window_size, 1, 1]) + last_img = paddle.tile(image_inputs[:, :, -1:], [1, 1, window_size, 1, 1]) + + alpha_ = functional.sigmoid(alpha) + alpha_ = paddle.reshape(alpha_, [batch_size, 1, window_size, 1, 1]) + predictions_ = (alpha_ * first_img + (1 - alpha_) * last_img) + loss_ = self.loss(label=image_inputs / self.delta_scale, input=predictions_ / self.delta_scale) + loss_ = self.loss_weight * paddle.mean(loss_) + return alpha, loss_ + + +class ColorHistograms(nn.Layer): + def __init__(self, + lookup_window=101, + output_dim=None): + super(ColorHistograms, self).__init__() + + self.fc = nn.Linear(lookup_window, output_dim, + weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()), + bias_attr=ParamAttr( + initializer=nn.initializer.Constant(value=0.))) if output_dim is not None else None + self.lookup_window = lookup_window + assert lookup_window % 2 == 1, "`lookup_window` must be odd integer" + + def compute_color_histograms(self, frames): + frames = frames.astype('int32') + + def get_bin(frames): + # returns 0 .. 511 + R, G, B = frames[:, :, 0], frames[:, :, 1], frames[:, :, 2] + R, G, B = R // 32, G // 32, B // 32 + return (R * 64) + (G * 8) + B + + batch_size = paddle.slice(frames.shape, starts=[0], ends=[1], axes=[0]) if frames.shape[0] == -1 else frames.shape[0] + time_window, height, width, no_channels = frames.shape[1:] + + assert no_channels == 3 or no_channels == 6 + if no_channels == 3: + frames_flatten = frames.reshape([-1, height * width, 3]) + else: + frames_flatten = frames.reshape([-1, height * width * 2, 3]) + + binned_values = get_bin(frames_flatten) + + frame_bin_prefix = (paddle.arange(0, batch_size * time_window) * 512).reshape([-1, 1]) + binned_values = (binned_values + frame_bin_prefix).reshape([-1, 1]) + histograms = paddle.zeros_like(frame_bin_prefix, dtype='int32').tile([512]).reshape([-1]) + histograms = histograms.scatter_nd_add(binned_values, paddle.ones_like(binned_values, dtype='int32').reshape([-1])) + histograms = histograms.reshape([batch_size, time_window, 512]).astype('float32') + histograms_normalized = functional.normalize(histograms, p=2, axis=2) + return histograms_normalized + + def forward(self, inputs): + x = self.compute_color_histograms(inputs) + batch_size = paddle.slice(x.shape, starts=[0], ends=[1], axes=[0]) if x.shape[0] == -1 else x.shape[0] + time_window = x.shape[1] + similarities = paddle.bmm(x, x.transpose([0, 2, 1])) # [batch_size, time_window, time_window] + similarities_padded = functional.pad(similarities, + [(self.lookup_window - 1) // 2, (self.lookup_window - 1) // 2], + data_format='NCL') + + batch_indices = paddle.arange(0, batch_size).reshape([batch_size, 1, 1]) + batch_indices = paddle.tile(batch_indices, [1, time_window, self.lookup_window]) + time_indices = paddle.arange(0, time_window).reshape([1, time_window, 1]) + time_indices = paddle.tile(time_indices, [batch_size, 1, self.lookup_window]) + lookup_indices = paddle.arange(0, self.lookup_window).reshape([1, 1, self.lookup_window]) + lookup_indices = paddle.tile(lookup_indices, [batch_size, time_window, 1]) + time_indices + + indices = paddle.stack([batch_indices, time_indices, lookup_indices], 
-1) + similarities = paddle.gather_nd(similarities_padded, indices) + + if self.fc is not None: + return functional.relu(self.fc(similarities)) + return similarities + + +@BACKBONES.register() +class TransNetV2(nn.Layer): + """TransNetV2 model from + `"TransNet V2: An effective deep network architecture for fast shot transition detection" `_ + """ + def __init__(self, + F=16, L=3, S=2, D=1024, + use_many_hot_targets=True, + use_frame_similarity=True, + use_color_histograms=True, + use_mean_pooling=False, + dropout_rate=0.5, + use_convex_comb_reg=False, + use_resnet_features=False, + use_resnet_like_top=False, + frame_similarity_on_last_layer=False, + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]): + super(TransNetV2, self).__init__() + + self.mean = np.array(mean, np.float32).reshape([1, 3, 1, 1]) * 255 + self.std = np.array(std, np.float32).reshape([1, 3, 1, 1]) * 255 + + self.use_resnet_features = use_resnet_features + self.resnet_layers = ResNetFeatures(in_filters=3, mean=self.mean, std=self.std) if self.use_resnet_features else None + self.resnet_like_top = use_resnet_like_top + if self.resnet_like_top: + self.resnet_like_top_conv = nn.Conv3D(64 if self.use_resnet_features else 3, 32, kernel_size=(3, 7, 7), + stride=(1, 2, 2), + padding=(1, 3, 3), + weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()), + bias_attr=False) + self.resnet_like_top_bn = nn.BatchNorm3D(32, momentum=0.99, epsilon=1e-03, + weight_attr=ParamAttr( + initializer=nn.initializer.Constant(value=1.)), + bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.))) + self.resnet_like_top_max_pool = nn.MaxPool3D(kernel_size=(1, 3, 3), stride=(1, 2, 2), + padding=(0, 1, 1)) + + if self.resnet_like_top: + in_filters = 32 + elif self.use_resnet_features: + in_filters = 64 + else: + in_filters = 3 + self.SDDCNN = nn.LayerList( + [StackedDDCNNV2(in_filters=in_filters, n_blocks=S, filters=F, + stochastic_depth_drop_prob=0.)] + + [StackedDDCNNV2(in_filters=(F * 2 ** (i - 1)) * 4, n_blocks=S, filters=F * 2 ** i) for i in range(1, L)] + ) + + self.frame_sim_layer = FrameSimilarity( + sum([(F * 2 ** i) * 4 for i in range(L)]), lookup_window=101, output_dim=128, similarity_dim=128, + use_bias=True + ) if use_frame_similarity else None + self.color_hist_layer = ColorHistograms( + lookup_window=101, output_dim=128 + ) if use_color_histograms else None + + self.dropout = nn.Dropout(dropout_rate) if dropout_rate is not None else None + + output_dim = ((F * 2 ** (L - 1)) * 4) * 3 * 6 # 3x6 for spatial dimensions + if use_frame_similarity: output_dim += 128 + if use_color_histograms: output_dim += 128 + + self.use_mean_pooling = use_mean_pooling + + self.has_downsample = False + if self.use_resnet_features or self.resnet_like_top or self.use_mean_pooling: + self.has_downsample = True + self.fc1 = nn.Linear(512 if self.has_downsample else output_dim, D, + weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()), + bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.)) + ) + self.frame_similarity_on_last_layer = frame_similarity_on_last_layer + self.cls_layer1 = nn.Linear(1152 if self.frame_similarity_on_last_layer else D, 1, + weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()), + bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.)) + ) + self.cls_layer2 = nn.Linear(1152 if self.frame_similarity_on_last_layer else D, 1, + weight_attr=ParamAttr(initializer=nn.initializer.XavierUniform()), + bias_attr=ParamAttr(initializer=nn.initializer.Constant(value=0.)) + ) 
if use_many_hot_targets else None + + self.convex_comb_reg = ConvexCombinationRegularization( + in_filters=(F * 2 ** (L - 1) * 4)) if use_convex_comb_reg else None + + def forward(self, inputs): + assert list(inputs.shape[2:]) == [27, 48, 3] and inputs.dtype == paddle.float32, \ + "incorrect input type and/or shape" + out_dict = {} + + # shape [B, T, H, W, 3] to shape [B, 3, T, H, W] + x = inputs.transpose([0, 4, 1, 2, 3]) + if self.use_resnet_features: + x = self.resnet_layers(x) + else: + x = x / 255. + inputs = inputs.clip(min=0).astype('uint8') + if self.resnet_like_top: + x = self.resnet_like_top_conv(x) + x = self.resnet_like_top_bn(x) + x = self.resnet_like_top_max_pool(x) + block_features = [] + for block in self.SDDCNN: + x = block(x) + block_features.append(x) + if self.convex_comb_reg is not None: + out_dict["alphas"], out_dict["comb_reg_loss"] = self.convex_comb_reg(inputs.transpose([0, 4, 1, 2, 3]), x) + if self.use_mean_pooling: + x = paddle.mean(x, axis=[3, 4]) + x = x.transpose([0, 2, 1]) + else: + x = x.transpose([0, 2, 3, 4, 1]) + x = x.reshape([x.shape[0], x.shape[1], x.shape[2]*x.shape[3]*x.shape[4]]) + if self.frame_sim_layer is not None: + x = paddle.concat([self.frame_sim_layer(block_features), x], 2) + if self.color_hist_layer is not None: + x = paddle.concat([self.color_hist_layer(inputs), x], 2) + x = self.fc1(x) + x = functional.relu(x) + if self.dropout is not None: + x = self.dropout(x) + if self.frame_sim_layer is not None and self.frame_similarity_on_last_layer: + x = paddle.concat([self.frame_sim_layer(block_features), x], 2) + one_hot = self.cls_layer1(x) + if self.cls_layer2 is not None: + out_dict["many_hot"] = self.cls_layer2(x) + + if len(out_dict) > 0: + return one_hot, out_dict + + return one_hot + diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/vit.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/vit.py new file mode 100644 index 0000000..84f434f --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/vit.py @@ -0,0 +1,465 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections.abc import Callable + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn.initializer import Constant + +from ...utils import load_ckpt +from ..registry import BACKBONES +from ..weight_init import trunc_normal_ + +__all__ = ['VisionTransformer'] + +zeros_ = Constant(value=0.) +ones_ = Constant(value=1.) + + +def to_2tuple(x): + return tuple([x] * 2) + + +def drop_path(x, drop_prob=0., training=False): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + # issuecomment-532968956 ... + See discussion: https://github.com/tensorflow/tpu/issues/494 + """ + if drop_prob == 0. 
or not training: + return x + keep_prob = paddle.to_tensor(1 - drop_prob, dtype=x.dtype) + shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1) + random_tensor = keep_prob + paddle.rand(shape).astype(x.dtype) + random_tensor = paddle.floor(random_tensor) # binarize + output = x.divide(keep_prob) * random_tensor + + return output + + +class DropPath(nn.Layer): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + """ + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + +class Identity(nn.Layer): + def __init__(self): + super(Identity, self).__init__() + + def forward(self, input): + return input + + +class Mlp(nn.Layer): + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.0): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(nn.Layer): + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0.0, + proj_drop=0.0): + super().__init__() + + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + self.attn_drop = nn.Dropout(attn_drop) + + def forward(self, x): + N, C = x.shape[1:] + qkv = self.qkv(x).reshape( + (-1, N, 3, self.num_heads, C // self.num_heads)).transpose( + (2, 0, 3, 1, 4)) + q, k, v = qkv[0], qkv[1], qkv[2] + + attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale + attn = nn.functional.softmax(attn, axis=-1) + attn = self.attn_drop(attn) + + x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((-1, N, C)) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Layer): + def __init__(self, + dim, + num_heads, + mlp_ratio=4.0, + qkv_bias=False, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + drop_path=0.1, + act_layer=nn.GELU, + norm_layer='nn.LayerNorm', + epsilon=1e-5, + attention_type='divided_space_time'): + + super().__init__() + self.attention_type = attention_type + if isinstance(norm_layer, str): + self.norm1 = eval(norm_layer)(dim, epsilon=epsilon) + elif isinstance(norm_layer, Callable): + self.norm1 = norm_layer(dim, epsilon=epsilon) + else: + raise TypeError( + "The norm_layer must be str or paddle.nn.layer.Layer class") + + self.attn = Attention(dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop) + + # Temporal Attention Parameters + if self.attention_type == 'divided_space_time': + if isinstance(norm_layer, str): + self.temporal_norm1 = eval(norm_layer)(dim, epsilon=epsilon) + elif isinstance(norm_layer, Callable): + self.temporal_norm1 = norm_layer(dim, epsilon=epsilon) + else: + raise TypeError( + "The norm_layer must be str or paddle.nn.layer.Layer class") + self.temporal_attn = Attention(dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop) + self.temporal_fc = nn.Linear(dim, dim) + + # NOTE: drop path for 
stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity() + if isinstance(norm_layer, str): + self.norm2 = eval(norm_layer)(dim, epsilon=epsilon) + elif isinstance(norm_layer, Callable): + self.norm2 = norm_layer(dim, epsilon=epsilon) + else: + raise TypeError( + "The norm_layer must be str or paddle.nn.layer.Layer class") + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + + def forward(self, x, B, T, W): + num_spatial_tokens = (x.shape[1] - 1) // T + H = num_spatial_tokens // W + if self.attention_type in ['space_only', 'joint_space_time']: + x = x + self.drop_path(self.attn(self.norm1(x))) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + elif self.attention_type == 'divided_space_time': + ########## Temporal ########## + xt = x[:, 1:, :] + _, _, _, _t, _m = B, H, W, T, xt.shape[-1] + xt = xt.reshape([-1, _t, _m]) + + res_temporal = self.drop_path( + self.temporal_attn(self.temporal_norm1(xt))) + + _, _h, _w, _t, _m = B, H, W, T, res_temporal.shape[-1] + res_temporal = res_temporal.reshape([-1, _h * _w * _t, _m]) + + res_temporal = self.temporal_fc(res_temporal) + xt = x[:, 1:, :] + res_temporal + + ########## Spatial ########## + init_cls_token = x[:, 0, :].unsqueeze(1) + cls_token = init_cls_token.tile((1, T, 1)) + _b, _t, _m = cls_token.shape + cls_token = cls_token.reshape([-1, _m]).unsqueeze(1) + + xs = xt + _, _h, _w, _t, _m = B, H, W, T, xs.shape[-1] + xs = xs.reshape([-1, _h, _w, _t, _m]).transpose( + (0, 3, 1, 2, 4)).reshape([-1, _h * _w, _m]) + xs = paddle.concat((cls_token, xs), axis=1) + res_spatial = self.drop_path(self.attn(self.norm1(xs))) + + # Taking care of CLS token + cls_token = res_spatial[:, 0, :] + _, _t, _m = B, T, cls_token.shape[-1] + cls_token = cls_token.reshape([-1, _t, _m]) + # averaging for every frame + cls_token = paddle.mean(cls_token, axis=1, keepdim=True) + + res_spatial = res_spatial[:, 1:, :] + _, _t, _h, _w, _m = B, T, H, W, res_spatial.shape[-1] + res_spatial = res_spatial.reshape([-1, _t, _h, _w, _m]).transpose( + (0, 2, 3, 1, 4)).reshape([-1, _h * _w * _t, _m]) + + res = res_spatial + x = xt + x = paddle.concat((init_cls_token, x), axis=1) + paddle.concat( + (cls_token, res), axis=1) + + # Mlp + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + else: + raise NotImplementedError + + +class PatchEmbed(nn.Layer): + """ Image to Patch Embedding + """ + def __init__(self, + img_size=224, + patch_size=16, + in_channels=3, + embed_dim=768): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // + patch_size[0]) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + + self.proj = nn.Conv2D(in_channels, + embed_dim, + kernel_size=patch_size, + stride=patch_size) + + def forward(self, x): + B, C, T, H, W = x.shape + + assert H == self.img_size[0] and W == self.img_size[1], \ + f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." 
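        # Editor's note (annotation, not part of the patch): the reshapes below fold the
        # temporal axis into the batch axis and patchify each frame. With the defaults
        # (img_size=224, patch_size=16, embed_dim=768) an input of shape [B, 3, T, 224, 224]
        # becomes x of shape [B*T, 14*14, 768]; T is returned unchanged and W is re-bound
        # to the patch-grid width (14), which forward_features later uses to recover the
        # spatial layout of the tokens.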
+ x = x.transpose((0, 2, 1, 3, 4)) + x = x.reshape([-1, C, H, W]) + x = self.proj(x) + W = x.shape[-1] + x = x.flatten(2).transpose((0, 2, 1)) + return x, T, W + + +@BACKBONES.register() +class VisionTransformer(nn.Layer): + """ Vision Transformer with support for patch input + """ + def __init__(self, + pretrained=None, + img_size=224, + patch_size=16, + in_channels=3, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=False, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.1, + norm_layer='nn.LayerNorm', + epsilon=1e-5, + num_seg=8, + attention_type='divided_space_time', + **args): + super().__init__() + self.pretrained = pretrained + self.num_seg = num_seg + self.attention_type = attention_type + self.num_features = self.embed_dim = embed_dim + + self.patch_embed = PatchEmbed(img_size=img_size, + patch_size=patch_size, + in_channels=in_channels, + embed_dim=embed_dim) + num_patches = self.patch_embed.num_patches + + # Positional Embeddings + self.cls_token = self.create_parameter(shape=(1, 1, embed_dim), + default_initializer=zeros_) + self.pos_embed = self.create_parameter(shape=(1, num_patches + 1, + embed_dim), + default_initializer=zeros_) + self.pos_drop = nn.Dropout(p=drop_rate) + + if self.attention_type != 'space_only': + self.time_embed = self.create_parameter(shape=(1, num_seg, + embed_dim), + default_initializer=zeros_) + self.time_drop = nn.Dropout(p=drop_rate) + + self.add_parameter("pos_embed", self.pos_embed) + self.add_parameter("cls_token", self.cls_token) + + dpr = np.linspace(0, drop_path_rate, depth) + + self.blocks = nn.LayerList([ + Block(dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + epsilon=epsilon, + attention_type=self.attention_type) for i in range(depth) + ]) + + self.norm = eval(norm_layer)(embed_dim, epsilon=epsilon) + + def init_weights(self): + """First init model's weight""" + trunc_normal_(self.pos_embed, std=0.02) + trunc_normal_(self.cls_token, std=0.02) + self.apply(self._init_fn) + + if self.attention_type == 'divided_space_time': + i = 0 + for m in self.blocks.sublayers(include_self=True): + m_str = str(m) + if 'Block' in m_str: + if i > 0: + zeros_(m.temporal_fc.weight) + zeros_(m.temporal_fc.bias) + i += 1 + """Second, if provide pretrained ckpt, load it""" + if isinstance( + self.pretrained, str + ) and self.pretrained.strip() != "": # load pretrained weights + load_ckpt(self, + self.pretrained, + num_patches=self.patch_embed.num_patches, + num_seg=self.num_seg, + attention_type=self.attention_type) + + def _init_fn(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + ones_(m.weight) + zeros_(m.bias) + + def forward_features(self, x): + # B = x.shape[0] + B = paddle.shape(x)[0] + x, T, W = self.patch_embed(x) # [BT,nH*nW,F] + cls_tokens = self.cls_token.expand((B * T, -1, -1)) # [1,1,F]->[BT,1,F] + x = paddle.concat((cls_tokens, x), axis=1) + pos_interp = (x.shape[1] != self.pos_embed.shape[1]) + if pos_interp: + pos_embed = self.pos_embed + cls_pos_embed = pos_embed[0, 0, :].unsqueeze(0).unsqueeze(1) + other_pos_embed = pos_embed[0, 1:, :].unsqueeze(0).transpose( + (0, 2, 1)) + P = int(other_pos_embed.shape[2]**0.5) + H = x.shape[1] // W + other_pos_embed = other_pos_embed.reshape([1, x.shape[2], P, P]) + new_pos_embed = F.interpolate(other_pos_embed, + size=(H, 
W), + mode='nearest') + new_pos_embed = new_pos_embed.flatten(2) + new_pos_embed = new_pos_embed.transpose((0, 2, 1)) + new_pos_embed = paddle.concat((cls_pos_embed, new_pos_embed), + axis=1) + x = x + new_pos_embed + else: + x = x + self.pos_embed + + x = self.pos_drop(x) + + # Time Embeddings + if self.attention_type != 'space_only': + cls_tokens = x[:B, 0, :].unsqueeze(1) if B > 0 else x.split( + T)[0].index_select(paddle.to_tensor([0]), axis=1) + x = x[:, 1:] + _, _n, _m = x.shape + _t = T + x = x.reshape([-1, _t, _n, _m]).transpose( + (0, 2, 1, 3)).reshape([-1, _t, _m]) + # Resizing time embeddings in case they don't match + time_interp = (T != self.time_embed.shape[1]) + if time_interp: # T' != T + time_embed = self.time_embed.transpose((0, 2, 1)).unsqueeze(0) + new_time_embed = F.interpolate(time_embed, + size=(T, x.shape[-1]), + mode='nearest').squeeze(0) + new_time_embed = new_time_embed.transpose((0, 2, 1)) + x = x + new_time_embed + else: + x = x + self.time_embed + + x = self.time_drop(x) + _, _t, _m = x.shape + x = x.reshape([-1, W * W * T, _m]) + x = paddle.concat((cls_tokens, x), axis=1) + + # Attention blocks + for blk in self.blocks: + x = blk(x, B, T, W) + + # Predictions for space-only baseline + if self.attention_type == 'space_only': + _, _n, _m = x.shape + _t = T + x = x.reshape([-1, _t, _n, _m]) + x = paddle.mean(x, 1) # averaging predictions for every frame + + x = self.norm(x) + return x[:, 0] # [B, embed_dim] + + def forward(self, x): + x = self.forward_features(x) + return x diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/vit_tweaks.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/vit_tweaks.py new file mode 100644 index 0000000..a20af30 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/vit_tweaks.py @@ -0,0 +1,515 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections.abc import Callable + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.nn.initializer import Constant +from paddle.regularizer import L2Decay + +from ...utils import load_ckpt +from ..registry import BACKBONES +from ..weight_init import trunc_normal_ + +__all__ = ['VisionTransformer_tweaks'] + +zeros_ = Constant(value=0.) +ones_ = Constant(value=1.) + + +def to_2tuple(x): + return tuple([x] * 2) + + +def rand_bbox(size, lam): + """ rand_bbox """ + w = size[2] + h = size[3] + cut_rat = np.sqrt(1. 
- lam) + cut_w = np.int(w * cut_rat) + cut_h = np.int(h * cut_rat) + + # uniform + cx = np.random.randint(w) + cy = np.random.randint(h) + + bbx1 = np.clip(cx - cut_w // 2, 0, w) + bby1 = np.clip(cy - cut_h // 2, 0, h) + bbx2 = np.clip(cx + cut_w // 2, 0, w) + bby2 = np.clip(cy + cut_h // 2, 0, h) + + return bbx1, bby1, bbx2, bby2 + + +def drop_path(x, drop_prob=0., training=False): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + # issuecomment-532968956 ... + See discussion: https://github.com/tensorflow/tpu/issues/494 + """ + if drop_prob == 0. or not training: + return x + keep_prob = paddle.to_tensor(1 - drop_prob) + shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1) + random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype) + random_tensor = paddle.floor(random_tensor) # binarize + output = x.divide(keep_prob) * random_tensor + + return output + + +class DropPath(nn.Layer): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + """ + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + +class Identity(nn.Layer): + def __init__(self): + super(Identity, self).__init__() + + def forward(self, input): + return input + + +class Mlp(nn.Layer): + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0., + wd_bias=True, + lr_mult=1.0): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(nn.Layer): + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0., + wd_bias=True, + lr_mult=1.0): + super().__init__() + + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + self.attn_drop = nn.Dropout(attn_drop) + + def forward(self, x): + N, C = x.shape[1:] + qkv = self.qkv(x).reshape( + (-1, N, 3, self.num_heads, C // self.num_heads)).transpose( + (2, 0, 3, 1, 4)) + q, k, v = qkv[0], qkv[1], qkv[2] + + attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale + attn = nn.functional.softmax(attn, axis=-1) + attn = self.attn_drop(attn) + + x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((-1, N, C)) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Layer): + def __init__(self, + dim, + num_heads, + mlp_ratio=4.0, + qkv_bias=False, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + drop_path=0.1, + act_layer=nn.GELU, + norm_layer='nn.LayerNorm', + epsilon=1e-5, + attention_type='divided_space_time', + wd_bias=True, + lr_mult=1.0): + + super().__init__() + self.attention_type = attention_type + if isinstance(norm_layer, str): + self.norm1 = eval(norm_layer)(dim, epsilon=epsilon) + elif isinstance(norm_layer, Callable): + self.norm1 = norm_layer(dim, epsilon=epsilon) + else: + raise TypeError( + "The 
norm_layer must be str or paddle.nn.layer.Layer class") + + self.attn = Attention(dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + wd_bias=wd_bias, + lr_mult=lr_mult) + + # Temporal Attention Parameters + if self.attention_type == 'divided_space_time': + if isinstance(norm_layer, str): + self.temporal_norm1 = eval(norm_layer)(dim, epsilon=epsilon) + elif isinstance(norm_layer, Callable): + self.temporal_norm1 = norm_layer(dim, epsilon=epsilon) + else: + raise TypeError( + "The norm_layer must be str or paddle.nn.layer.Layer class") + self.temporal_attn = Attention(dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + wd_bias=wd_bias, + lr_mult=lr_mult) + self.temporal_fc = nn.Linear(dim, dim) + + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity() + if isinstance(norm_layer, str): + self.norm2 = eval(norm_layer)(dim, epsilon=epsilon) + elif isinstance(norm_layer, Callable): + self.norm2 = norm_layer(dim, epsilon=epsilon) + else: + raise TypeError( + "The norm_layer must be str or paddle.nn.layer.Layer class") + + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop, + wd_bias=wd_bias, + lr_mult=lr_mult) + + def forward(self, x, B, T, W): + num_spatial_tokens = (x.shape[1] - 1) // T + H = num_spatial_tokens // W + if self.attention_type in ['space_only', 'joint_space_time']: + x = paddle.add(x, self.drop_path(self.attn(self.norm1(x)))) + x = paddle.add(x, self.drop_path(self.mlp(self.norm2(x)))) + return x + elif self.attention_type == 'divided_space_time': + ########## Temporal ########## + xt = x[:, 1:, :] + _, _, _, _t, _m = B, H, W, T, xt.shape[-1] + xt = xt.reshape([-1, _t, _m]) + + res_temporal = self.drop_path( + self.temporal_attn(self.temporal_norm1(xt))) + + _, _h, _w, _t, _m = B, H, W, T, res_temporal.shape[-1] + res_temporal = res_temporal.reshape([-1, _h * _w * _t, _m]) + + res_temporal = self.temporal_fc(res_temporal) + xt = paddle.add(x[:, 1:, :], res_temporal) + + ########## Spatial ########## + init_cls_token = x[:, 0, :].unsqueeze(1) + cls_token = init_cls_token.tile((1, T, 1)) + _b, _t, _m = cls_token.shape + cls_token = cls_token.reshape([-1, _m]).unsqueeze(1) + + xs = xt + _, _h, _w, _t, _m = B, H, W, T, xs.shape[-1] + xs = xs.reshape([-1, _h, _w, _t, _m]).transpose( + (0, 3, 1, 2, 4)).reshape([-1, _h * _w, _m]) + xs = paddle.concat((cls_token, xs), axis=1) + res_spatial = self.drop_path(self.attn(self.norm1(xs))) + + # Taking care of CLS token + cls_token = res_spatial[:, 0, :] + _, _t, _m = B, T, cls_token.shape[-1] + cls_token = cls_token.reshape([-1, _t, _m]) + # averaging for every frame + cls_token = paddle.mean(cls_token, axis=1, keepdim=True) + + res_spatial = res_spatial[:, 1:, :] + _, _t, _h, _w, _m = B, T, H, W, res_spatial.shape[-1] + res_spatial = res_spatial.reshape([-1, _t, _h, _w, _m]).transpose( + (0, 2, 3, 1, 4)).reshape([-1, _h * _w * _t, _m]) + + res = res_spatial + x = xt + x = paddle.add(paddle.concat((init_cls_token, x), axis=1), + paddle.concat((cls_token, res), axis=1)) + # Mlp + x = paddle.add(x, self.drop_path(self.mlp(self.norm2(x)))) + return x + else: + raise NotImplementedError + + +class PatchEmbed(nn.Layer): + """ Image to Patch Embedding + """ + def __init__(self, + img_size=224, + patch_size=16, + in_channels=3, + 
embed_dim=768, + wd_bias=True, + lr_mult=1.0): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // + patch_size[0]) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + + self.proj = nn.Conv2D(in_channels, + embed_dim, + kernel_size=patch_size, + stride=patch_size) + + def forward(self, x): + B, C, T, H, W = x.shape + + assert H == self.img_size[0] and W == self.img_size[1], \ + f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." + x = x.transpose((0, 2, 1, 3, 4)) # [B,T,C,H,W] + x = x.reshape([-1, C, H, W]) # [BT,C,H,W] + x = self.proj(x) # [BT,F,nH,nW] + W = x.shape[-1] + x = x.flatten(2).transpose((0, 2, 1)) # [BT,F,nHnW] + return x, T, W + + +@BACKBONES.register() +class VisionTransformer_tweaks(nn.Layer): + """ Vision Transformer with support for patch input + """ + def __init__(self, + pretrained=None, + img_size=224, + patch_size=16, + in_channels=3, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=False, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.1, + norm_layer='nn.LayerNorm', + epsilon=1e-5, + num_seg=8, + attention_type='divided_space_time', + wd_bias=True, + lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0], + **args): + super().__init__() + self.pretrained = pretrained + self.num_seg = num_seg + self.attention_type = attention_type + self.lr_mult_list = lr_mult_list + self.num_features = self.embed_dim = embed_dim + + self.patch_embed = PatchEmbed(img_size=img_size, + patch_size=patch_size, + in_channels=in_channels, + embed_dim=embed_dim, + wd_bias=wd_bias, + lr_mult=self.lr_mult_list[0]) + num_patches = self.patch_embed.num_patches + + # Positional Embeddings + self.cls_token = self.create_parameter( + shape=(1, 1, embed_dim), + default_initializer=zeros_, + attr=ParamAttr(regularizer=L2Decay(0.0))) + self.pos_embed = self.create_parameter( + shape=(1, num_patches + 1, embed_dim), + default_initializer=zeros_, + attr=ParamAttr(regularizer=L2Decay(0.0))) + self.pos_drop = nn.Dropout(p=drop_rate) + + if self.attention_type != 'space_only': + self.time_embed = self.create_parameter( + shape=(1, num_seg, embed_dim), + default_initializer=zeros_, + attr=ParamAttr(regularizer=L2Decay(0.0))) + self.time_drop = nn.Dropout(p=drop_rate) + + self.add_parameter("pos_embed", self.pos_embed) + self.add_parameter("cls_token", self.cls_token) + + dpr = np.linspace(0, drop_path_rate, depth) + + self.blocks = nn.LayerList([ + Block(dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + epsilon=epsilon, + attention_type=self.attention_type, + wd_bias=wd_bias, + lr_mult=self.lr_mult_list[(i // 4) + 1]) for i in range(depth) + ]) + + self.norm = eval(norm_layer)(embed_dim, epsilon=epsilon) + + def init_weights(self): + """First init model's weight""" + trunc_normal_(self.pos_embed, std=0.02) + trunc_normal_(self.cls_token, std=0.02) + self.apply(self._init_fn) + + if self.attention_type == 'divided_space_time': + i = 0 + for m in self.blocks.sublayers(include_self=True): + m_str = str(m) + if 'Block' in m_str: + if i > 0: + zeros_(m.temporal_fc.weight) + zeros_(m.temporal_fc.bias) + i += 1 + """Second, if provide pretrained ckpt, load it""" + if isinstance( + self.pretrained, str + ) and self.pretrained.strip() != "": # load pretrained 
weights + load_ckpt(self, + self.pretrained, + num_patches=self.patch_embed.num_patches, + num_seg=self.num_seg, + attention_type=self.attention_type) + elif self.pretrained is None or self.pretrained.strip() == "": + pass + else: + raise NotImplementedError + + def _init_fn(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + ones_(m.weight) + zeros_(m.bias) + + def forward_features(self, x): + # B = x.shape[0] + B = paddle.shape(x)[0] + x, T, W = self.patch_embed(x) # [BT,nH*nW,F] + cls_tokens = self.cls_token.expand((B * T, -1, -1)) # [1,1,F]->[BT,1,F] + x = paddle.concat((cls_tokens, x), axis=1) + pos_interp = (x.shape[1] != self.pos_embed.shape[1]) + if pos_interp: + pos_embed = self.pos_embed + cls_pos_embed = pos_embed[0, 0, :].unsqueeze(0).unsqueeze(1) + other_pos_embed = pos_embed[0, 1:, :].unsqueeze(0).transpose( + (0, 2, 1)) + P = int(other_pos_embed.shape[2]**0.5) + H = x.shape[1] // W + other_pos_embed = other_pos_embed.reshape([1, x.shape[2], P, P]) + new_pos_embed = F.interpolate(other_pos_embed, + size=(H, W), + mode='nearest') + new_pos_embed = new_pos_embed.flatten(2) + new_pos_embed = new_pos_embed.transpose((0, 2, 1)) + new_pos_embed = paddle.concat((cls_pos_embed, new_pos_embed), + axis=1) + x = paddle.add(x, new_pos_embed) + else: + x = paddle.add(x, self.pos_embed) + + x = self.pos_drop(x) + + # Time Embeddings + if self.attention_type != 'space_only': + cls_tokens = x[:B, 0, :].unsqueeze(1) if B > 0 else x.split( + T)[0].index_select(paddle.to_tensor([0]), axis=1) + x = x[:, 1:] + _, _n, _m = x.shape + _t = T + x = x.reshape([-1, _t, _n, _m]).transpose( + (0, 2, 1, 3)).reshape([-1, _t, _m]) + # Resizing time embeddings in case they don't match + time_interp = (T != self.time_embed.shape[1]) + if time_interp: # T' != T + time_embed = self.time_embed.transpose((0, 2, 1)).unsqueeze(0) + new_time_embed = F.interpolate(time_embed, + size=(T, x.shape[-1]), + mode='nearest').squeeze(0) + new_time_embed = new_time_embed.transpose((0, 2, 1)) + x = paddle.add(x, new_time_embed) + else: + x = paddle.add(x, self.time_embed) + + x = self.time_drop(x) + _, _t, _m = x.shape + x = x.reshape([-1, W * W * T, _m]) + x = paddle.concat((cls_tokens, x), axis=1) + + # Attention blocks + for blk in self.blocks: + x = blk(x, B, T, W) + + # Predictions for space-only baseline + if self.attention_type == 'space_only': + _, _n, _m = x.shape + _t = T + x = x.reshape([-1, _t, _n, _m]) + x = paddle.mean(x, 1) # averaging predictions for every frame + + x = self.norm(x) + return x[:, 0] # [B, embed_dim] + + def forward(self, x): + x = self.forward_features(x) + return x diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/backbones/yowo.py b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/yowo.py new file mode 100644 index 0000000..5e6b88d --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/backbones/yowo.py @@ -0,0 +1,150 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from ..registry import BACKBONES +from .darknet import Darknet +from .resnext101 import ResNext101 +import paddle.nn as nn +import paddle + + +class CAM_Module(nn.Layer): + def __init__(self, in_dim): + super(CAM_Module, self).__init__() + self.chanel_in = in_dim + temp = paddle.zeros([1], dtype='float32') + self.gamma = paddle.create_parameter(shape=temp.shape, dtype=str(temp.numpy().dtype), + default_initializer=paddle.nn.initializer.Assign(temp)) + self.softmax = nn.Softmax(axis=-1) + + def forward(self, x): + m_batchsize, C, height, width = x.shape + proj_query = paddle.reshape(x, [m_batchsize, C, -1]) + proj_key = paddle.transpose(paddle.reshape( + x, [m_batchsize, C, -1]), perm=[0, 2, 1]) + energy = paddle.bmm(proj_query, proj_key) + energy_new = paddle.expand_as(paddle.max( + energy, axis=-1, keepdim=True), energy) - energy + attention = self.softmax(energy_new) + proj_value = paddle.reshape(x, [m_batchsize, C, -1]) + + out = paddle.bmm(attention, proj_value) + out = out.reshape([m_batchsize, C, height, width]) + out = self.gamma * out + x + return out + + +class CFAMBlock(nn.Layer): + def __init__(self, in_channels, out_channels): + super(CFAMBlock, self).__init__() + inter_channels = 1024 + self.conv_bn_relu1 = nn.Sequential(nn.Conv2D(in_channels, inter_channels, kernel_size=1, bias_attr=False), + nn.BatchNorm2D(inter_channels), + nn.ReLU()) + self.conv_bn_relu2 = nn.Sequential(nn.Conv2D(inter_channels, inter_channels, 3, padding=1, bias_attr=False), + nn.BatchNorm2D(inter_channels), + nn.ReLU()) + + self.sc = CAM_Module(inter_channels) + + self.conv_bn_relu3 = nn.Sequential(nn.Conv2D(inter_channels, inter_channels, 3, padding=1, bias_attr=False), + nn.BatchNorm2D(inter_channels), + nn.ReLU()) + self.conv_out = nn.Sequential(nn.Dropout2D(0.1), nn.Conv2D( + inter_channels, out_channels, 1, bias_attr=True)) + + def forward(self, x): + x = self.conv_bn_relu1(x) + x = self.conv_bn_relu2(x) + x = self.sc(x) + x = self.conv_bn_relu3(x) + output = self.conv_out(x) + + return output + + +@BACKBONES.register() +class YOWO(nn.Layer): + def __init__(self, num_class, pretrained_2d=None, pretrained_3d=None): + super(YOWO, self).__init__() + + self.pretrained_2d = pretrained_2d + self.pretrained_3d = pretrained_3d + self.backbone_2d = Darknet() + self.backbone_3d = ResNext101() + self.num_ch_2d = 425 + self.num_ch_3d = 2048 + self.num_class = num_class + self.cfam = CFAMBlock(self.num_ch_2d + self.num_ch_3d, 1024) + self.conv_final = nn.Conv2D( + 1024, 5 * (self.num_class + 4 + 1), kernel_size=1, bias_attr=False) + self.seen = 0 + + def init_weights(self): + if self.pretrained_2d is not None: + self.backbone_2d = self.load_pretrain_weight( + self.backbone_2d, self.pretrained_2d) + if self.pretrained_3d is not None: + self.backbone_3d = self.load_pretrain_weight( + self.backbone_3d, self.pretrained_3d) + + def load_pretrain_weight(self, model, weights_path): + model_dict = model.state_dict() + + param_state_dict = paddle.load(weights_path) + ignore_weights = set() + + # hack: fit for faster rcnn. Pretrain weights contain prefix of 'backbone' + # while res5 module is located in bbox_head.head. Replace the prefix of + # res5 with 'bbox_head.head' to load pretrain weights correctly. 
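        # Editor's note (illustrative, not part of the patch): the rename below turns a key
        # such as 'backbone.res5.0.conv1.weight' into 'bbox_head.head.res5.0.conv1.weight'
        # so detector-style pretrain weights line up with this model's state dict; keys whose
        # rewritten name is absent from model_dict are simply left untouched.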
+ for k in list(param_state_dict.keys()): + if 'backbone.res5' in k: + new_k = k.replace('backbone', 'bbox_head.head') + if new_k in model_dict.keys(): + value = param_state_dict.pop(k) + param_state_dict[new_k] = value + + for name, weight in param_state_dict.items(): + if name in model_dict.keys(): + if list(weight.shape) != list(model_dict[name].shape): + print( + '{} not used, shape {} unmatched with {} in model.'.format( + name, weight.shape, list(model_dict[name].shape))) + ignore_weights.add(name) + else: + print('Redundant weight {} and ignore it.'.format(name)) + ignore_weights.add(name) + + for weight in ignore_weights: + param_state_dict.pop(weight, None) + + model.set_dict(param_state_dict) + print('Finish loading model weights: {}'.format(weights_path)) + return model + + def forward(self, input): + x_3d = input # Input clip + x_2d = input[:, :, -1, :, :] # Last frame of the clip that is read + + x_2d = self.backbone_2d(x_2d) + + x_3d = self.backbone_3d(x_3d) + + x_3d = paddle.squeeze(x_3d, axis=2) + + x = paddle.concat([x_3d, x_2d], axis=1) + x = self.cfam(x) + out = self.conv_final(x) + + return out diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/bbox_utils.py b/Bank_second_part/detect_process/paddlevideo/modeling/bbox_utils.py new file mode 100644 index 0000000..23b4555 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/bbox_utils.py @@ -0,0 +1,528 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
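Editor's aside: the first two helpers defined below, bbox2delta and delta2bbox, are inverse transforms. The former encodes a target box as (dx, dy, dw, dh) offsets relative to a source box; the latter decodes such offsets back into absolute (x1, y1, x2, y2) coordinates. A minimal sanity-check sketch, assuming the package root is on the import path (module path taken from this file's location, illustrative only):

    import paddle
    from paddlevideo.modeling.bbox_utils import bbox2delta, delta2bbox

    src = paddle.to_tensor([[10., 10., 50., 90.]])   # source box (x1, y1, x2, y2)
    tgt = paddle.to_tensor([[12., 8., 60., 100.]])   # target box
    weights = [1.0, 1.0, 1.0, 1.0]

    deltas = bbox2delta(src, tgt, weights)           # shape [1, 4]: (dx, dy, dw, dh)
    recovered = delta2bbox(deltas, src, weights)     # shape [1, 1, 4], equals tgt up to fp error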
+ +import math +import paddle +import paddle.nn.functional as F +import math +import numpy as np + + +def bbox2delta(src_boxes, tgt_boxes, weights): + src_w = src_boxes[:, 2] - src_boxes[:, 0] + src_h = src_boxes[:, 3] - src_boxes[:, 1] + src_ctr_x = src_boxes[:, 0] + 0.5 * src_w + src_ctr_y = src_boxes[:, 1] + 0.5 * src_h + + tgt_w = tgt_boxes[:, 2] - tgt_boxes[:, 0] + tgt_h = tgt_boxes[:, 3] - tgt_boxes[:, 1] + tgt_ctr_x = tgt_boxes[:, 0] + 0.5 * tgt_w + tgt_ctr_y = tgt_boxes[:, 1] + 0.5 * tgt_h + + wx, wy, ww, wh = weights + dx = wx * (tgt_ctr_x - src_ctr_x) / src_w + dy = wy * (tgt_ctr_y - src_ctr_y) / src_h + dw = ww * paddle.log(tgt_w / src_w) + dh = wh * paddle.log(tgt_h / src_h) + + deltas = paddle.stack((dx, dy, dw, dh), axis=1) + return deltas + + +def delta2bbox(deltas, boxes, weights): + clip_scale = math.log(1000.0 / 16) + + widths = boxes[:, 2] - boxes[:, 0] + heights = boxes[:, 3] - boxes[:, 1] + ctr_x = boxes[:, 0] + 0.5 * widths + ctr_y = boxes[:, 1] + 0.5 * heights + + wx, wy, ww, wh = weights + dx = deltas[:, 0::4] / wx + dy = deltas[:, 1::4] / wy + dw = deltas[:, 2::4] / ww + dh = deltas[:, 3::4] / wh + # Prevent sending too large values into paddle.exp() + dw = paddle.clip(dw, max=clip_scale) + dh = paddle.clip(dh, max=clip_scale) + + pred_ctr_x = dx * widths.unsqueeze(1) + ctr_x.unsqueeze(1) + pred_ctr_y = dy * heights.unsqueeze(1) + ctr_y.unsqueeze(1) + pred_w = paddle.exp(dw) * widths.unsqueeze(1) + pred_h = paddle.exp(dh) * heights.unsqueeze(1) + + pred_boxes = [] + pred_boxes.append(pred_ctr_x - 0.5 * pred_w) + pred_boxes.append(pred_ctr_y - 0.5 * pred_h) + pred_boxes.append(pred_ctr_x + 0.5 * pred_w) + pred_boxes.append(pred_ctr_y + 0.5 * pred_h) + pred_boxes = paddle.stack(pred_boxes, axis=-1) + + return pred_boxes + + +def expand_bbox(bboxes, scale): + w_half = (bboxes[:, 2] - bboxes[:, 0]) * .5 + h_half = (bboxes[:, 3] - bboxes[:, 1]) * .5 + x_c = (bboxes[:, 2] + bboxes[:, 0]) * .5 + y_c = (bboxes[:, 3] + bboxes[:, 1]) * .5 + + w_half *= scale + h_half *= scale + + bboxes_exp = np.zeros(bboxes.shape, dtype=np.float32) + bboxes_exp[:, 0] = x_c - w_half + bboxes_exp[:, 2] = x_c + w_half + bboxes_exp[:, 1] = y_c - h_half + bboxes_exp[:, 3] = y_c + h_half + + return bboxes_exp + + +def clip_bbox(boxes, im_shape): + h, w = im_shape[0], im_shape[1] + x1 = boxes[:, 0].clip(0, w) + y1 = boxes[:, 1].clip(0, h) + x2 = boxes[:, 2].clip(0, w) + y2 = boxes[:, 3].clip(0, h) + return paddle.stack([x1, y1, x2, y2], axis=1) + + +def nonempty_bbox(boxes, min_size=0, return_mask=False): + w = boxes[:, 2] - boxes[:, 0] + h = boxes[:, 3] - boxes[:, 1] + mask = paddle.logical_and(w > min_size, w > min_size) + if return_mask: + return mask + keep = paddle.nonzero(mask).flatten() + return keep + + +def bbox_area(boxes): + return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) + + +def bbox_overlaps(boxes1, boxes2): + """ + Calculate overlaps between boxes1 and boxes2 + + Args: + boxes1 (Tensor): boxes with shape [M, 4] + boxes2 (Tensor): boxes with shape [N, 4] + + Return: + overlaps (Tensor): overlaps between boxes1 and boxes2 with shape [M, N] + """ + area1 = bbox_area(boxes1) + area2 = bbox_area(boxes2) + + xy_max = paddle.minimum( + paddle.unsqueeze(boxes1, 1)[:, :, 2:], boxes2[:, 2:]) + xy_min = paddle.maximum( + paddle.unsqueeze(boxes1, 1)[:, :, :2], boxes2[:, :2]) + width_height = xy_max - xy_min + width_height = width_height.clip(min=0) + inter = width_height.prod(axis=2) + + overlaps = paddle.where(inter > 0, inter / + (paddle.unsqueeze(area1, 1) + area2 - inter), 
+ paddle.zeros_like(inter)) + return overlaps + + +def xywh2xyxy(box): + x, y, w, h = box + x1 = x - w * 0.5 + y1 = y - h * 0.5 + x2 = x + w * 0.5 + y2 = y + h * 0.5 + return [x1, y1, x2, y2] + + +def make_grid(h, w, dtype): + yv, xv = paddle.meshgrid([paddle.arange(h), paddle.arange(w)]) + return paddle.stack((xv, yv), 2).cast(dtype=dtype) + + +def decode_yolo(box, anchor, downsample_ratio): + """decode yolo box + + Args: + box (list): [x, y, w, h], all have the shape [b, na, h, w, 1] + anchor (list): anchor with the shape [na, 2] + downsample_ratio (int): downsample ratio, default 32 + scale (float): scale, default 1. + + Return: + box (list): decoded box, [x, y, w, h], all have the shape [b, na, h, w, 1] + """ + x, y, w, h = box + na, grid_h, grid_w = x.shape[1:4] + grid = make_grid(grid_h, grid_w, x.dtype).reshape((1, 1, grid_h, grid_w, 2)) + x1 = (x + grid[:, :, :, :, 0:1]) / grid_w + y1 = (y + grid[:, :, :, :, 1:2]) / grid_h + + anchor = paddle.to_tensor(anchor) + anchor = paddle.cast(anchor, x.dtype) + anchor = anchor.reshape((1, na, 1, 1, 2)) + w1 = paddle.exp(w) * anchor[:, :, :, :, 0:1] / (downsample_ratio * grid_w) + h1 = paddle.exp(h) * anchor[:, :, :, :, 1:2] / (downsample_ratio * grid_h) + + return [x1, y1, w1, h1] + + +def iou_similarity(box1, box2, eps=1e-9): + """Calculate iou of box1 and box2 + + Args: + box1 (Tensor): box with the shape [N, M1, 4] + box2 (Tensor): box with the shape [N, M2, 4] + + Return: + iou (Tensor): iou between box1 and box2 with the shape [N, M1, M2] + """ + box1 = box1.unsqueeze(2) # [N, M1, 4] -> [N, M1, 1, 4] + box2 = box2.unsqueeze(1) # [N, M2, 4] -> [N, 1, M2, 4] + px1y1, px2y2 = box1[:, :, :, 0:2], box1[:, :, :, 2:4] + gx1y1, gx2y2 = box2[:, :, :, 0:2], box2[:, :, :, 2:4] + x1y1 = paddle.maximum(px1y1, gx1y1) + x2y2 = paddle.minimum(px2y2, gx2y2) + overlap = (x2y2 - x1y1).clip(0).prod(-1) + area1 = (px2y2 - px1y1).clip(0).prod(-1) + area2 = (gx2y2 - gx1y1).clip(0).prod(-1) + union = area1 + area2 - overlap + eps + return overlap / union + + +def bbox_iou(box1, box2, giou=False, diou=False, ciou=False, eps=1e-9): + """calculate the iou of box1 and box2 + + Args: + box1 (list): [x, y, w, h], all have the shape [b, na, h, w, 1] + box2 (list): [x, y, w, h], all have the shape [b, na, h, w, 1] + giou (bool): whether use giou or not, default False + diou (bool): whether use diou or not, default False + ciou (bool): whether use ciou or not, default False + eps (float): epsilon to avoid divide by zero + + Return: + iou (Tensor): iou of box1 and box1, with the shape [b, na, h, w, 1] + """ + px1, py1, px2, py2 = box1 + gx1, gy1, gx2, gy2 = box2 + x1 = paddle.maximum(px1, gx1) + y1 = paddle.maximum(py1, gy1) + x2 = paddle.minimum(px2, gx2) + y2 = paddle.minimum(py2, gy2) + + overlap = ((x2 - x1).clip(0)) * ((y2 - y1).clip(0)) + + area1 = (px2 - px1) * (py2 - py1) + area1 = area1.clip(0) + + area2 = (gx2 - gx1) * (gy2 - gy1) + area2 = area2.clip(0) + + union = area1 + area2 - overlap + eps + iou = overlap / union + + if giou or ciou or diou: + # convex w, h + cw = paddle.maximum(px2, gx2) - paddle.minimum(px1, gx1) + ch = paddle.maximum(py2, gy2) - paddle.minimum(py1, gy1) + if giou: + c_area = cw * ch + eps + return iou - (c_area - union) / c_area + else: + # convex diagonal squared + c2 = cw**2 + ch**2 + eps + # center distance + rho2 = ((px1 + px2 - gx1 - gx2)**2 + (py1 + py2 - gy1 - gy2)**2) / 4 + if diou: + return iou - rho2 / c2 + else: + w1, h1 = px2 - px1, py2 - py1 + eps + w2, h2 = gx2 - gx1, gy2 - gy1 + eps + delta = paddle.atan(w1 / h1) - 
paddle.atan(w2 / h2) + v = (4 / math.pi**2) * paddle.pow(delta, 2) + alpha = v / (1 + eps - iou + v) + alpha.stop_gradient = True + return iou - (rho2 / c2 + v * alpha) + else: + return iou + + +def rect2rbox(bboxes): + """ + :param bboxes: shape (n, 4) (xmin, ymin, xmax, ymax) + :return: dbboxes: shape (n, 5) (x_ctr, y_ctr, w, h, angle) + """ + bboxes = bboxes.reshape(-1, 4) + num_boxes = bboxes.shape[0] + + x_ctr = (bboxes[:, 2] + bboxes[:, 0]) / 2.0 + y_ctr = (bboxes[:, 3] + bboxes[:, 1]) / 2.0 + edges1 = np.abs(bboxes[:, 2] - bboxes[:, 0]) + edges2 = np.abs(bboxes[:, 3] - bboxes[:, 1]) + angles = np.zeros([num_boxes], dtype=bboxes.dtype) + + inds = edges1 < edges2 + + rboxes = np.stack((x_ctr, y_ctr, edges1, edges2, angles), axis=1) + rboxes[inds, 2] = edges2[inds] + rboxes[inds, 3] = edges1[inds] + rboxes[inds, 4] = np.pi / 2.0 + return rboxes + + +def delta2rbox(Rrois, + deltas, + means=[0, 0, 0, 0, 0], + stds=[1, 1, 1, 1, 1], + wh_ratio_clip=1e-6): + """ + :param Rrois: (cx, cy, w, h, theta) + :param deltas: (dx, dy, dw, dh, dtheta) + :param means: + :param stds: + :param wh_ratio_clip: + :return: + """ + means = paddle.to_tensor(means) + stds = paddle.to_tensor(stds) + deltas = paddle.reshape(deltas, [-1, deltas.shape[-1]]) + denorm_deltas = deltas * stds + means + + dx = denorm_deltas[:, 0] + dy = denorm_deltas[:, 1] + dw = denorm_deltas[:, 2] + dh = denorm_deltas[:, 3] + dangle = denorm_deltas[:, 4] + + max_ratio = np.abs(np.log(wh_ratio_clip)) + dw = paddle.clip(dw, min=-max_ratio, max=max_ratio) + dh = paddle.clip(dh, min=-max_ratio, max=max_ratio) + + Rroi_x = Rrois[:, 0] + Rroi_y = Rrois[:, 1] + Rroi_w = Rrois[:, 2] + Rroi_h = Rrois[:, 3] + Rroi_angle = Rrois[:, 4] + + gx = dx * Rroi_w * paddle.cos(Rroi_angle) - dy * Rroi_h * paddle.sin( + Rroi_angle) + Rroi_x + gy = dx * Rroi_w * paddle.sin(Rroi_angle) + dy * Rroi_h * paddle.cos( + Rroi_angle) + Rroi_y + gw = Rroi_w * dw.exp() + gh = Rroi_h * dh.exp() + ga = np.pi * dangle + Rroi_angle + ga = (ga + np.pi / 4) % np.pi - np.pi / 4 + ga = paddle.to_tensor(ga) + + gw = paddle.to_tensor(gw, dtype='float32') + gh = paddle.to_tensor(gh, dtype='float32') + bboxes = paddle.stack([gx, gy, gw, gh, ga], axis=-1) + return bboxes + + +def rbox2delta(proposals, gt, means=[0, 0, 0, 0, 0], stds=[1, 1, 1, 1, 1]): + """ + + Args: + proposals: + gt: + means: 1x5 + stds: 1x5 + + Returns: + + """ + proposals = proposals.astype(np.float64) + + PI = np.pi + + gt_widths = gt[..., 2] + gt_heights = gt[..., 3] + gt_angle = gt[..., 4] + + proposals_widths = proposals[..., 2] + proposals_heights = proposals[..., 3] + proposals_angle = proposals[..., 4] + + coord = gt[..., 0:2] - proposals[..., 0:2] + dx = (np.cos(proposals[..., 4]) * coord[..., 0] + np.sin(proposals[..., 4]) + * coord[..., 1]) / proposals_widths + dy = (-np.sin(proposals[..., 4]) * coord[..., 0] + np.cos(proposals[..., 4]) + * coord[..., 1]) / proposals_heights + dw = np.log(gt_widths / proposals_widths) + dh = np.log(gt_heights / proposals_heights) + da = (gt_angle - proposals_angle) + + da = (da + PI / 4) % PI - PI / 4 + da /= PI + + deltas = np.stack([dx, dy, dw, dh, da], axis=-1) + means = np.array(means, dtype=deltas.dtype) + stds = np.array(stds, dtype=deltas.dtype) + deltas = (deltas - means) / stds + deltas = deltas.astype(np.float32) + return deltas + + +def bbox_decode(bbox_preds, + anchors, + means=[0, 0, 0, 0, 0], + stds=[1, 1, 1, 1, 1]): + """decode bbox from deltas + Args: + bbox_preds: [N,H,W,5] + anchors: [H*W,5] + return: + bboxes: [N,H,W,5] + """ + means = 
paddle.to_tensor(means) + stds = paddle.to_tensor(stds) + num_imgs, H, W, _ = bbox_preds.shape + bboxes_list = [] + for img_id in range(num_imgs): + bbox_pred = bbox_preds[img_id] + # bbox_pred.shape=[5,H,W] + bbox_delta = bbox_pred + anchors = paddle.to_tensor(anchors) + bboxes = delta2rbox( + anchors, bbox_delta, means, stds, wh_ratio_clip=1e-6) + bboxes = paddle.reshape(bboxes, [H, W, 5]) + bboxes_list.append(bboxes) + return paddle.stack(bboxes_list, axis=0) + + +def poly_to_rbox(polys): + """ + poly:[x0,y0,x1,y1,x2,y2,x3,y3] + to + rotated_boxes:[x_ctr,y_ctr,w,h,angle] + """ + rotated_boxes = [] + for poly in polys: + poly = np.array(poly[:8], dtype=np.float32) + + pt1 = (poly[0], poly[1]) + pt2 = (poly[2], poly[3]) + pt3 = (poly[4], poly[5]) + pt4 = (poly[6], poly[7]) + + edge1 = np.sqrt((pt1[0] - pt2[0]) * (pt1[0] - pt2[0]) + (pt1[1] - pt2[ + 1]) * (pt1[1] - pt2[1])) + edge2 = np.sqrt((pt2[0] - pt3[0]) * (pt2[0] - pt3[0]) + (pt2[1] - pt3[ + 1]) * (pt2[1] - pt3[1])) + + width = max(edge1, edge2) + height = min(edge1, edge2) + + rbox_angle = 0 + if edge1 > edge2: + rbox_angle = np.arctan2( + np.float(pt2[1] - pt1[1]), np.float(pt2[0] - pt1[0])) + elif edge2 >= edge1: + rbox_angle = np.arctan2( + np.float(pt4[1] - pt1[1]), np.float(pt4[0] - pt1[0])) + + def norm_angle(angle, range=[-np.pi / 4, np.pi]): + return (angle - range[0]) % range[1] + range[0] + + rbox_angle = norm_angle(rbox_angle) + + x_ctr = np.float(pt1[0] + pt3[0]) / 2 + y_ctr = np.float(pt1[1] + pt3[1]) / 2 + rotated_box = np.array([x_ctr, y_ctr, width, height, rbox_angle]) + rotated_boxes.append(rotated_box) + ret_rotated_boxes = np.array(rotated_boxes) + assert ret_rotated_boxes.shape[1] == 5 + return ret_rotated_boxes + + +def cal_line_length(point1, point2): + import math + return math.sqrt( + math.pow(point1[0] - point2[0], 2) + math.pow(point1[1] - point2[1], 2)) + + +def get_best_begin_point_single(coordinate): + x1, y1, x2, y2, x3, y3, x4, y4 = coordinate + xmin = min(x1, x2, x3, x4) + ymin = min(y1, y2, y3, y4) + xmax = max(x1, x2, x3, x4) + ymax = max(y1, y2, y3, y4) + combinate = [[[x1, y1], [x2, y2], [x3, y3], [x4, y4]], + [[x4, y4], [x1, y1], [x2, y2], [x3, y3]], + [[x3, y3], [x4, y4], [x1, y1], [x2, y2]], + [[x2, y2], [x3, y3], [x4, y4], [x1, y1]]] + dst_coordinate = [[xmin, ymin], [xmax, ymin], [xmax, ymax], [xmin, ymax]] + force = 100000000.0 + force_flag = 0 + for i in range(4): + temp_force = cal_line_length(combinate[i][0], dst_coordinate[0]) \ + + cal_line_length(combinate[i][1], dst_coordinate[1]) \ + + cal_line_length(combinate[i][2], dst_coordinate[2]) \ + + cal_line_length(combinate[i][3], dst_coordinate[3]) + if temp_force < force: + force = temp_force + force_flag = i + if force_flag != 0: + pass + return np.array(combinate[force_flag]).reshape(8) + + +def rbox2poly_single(rrect): + """ + rrect:[x_ctr,y_ctr,w,h,angle] + to + poly:[x0,y0,x1,y1,x2,y2,x3,y3] + """ + x_ctr, y_ctr, width, height, angle = rrect[:5] + tl_x, tl_y, br_x, br_y = -width / 2, -height / 2, width / 2, height / 2 + # rect 2x4 + rect = np.array([[tl_x, br_x, br_x, tl_x], [tl_y, tl_y, br_y, br_y]]) + R = np.array([[np.cos(angle), -np.sin(angle)], + [np.sin(angle), np.cos(angle)]]) + # poly + poly = R.dot(rect) + x0, x1, x2, x3 = poly[0, :4] + x_ctr + y0, y1, y2, y3 = poly[1, :4] + y_ctr + poly = np.array([x0, y0, x1, y1, x2, y2, x3, y3], dtype=np.float32) + poly = get_best_begin_point_single(poly) + return poly + + +def rbox2poly(rrects): + """ + rrect:[x_ctr,y_ctr,w,h,angle] + to + poly:[x0,y0,x1,y1,x2,y2,x3,y3] + """ + polys = 
[] + for rrect in rrects: + x_ctr, y_ctr, width, height, angle = rrect[:5] + tl_x, tl_y, br_x, br_y = -width / 2, -height / 2, width / 2, height / 2 + rect = np.array([[tl_x, br_x, br_x, tl_x], [tl_y, tl_y, br_y, br_y]]) + R = np.array([[np.cos(angle), -np.sin(angle)], + [np.sin(angle), np.cos(angle)]]) + poly = R.dot(rect) + x0, x1, x2, x3 = poly[0, :4] + x_ctr + y0, y1, y2, y3 = poly[1, :4] + y_ctr + poly = np.array([x0, y0, x1, y1, x2, y2, x3, y3], dtype=np.float32) + poly = get_best_begin_point_single(poly) + polys.append(poly) + polys = np.array(polys) + return polys diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/builder.py b/Bank_second_part/detect_process/paddlevideo/modeling/builder.py new file mode 100644 index 0000000..71503eb --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/builder.py @@ -0,0 +1,127 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .registry import BACKBONES, HEADS, LOSSES, RECOGNIZERS, LOCALIZERS, ROI_EXTRACTORS, DETECTORS, BBOX_ASSIGNERS, BBOX_SAMPLERS, BBOX_CODERS, PARTITIONERS, MULTIMODAL, SEGMENT, SEGMENTERS +from ..utils import build +from .registry import (BACKBONES, BBOX_ASSIGNERS, BBOX_CODERS, BBOX_SAMPLERS, + DETECTORS, ESTIMATORS, HEADS, LOCALIZERS, LOSSES, + MULTIMODAL, PARTITIONERS, RECOGNIZERS, ROI_EXTRACTORS) + + +def build_backbone(cfg): + """Build backbone.""" + return build(cfg, BACKBONES) + + +def build_roi_extractor(cfg): + """Build roi extractor.""" + return build(cfg, ROI_EXTRACTORS) + + +def build_assigner(cfg, **default_args): + """Builder of box assigner.""" + return build(cfg, BBOX_ASSIGNERS) + + +def build_sampler(cfg, **default_args): + """Builder of box sampler.""" + return build(cfg, BBOX_SAMPLERS) + + +def build_roi_extractor(cfg): + """Build roi extractor.""" + return build(cfg, ROI_EXTRACTORS) + + +def build_assigner(cfg, **default_args): + """Builder of box assigner.""" + return build(cfg, BBOX_ASSIGNERS) + + +def build_sampler(cfg, **default_args): + """Builder of box sampler.""" + return build(cfg, BBOX_SAMPLERS) + + +def build_head(cfg): + """Build head.""" + return build(cfg, HEADS) + + +def build_loss(cfg): + """Build loss.""" + return build(cfg, LOSSES) + + +def build_recognizer(cfg): + """Build recognizer.""" + return build(cfg, RECOGNIZERS, key='framework') + + +def build_segmenter(cfg): + """Build segmenter.""" + return build(cfg, SEGMENTERS, key='framework') + + +def build_localizer(cfg): + """Build localizer.""" + return build(cfg, LOCALIZERS, key='framework') + + +def build_detector(cfg, train_cfg=None, test_cfg=None): + """Build detector.""" + return build(cfg, DETECTORS, key='framework') + + +def build_partitioner(cfg): + """Build partitioner.""" + return build(cfg, PARTITIONERS, key='framework') + + +def build_estimator(cfg): + """Build estimator.""" + return build(cfg, ESTIMATORS, key='framework') + + +def build_multimodal(cfg): + """Build multimodal.""" + return build(cfg, MULTIMODAL, key='framework') + 
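Editor's aside: each builder here is a thin wrapper over the shared build utility, differing only in which registry it consults and, for framework-level objects, in keying on 'framework' rather than 'name'. build_model (defined a few lines below) then dispatches on that same key. A hedged usage sketch with a hypothetical config dict; in practice the dict comes from the parsed YAML config's MODEL section, and the exact keys depend on the model zoo entry:

    # Hypothetical config, for illustration only.
    cfg = {
        'framework': 'Recognizer2D',
        'backbone': {'name': 'ResNetTweaksTSM', 'depth': 50},
        'head': {'name': 'ppTSMHead', 'num_classes': 400, 'in_channels': 2048},
    }
    model = build_model(cfg)   # 'Recognizer2D' is registered in RECOGNIZERS,
                               # so this routes through build_recognizer(cfg)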
+ +def build_segment(cfg): + """Build segment.""" + return build(cfg, SEGMENT, key='framework') + + +def build_model(cfg): + cfg_copy = cfg.copy() + framework_type = cfg_copy.get('framework') + if framework_type in RECOGNIZERS: + return build_recognizer(cfg) + elif framework_type in LOCALIZERS: + return build_localizer(cfg) + elif framework_type in PARTITIONERS: + return build_partitioner(cfg) + elif framework_type in DETECTORS: + return build_detector(cfg) + elif framework_type in ESTIMATORS: + return build_estimator(cfg) + elif framework_type in MULTIMODAL: + return build_multimodal(cfg) + elif framework_type in SEGMENTERS: + return build_segmenter(cfg) + elif framework_type in SEGMENT: + return build_segment(cfg) + else: + raise NotImplementedError diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/__init__.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/__init__.py new file mode 100644 index 0000000..d68fe09 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/__init__.py @@ -0,0 +1,28 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .estimators import BaseEstimator, DepthEstimator +from .localizers import BaseLocalizer, BMNLocalizer +from .partitioners import BasePartitioner, TransNetV2Partitioner +from .recognizers import BaseRecognizer, Recognizer2D +from .multimodal import ActBert, BaseMultimodal +from .segment import BaseSegment, CFBI +from .segmenters import MSTCN + +__all__ = [ + 'BaseRecognizer', 'Recognizer2D', 'BaseLocalizer', 'BMNLocalizer', + 'BasePartitioner', 'TransNetV2Partitioner', 'BaseEstimator', + 'DepthEstimator', 'BaseMultimodal', 'ActBert', 'BaseSegment', 'CFBI', + 'MSTCN' +] diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/__pycache__/__init__.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..4d7bf2b Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/__pycache__/__init__.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/detectors/__init__.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/detectors/__init__.py new file mode 100644 index 0000000..74dcac0 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/detectors/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
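Editor's aside: the detector classes imported below follow the same calling convention as the rest of the framework: a single forward(data_batch, mode=...) that routes to train_step, val_step, test_step, or infer_step (see BaseDetector.forward further down). A hedged usage sketch, assuming a config whose 'framework' is 'FastRCNN' and a data_batch prepared by the matching dataset pipeline:

    detector = build_detector(cfg)                  # cfg['framework'] == 'FastRCNN'
    losses = detector(data_batch, mode='train')     # dict of losses from the RoI head
    preds = detector(data_batch, mode='infer')      # detections via roi_head.simple_test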
+ +from .base import BaseDetector +from .fast_rcnn import FastRCNN +from .two_stage import TwoStageDetector + +__all__ = ['BaseDetector', 'TwoStageDetector', 'FastRCNN'] diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/detectors/__pycache__/__init__.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/detectors/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..bdf6421 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/detectors/__pycache__/__init__.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/detectors/__pycache__/base.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/detectors/__pycache__/base.cpython-310.pyc new file mode 100644 index 0000000..b61c7f1 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/detectors/__pycache__/base.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/detectors/__pycache__/fast_rcnn.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/detectors/__pycache__/fast_rcnn.cpython-310.pyc new file mode 100644 index 0000000..e7704f4 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/detectors/__pycache__/fast_rcnn.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/detectors/__pycache__/two_stage.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/detectors/__pycache__/two_stage.cpython-310.pyc new file mode 100644 index 0000000..176db47 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/detectors/__pycache__/two_stage.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/detectors/base.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/detectors/base.py new file mode 100644 index 0000000..4d5ccb8 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/detectors/base.py @@ -0,0 +1,51 @@ +from abc import abstractmethod +from ... import builder +import paddle.nn as nn +from ...registry import DETECTORS + +@DETECTORS.register() +class BaseDetector(nn.Layer): + """Base class for detectors. """ + def __init__(self, backbone=None, head=None): + + super().__init__() + + def init_weights(self): + """Initialize the model network weights. """ + self.backbone.init_weights() + self.head.init_weights() + + def extract_feature(self, imgs, iter_num): + """Extract features through a backbone. """ + feature = self.backbone(imgs) + return feature + + def forward(self, data_batch, mode='infer'): + if mode == 'train': + return self.train_step(data_batch) + elif mode == 'valid': + return self.val_step(data_batch) + elif mode == 'test': + return self.test_step(data_batch) + elif mode == 'infer': + return self.infer_step(data_batch) + else: + raise NotImplementedError + + @abstractmethod + def train_step(self, data_batch, **kwargs): + """Training step. + """ + raise NotImplementedError + + @abstractmethod + def val_step(self, data_batch, **kwargs): + """Validating step. + """ + raise NotImplementedError + + @abstractmethod + def test_step(self, data_batch, **kwargs): + """Test step. 
+ """ + raise NotImplementedError diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/detectors/fast_rcnn.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/detectors/fast_rcnn.py new file mode 100644 index 0000000..e8f912d --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/detectors/fast_rcnn.py @@ -0,0 +1,34 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .two_stage import TwoStageDetector +from ...registry import DETECTORS + +@DETECTORS.register() +class FastRCNN(TwoStageDetector): + + def __init__(self, + backbone, + head=None, + train_cfg=None, + test_cfg=None, + neck=None, + pretrained=None): + super(FastRCNN, self).__init__( + backbone=backbone, + neck=neck, + roi_head=head, + train_cfg=train_cfg, + test_cfg=test_cfg, + pretrained=pretrained) diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/detectors/two_stage.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/detectors/two_stage.py new file mode 100644 index 0000000..f9deb1d --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/detectors/two_stage.py @@ -0,0 +1,186 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +from ... import builder +import paddle.distributed as dist +from ...registry import DETECTORS +from .base import BaseDetector + + +@DETECTORS.register() +class TwoStageDetector(BaseDetector): + """Base class for two-stage detectors. 
""" + + def __init__(self, + backbone, + neck=None, + rpn_head=None, + roi_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None): + super(TwoStageDetector, self).__init__() + self.backbone = builder.build_backbone(backbone) + + if neck is not None: + self.neck = neck # useless + + if rpn_head is not None: + rpn_train_cfg = train_cfg.rpn if train_cfg is not None else None + rpn_head_ = rpn_head.copy() + rpn_head_.update(train_cfg=rpn_train_cfg, test_cfg=test_cfg.rpn) + self.rpn_head = builder.build_head(rpn_head_) + + if roi_head is not None: + self.roi_head = builder.build_head(roi_head) + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + if pretrained is not None: + self.init_weights(pretrained=pretrained) + + @property + def with_rpn(self): + """whether the detector has RPN""" + return hasattr(self, 'rpn_head') and self.rpn_head is not None + + @property + def with_roi_head(self): + """whether the detector has a RoI head""" + return hasattr(self, 'roi_head') and self.roi_head is not None + + def init_weights(self, pretrained=None): + """Initialize the weights in detector. """ + super(TwoStageDetector, self).init_weights(pretrained) + self.backbone.init_weights(pretrained=pretrained) + if self.with_rpn: + self.rpn_head.init_weights() + if self.with_roi_head: + self.roi_head.init_weights(pretrained) + + def extract_feat(self, img): + """Directly extract features from the backbone.""" + x = self.backbone(img) + return x + + def train_step(self, data, **kwargs): + img_slow = data[0] + img_fast = data[1] + proposals, gt_bboxes, gt_labels, scores, entity_ids = self.get_unpad_datas( + data) + img_shape = data[7] + img_idx = data[8] + img_metas = scores, entity_ids + x = self.extract_feat(img=[img_slow, img_fast]) + roi_losses = self.roi_head.train_step(x, img_metas, proposals, + gt_bboxes, gt_labels, **kwargs) + losses = dict() + losses.update(roi_losses) + + return losses + + def val_step(self, data, rescale=False): + img_slow = data[0] + img_fast = data[1] + proposals, gt_bboxes, gt_labels, scores, entity_ids = self.get_unpad_datas( + data) + img_shape = data[7] + img_metas = scores, entity_ids + x = self.extract_feat(img=[img_slow, img_fast]) + + return self.roi_head.simple_test(x, + proposals[0], + img_shape, + rescale=rescale) + + def test_step(self, data, rescale=False): + return self.val_step(data, rescale) + + def infer_step(self, data, rescale=False): + ''' model inference''' + + img_slow = data[0] + img_fast = data[1] + proposals = data[2] + img_shape = data[3] + + # using slowfast model to extract spatio-temporal features + x = self.extract_feat(img=[img_slow, img_fast]) + + ret = self.roi_head.simple_test(x, + proposals[0], + img_shape, + rescale=rescale) + return ret + + def get_unpad_datas(self, data): + ''' get original datas padded in dataset ''' + pad_proposals = data[2] + pad_gt_bboxes = data[3] + pad_gt_labels = data[4] + pad_scores, pad_entity_ids = data[5], data[6] + len_proposals = data[9] + len_gt_bboxes = data[10] + len_gt_labels = data[11] + len_scores = data[12] + len_entity_ids = data[13] + N = pad_proposals.shape[0] + proposals = [] + gt_bboxes = [] + gt_labels = [] + scores = [] + entity_ids = [] + for bi in range(N): + pad_proposal = pad_proposals[bi] + len_proposal = len_proposals[bi] + index_proposal = paddle.arange(len_proposal) + proposal = paddle.index_select(x=pad_proposal, + index=index_proposal, + axis=0) + proposals.append(proposal) + + pad_gt_bbox = pad_gt_bboxes[bi] + len_gt_bbox = len_gt_bboxes[bi] + index_gt_bbox = 
paddle.arange(len_gt_bbox) + gt_bbox = paddle.index_select(x=pad_gt_bbox, + index=index_gt_bbox, + axis=0) + gt_bboxes.append(gt_bbox) + + pad_gt_label = pad_gt_labels[bi] + len_gt_label = len_gt_labels[bi] + index_gt_label = paddle.arange(len_gt_label) + gt_label = paddle.index_select(x=pad_gt_label, + index=index_gt_label, + axis=0) + gt_labels.append(gt_label) + + pad_score = pad_scores[bi] + len_score = len_scores[bi] + index_score = paddle.arange(len_score) + score = paddle.index_select(x=pad_score, index=index_score, axis=0) + scores.append(score) + + pad_entity_id = pad_entity_ids[bi] + len_entity_id = len_entity_ids[bi] + index_entity_id = paddle.arange(len_entity_id) + entity_id = paddle.index_select(x=pad_entity_id, + index=index_entity_id, + axis=0) + entity_ids.append(entity_id) + + return proposals, gt_bboxes, gt_labels, scores, entity_ids diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/estimators/__init__.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/estimators/__init__.py new file mode 100644 index 0000000..e2bda93 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/estimators/__init__.py @@ -0,0 +1,4 @@ +from .base import BaseEstimator +from .depth_estimator import DepthEstimator + +__all__ = ['DepthEstimator', 'BaseEstimator'] diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/estimators/__pycache__/__init__.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/estimators/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..72d1ff9 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/estimators/__pycache__/__init__.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/estimators/__pycache__/base.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/estimators/__pycache__/base.cpython-310.pyc new file mode 100644 index 0000000..8bee686 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/estimators/__pycache__/base.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/estimators/__pycache__/depth_estimator.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/estimators/__pycache__/depth_estimator.cpython-310.pyc new file mode 100644 index 0000000..7ad977b Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/estimators/__pycache__/depth_estimator.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/estimators/base.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/estimators/base.py new file mode 100644 index 0000000..cdddd67 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/estimators/base.py @@ -0,0 +1,82 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
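# --- Illustrative sketch (not part of this diff) ---
# get_unpad_datas above strips the per-sample padding added by the dataset:
# for each sample it builds an index [0, len) with paddle.arange and gathers
# only the valid rows with paddle.index_select. A hypothetical, self-contained
# version of that trimming step (unpad_batch is an assumed helper name):

import paddle

def unpad_batch(padded, lengths):
    """padded: [N, P, D] tensor, lengths: [N] true row counts -> list of [len_i, D]."""
    out = []
    for i in range(padded.shape[0]):
        idx = paddle.arange(int(lengths[i]))          # indices of the valid rows
        out.append(paddle.index_select(padded[i], index=idx, axis=0))
    return out

# usage: two samples padded to 4 proposals, with 2 and 3 real proposals respectively
padded = paddle.rand([2, 4, 5])
proposals = unpad_batch(padded, paddle.to_tensor([2, 3]))
assert [p.shape[0] for p in proposals] == [2, 3]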
+ +from abc import abstractmethod + +import paddle +import paddle.nn as nn +from paddlevideo.modeling.registry import ESTIMATORS +from paddlevideo.utils import get_logger + +from ... import builder + +logger = get_logger("paddlevideo") + + +@ESTIMATORS.register() +class BaseEstimator(nn.Layer): + """BaseEstimator + + """ + def __init__(self, backbone=None, head=None): + super().__init__() + if backbone is not None: + self.backbone = builder.build_backbone(backbone) + if hasattr(self.backbone, 'init_weights'): + self.backbone.init_weights() + else: + self.backbone = None + + if head is not None: + self.head_name = head.name + self.head = builder.build_head(head) + if hasattr(self.head, 'init_weights'): + self.head.init_weights() + else: + self.head = None + + def forward(self, data_batch, mode='infer'): + """ + 1. Define how the model is going to run, from input to output. + 2. Console of train, valid, test or infer step + """ + if mode == 'train': + return self.train_step(data_batch) + elif mode == 'valid': + return self.val_step(data_batch) + elif mode == 'test': + return self.test_step(data_batch) + elif mode == 'infer': + return self.infer_step(data_batch) + else: + raise NotImplementedError + + @abstractmethod + def train_step(self, data_batch): + """Define how the model is going to train, from input to output. + """ + raise NotImplementedError + + @abstractmethod + def val_step(self, data_batch): + """Define how the model is going to valid, from input to output.""" + raise NotImplementedError + + @abstractmethod + def test_step(self, data_batch): + """Define how the model is going to test, from input to output.""" + raise NotImplementedError + + @abstractmethod + def infer_step(self, data_batch): + """Define how the model is going to infer, from input to output.""" + raise NotImplementedError diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/estimators/depth_estimator.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/estimators/depth_estimator.py new file mode 100644 index 0000000..13ee877 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/estimators/depth_estimator.py @@ -0,0 +1,59 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +import paddle +from paddlevideo.modeling.framework.estimators.base import BaseEstimator +from paddlevideo.modeling.registry import ESTIMATORS +from paddlevideo.utils import get_logger + +from ... import builder + +logger = get_logger("paddlevideo") + + +@ESTIMATORS.register() +class DepthEstimator(BaseEstimator): + """DepthEstimator + """ + def forward_net(self, inputs, day_or_night='day_and_night'): + if self.backbone is not None: + outputs = self.backbone(inputs, day_or_night) + else: + outputs = inputs + return outputs + + def train_step(self, data_batch): + """Define how the model is going to train, from input to output. 
+ """ + inputs, _ = data_batch + outputs = self.forward_net(inputs, day_or_night='day_and_night') + loss_metrics = self.head.loss(inputs, outputs) + return loss_metrics + + def val_step(self, data_batch): + inputs, day_or_night = data_batch + outputs = self.forward_net(inputs, day_or_night=day_or_night) + loss_metrics = self.head.loss(inputs, outputs) + return loss_metrics + + def test_step(self, data_batch): + """Define how the model is going to test, from input to output.""" + inputs, day_or_night = data_batch + outputs = self.forward_net(inputs, day_or_night=day_or_night) + loss_metrics = self.head.loss(inputs, outputs) + return loss_metrics + + def infer_step(self, data_batch): + """Define how the model is going to infer, from input to output.""" + inputs = data_batch[0] + outputs = self.forward_net(inputs, day_or_night='day') + return outputs diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/localizers/__init__.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/localizers/__init__.py new file mode 100644 index 0000000..323a72c --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/localizers/__init__.py @@ -0,0 +1,19 @@ +# copyright (c) 2020 paddlepaddle authors. all rights reserved. +# +# licensed under the apache license, version 2.0 (the "license" +# you may not use this file except in compliance with the license. +# you may obtain a copy of the license at +# +# http://www.apache.org/licenses/license-2.0 +# +# unless required by applicable law or agreed to in writing, software +# distributed under the license is distributed on an "as is" basis, +# without warranties or conditions of any kind, either express or implied. +# see the license for the specific language governing permissions and +# limitations under the license. 
+ +from .base import BaseLocalizer +from .bmn_localizer import BMNLocalizer +from .yowo_localizer import YOWOLocalizer + +__all__ = ['BaseLocalizer', 'BMNLocalizer', 'YOWOLocalizer'] diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/localizers/__pycache__/__init__.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/localizers/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..4e79cd8 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/localizers/__pycache__/__init__.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/localizers/__pycache__/base.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/localizers/__pycache__/base.cpython-310.pyc new file mode 100644 index 0000000..f7e5cef Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/localizers/__pycache__/base.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/localizers/__pycache__/bmn_localizer.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/localizers/__pycache__/bmn_localizer.cpython-310.pyc new file mode 100644 index 0000000..3fdc87d Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/localizers/__pycache__/bmn_localizer.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/localizers/__pycache__/yowo_localizer.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/localizers/__pycache__/yowo_localizer.cpython-310.pyc new file mode 100644 index 0000000..f3c15db Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/localizers/__pycache__/yowo_localizer.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/localizers/__pycache__/yowo_utils.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/localizers/__pycache__/yowo_utils.cpython-310.pyc new file mode 100644 index 0000000..15bd20f Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/localizers/__pycache__/yowo_utils.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/localizers/base.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/localizers/base.py new file mode 100644 index 0000000..cfd2869 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/localizers/base.py @@ -0,0 +1,74 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import abstractmethod +import paddle.nn as nn +from ... import builder + + +class BaseLocalizer(nn.Layer): + """Base class for Localization. + All localizer should subclass it. 
+ All subclass should overwrite: + - Methods:``train_step``, define your train step. + - Methods:``valid_step``, define your valid step, always the same as train_step. + - Methods:``test_step``, define your test step. + """ + def __init__(self, backbone, loss): + super().__init__() + self.backbone = builder.build_backbone(backbone) + self.loss = builder.build_loss(loss) + self.init_weights() + + def init_weights(self): + """Initialize the model network weights. """ + if getattr(self.backbone, 'init_weights'): + self.backbone.init_weights() + else: + pass + + def forward(self, data_batch, mode='infer'): + """ + 1. Define how the model is going to run, from input to output. + 2. Console of train, valid, test or infer step + 3. Set mode='infer' is used for saving inference model, refer to tools/export_model.py + """ + if mode == 'train': + return self.train_step(data_batch) + elif mode == 'valid': + return self.val_step(data_batch) + elif mode == 'test': + return self.test_step(data_batch) + elif mode == 'infer': + return self.infer_step(data_batch) + else: + raise NotImplementedError + + @abstractmethod + def train_step(self, data_batch, **kwargs): + """Training step. input_data_batch -> loss_metric + """ + raise NotImplementedError + + @abstractmethod + def val_step(self, data_batch, **kwargs): + """Validating setp. input_data_batch -> loss_metric + """ + raise NotImplementedError + + @abstractmethod + def test_step(self, data_batch, **kwargs): + """Tets setp. to get acc in test data. input_data_batch -> output + """ + raise NotImplementedError diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/localizers/bmn_localizer.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/localizers/bmn_localizer.py new file mode 100644 index 0000000..5afbd3a --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/localizers/bmn_localizer.py @@ -0,0 +1,69 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +from ...registry import LOCALIZERS +from .base import BaseLocalizer + +import paddle + + +@LOCALIZERS.register() +class BMNLocalizer(BaseLocalizer): + """BMN Localization framework + """ + def forward_net(self, imgs): + """Call backbone forward. + """ + preds = self.backbone(imgs) + return preds + + def train_step(self, data_batch): + """Training step. + """ + x_data = data_batch[0] + gt_iou_map = data_batch[1] + gt_start = data_batch[2] + gt_end = data_batch[3] + gt_iou_map.stop_gradient = True + gt_start.stop_gradient = True + gt_end.stop_gradient = True + + # call Model forward + pred_bm, pred_start, pred_end = self.forward_net(x_data) + # call Loss forward + loss = self.loss(pred_bm, pred_start, pred_end, gt_iou_map, gt_start, + gt_end) + avg_loss = paddle.mean(loss) + loss_metrics = dict() + loss_metrics['loss'] = avg_loss + return loss_metrics + + def val_step(self, data_batch): + """Validating setp. + """ + return self.train_step(data_batch) + + def test_step(self, data_batch): + """Test step. 
+ """ + x_data = data_batch[0] + pred_bm, pred_start, pred_end = self.forward_net(x_data) + return pred_bm, pred_start, pred_end + + def infer_step(self, data_batch): + """Infer step + """ + x_data = data_batch[0] + + # call Model forward + pred_bm, pred_start, pred_end = self.forward_net(x_data) + return pred_bm, pred_start, pred_end diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/localizers/yowo_localizer.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/localizers/yowo_localizer.py new file mode 100644 index 0000000..c3613c6 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/localizers/yowo_localizer.py @@ -0,0 +1,161 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +from ...registry import LOCALIZERS +from .base import BaseLocalizer +from .yowo_utils import truths_length, nms, get_region_boxes, bbox_iou + + +@LOCALIZERS.register() +class YOWOLocalizer(BaseLocalizer): + """YOWO Localization framework + """ + + def forward_net(self, imgs): + """Call backbone forward. + """ + # imgs.shape=[N,C,T,H,W], for YOWO + preds = self.backbone(imgs) + return preds + + def train_step(self, data_batch): + """Training step. + """ + x_data = data_batch[0] + target = data_batch[1].squeeze(1) # indeed do squeeze to adapt to paddle tensor + target.stop_gradient = True + + # call Model forward + out = self.forward_net(x_data) + # call Loss forward + loss, nCorrect = self.loss(out, target) + loss_metrics = dict() + loss_metrics['loss'] = loss + loss_metrics['nCorrect'] = nCorrect + return loss_metrics + + def val_step(self, data_batch): + """Validating setp. 
+ """ + total = 0.0 + proposals = 0.0 + correct = 0.0 + fscore = 0.0 + eps = 1e-5 + nms_thresh = 0.4 + iou_thresh = 0.5 + + x_data = data_batch[0] + target = data_batch[1].squeeze(1) # indeed do squeeze to adapt to paddle tensor + frame_idx = data_batch[2] + target.stop_gradient = True + # call Model forward + out = self.forward_net(x_data) + all_boxes = get_region_boxes(out) + out_boxes = [] + + for i in range(out.shape[0]): + boxes = all_boxes[i] + boxes = nms(boxes, nms_thresh) + out_boxes.append(boxes) + truths = target[i].reshape([-1, 5]) + num_gts = truths_length(truths) + total = total + num_gts + pred_list = [] + for i in range(len(boxes)): + if boxes[i][4] > 0.25: + proposals = proposals + 1 + pred_list.append(i) + for i in range(num_gts): + box_gt = [truths[i][1], truths[i][2], truths[i][3], truths[i][4], 1.0, 1.0, truths[i][0]] + best_iou = 0 + best_j = -1 + for j in pred_list: # ITERATE THROUGH ONLY CONFIDENT BOXES + iou = bbox_iou(box_gt, boxes[j], x1y1x2y2=False) + if iou > best_iou: + best_j = j + best_iou = iou + if best_iou > iou_thresh and int(boxes[best_j][6]) == box_gt[6]: + correct = correct + 1 + + precision = 1.0 * correct / (proposals + eps) + recall = 1.0 * correct / (total + eps) + fscore = 2.0 * precision * recall / (precision + recall + eps) + + outs = dict() + outs['precision'] = precision + outs['recall'] = recall + outs['fscore'] = fscore + outs['frame_idx'] = frame_idx + return outs + + def test_step(self, data_batch): + """Test step. + """ + total = 0.0 + proposals = 0.0 + correct = 0.0 + fscore = 0.0 + eps = 1e-5 + nms_thresh = 0.4 + iou_thresh = 0.5 + + x_data = data_batch[0] + target = data_batch[1].squeeze(1) # indeed do squeeze to adapt to paddle tensor + frame_idx = data_batch[2] + target.stop_gradient = True + # call Model forward + out = self.forward_net(x_data) + all_boxes = get_region_boxes(out) + out_boxes = [] + + for i in range(out.shape[0]): + boxes = all_boxes[i] + boxes = nms(boxes, nms_thresh) + out_boxes.append(boxes) + truths = target[i].reshape([-1, 5]) + num_gts = truths_length(truths) + total = total + num_gts + pred_list = [] + for i in range(len(boxes)): + if boxes[i][4] > 0.25: + proposals = proposals + 1 + pred_list.append(i) + for i in range(num_gts): + box_gt = [truths[i][1], truths[i][2], truths[i][3], truths[i][4], 1.0, 1.0, truths[i][0]] + best_iou = 0 + best_j = -1 + for j in pred_list: # ITERATE THROUGH ONLY CONFIDENT BOXES + iou = bbox_iou(box_gt, boxes[j], x1y1x2y2=False) + if iou > best_iou: + best_j = j + best_iou = iou + if best_iou > iou_thresh and int(boxes[best_j][6]) == box_gt[6]: + correct = correct + 1 + + precision = 1.0 * correct / (proposals + eps) + recall = 1.0 * correct / (total + eps) + fscore = 2.0 * precision * recall / (precision + recall + eps) + + outs = dict() + outs['boxes'] = out_boxes + outs['precision'] = precision + outs['recall'] = recall + outs['fscore'] = fscore + outs['frame_idx'] = frame_idx + return outs + + def infer_step(self, data_batch): + """Infer step. + """ + out = self.forward_net(data_batch[0]) + return out \ No newline at end of file diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/localizers/yowo_utils.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/localizers/yowo_utils.py new file mode 100644 index 0000000..9f0e016 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/localizers/yowo_utils.py @@ -0,0 +1,359 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import paddle +import paddle.nn as nn +import numpy as np +from builtins import range as xrange + + +def truths_length(truths): + for i in range(50): + if truths[i][1] == 0: + return i + + +def nms(boxes, nms_thresh): + if len(boxes) == 0: + return boxes + + det_confs = paddle.zeros([len(boxes)]) + for i in range(len(boxes)): + det_confs[i] = 1 - boxes[i][4] + + sortIds = paddle.argsort(det_confs) + out_boxes = [] + for i in range(len(boxes)): + box_i = boxes[sortIds[i]] + if box_i[4] > 0: + out_boxes.append(box_i) + for j in range(i + 1, len(boxes)): + box_j = boxes[sortIds[j]] + if bbox_iou(box_i, box_j, x1y1x2y2=False) > nms_thresh: + box_j[4] = 0 + return out_boxes + + +def convert2cpu(gpu_matrix): + float_32_g = gpu_matrix.astype('float32') + return float_32_g.cpu() + + +def convert2cpu_long(gpu_matrix): + int_64_g = gpu_matrix.astype('int64') + return int_64_g.cpu() + + +def get_region_boxes(output, conf_thresh=0.005, num_classes=24, + anchors=[0.70458, 1.18803, 1.26654, 2.55121, 1.59382, + 4.08321, 2.30548, 4.94180, 3.52332, 5.91979], + num_anchors=5, only_objectness=1, validation=False): + anchor_step = len(anchors) // num_anchors + if output.dim() == 3: + output = output.unsqueeze(0) + batch = output.shape[0] + assert (output.shape[1] == (5 + num_classes) * num_anchors) + h = output.shape[2] + w = output.shape[3] + all_boxes = [] + output = paddle.reshape( + output, [batch * num_anchors, 5 + num_classes, h * w]) + output = paddle.transpose(output, (1, 0, 2)) + output = paddle.reshape( + output, [5 + num_classes, batch * num_anchors * h * w]) + + grid_x = paddle.linspace(0, w - 1, w) + grid_x = paddle.tile(grid_x, [h, 1]) + grid_x = paddle.tile(grid_x, [batch * num_anchors, 1, 1]) + grid_x = paddle.reshape(grid_x, [batch * num_anchors * h * w]).cuda() + + grid_y = paddle.linspace(0, h - 1, h) + grid_y = paddle.tile(grid_y, [w, 1]).t() + grid_y = paddle.tile(grid_y, [batch * num_anchors, 1, 1]) + grid_y = paddle.reshape(grid_y, [batch * num_anchors * h * w]).cuda() + + sigmoid = nn.Sigmoid() + xs = sigmoid(output[0]) + grid_x + ys = sigmoid(output[1]) + grid_y + + anchor_w = paddle.to_tensor(anchors) + anchor_w = paddle.reshape(anchor_w, [num_anchors, anchor_step]) + anchor_w = paddle.index_select(anchor_w, index=paddle.to_tensor( + np.array([0]).astype('int32')), axis=1) + + anchor_h = paddle.to_tensor(anchors) + anchor_h = paddle.reshape(anchor_h, [num_anchors, anchor_step]) + anchor_h = paddle.index_select(anchor_h, index=paddle.to_tensor( + np.array([1]).astype('int32')), axis=1) + + anchor_w = paddle.tile(anchor_w, [batch, 1]) + anchor_w = paddle.tile(anchor_w, [1, 1, h * w]) + anchor_w = paddle.reshape(anchor_w, [batch * num_anchors * h * w]).cuda() + + anchor_h = paddle.tile(anchor_h, [batch, 1]) + anchor_h = paddle.tile(anchor_h, [1, 1, h * w]) + anchor_h = paddle.reshape(anchor_h, [batch * num_anchors * h * w]).cuda() + + ws = paddle.exp(output[2]) * anchor_w + hs = paddle.exp(output[3]) * anchor_h + + det_confs = 
sigmoid(output[4]) + + cls_confs = paddle.to_tensor(output[5:5 + num_classes], stop_gradient=True) + cls_confs = paddle.transpose(cls_confs, [1, 0]) + s = nn.Softmax() + cls_confs = paddle.to_tensor(s(cls_confs)) + + cls_max_confs = paddle.max(cls_confs, axis=1) + cls_max_ids = paddle.argmax(cls_confs, axis=1) + + cls_max_confs = paddle.reshape(cls_max_confs, [-1]) + cls_max_ids = paddle.reshape(cls_max_ids, [-1]) + + sz_hw = h * w + sz_hwa = sz_hw * num_anchors + + det_confs = convert2cpu(det_confs) + cls_max_confs = convert2cpu(cls_max_confs) + cls_max_ids = convert2cpu_long(cls_max_ids) + xs = convert2cpu(xs) + ys = convert2cpu(ys) + ws = convert2cpu(ws) + hs = convert2cpu(hs) + if validation: + cls_confs = convert2cpu(cls_confs.reshape([-1, num_classes])) + for b in range(batch): + boxes = [] + for cy in range(h): + for cx in range(w): + for i in range(num_anchors): + ind = b * sz_hwa + i * sz_hw + cy * w + cx + det_conf = det_confs[ind] + if only_objectness: + conf = det_confs[ind] + else: + conf = det_confs[ind] * cls_max_confs[ind] + + if conf > conf_thresh: + bcx = xs[ind] + bcy = ys[ind] + bw = ws[ind] + bh = hs[ind] + cls_max_conf = cls_max_confs[ind] + cls_max_id = cls_max_ids[ind] + box = [bcx / w, bcy / h, bw / w, bh / h, + det_conf, cls_max_conf, cls_max_id] + if (not only_objectness) and validation: + for c in range(num_classes): + tmp_conf = cls_confs[ind][c] + if c != cls_max_id and det_confs[ind] * tmp_conf > conf_thresh: + box.append(tmp_conf) + box.append(c) + boxes.append(box) + all_boxes.append(boxes) + return all_boxes + + +def bbox_iou(box1, box2, x1y1x2y2=True): + if x1y1x2y2: + mx = min(box1[0], box2[0]) + Mx = max(box1[2], box2[2]) + my = min(box1[1], box2[1]) + My = max(box1[3], box2[3]) + w1 = box1[2] - box1[0] + h1 = box1[3] - box1[1] + w2 = box2[2] - box2[0] + h2 = box2[3] - box2[1] + else: + mx = min(float(box1[0] - box1[2] / 2.0), + float(box2[0] - box2[2] / 2.0)) + Mx = max(float(box1[0] + box1[2] / 2.0), + float(box2[0] + box2[2] / 2.0)) + my = min(float(box1[1] - box1[3] / 2.0), + float(box2[1] - box2[3] / 2.0)) + My = max(float(box1[1] + box1[3] / 2.0), + float(box2[1] + box2[3] / 2.0)) + w1 = box1[2] + h1 = box1[3] + w2 = box2[2] + h2 = box2[3] + uw = Mx - mx + uh = My - my + cw = w1 + w2 - uw + ch = h1 + h2 - uh + carea = 0 + if cw <= 0 or ch <= 0: + return paddle.to_tensor(0.0) + + area1 = w1 * h1 + area2 = w2 * h2 + carea = cw * ch + uarea = area1 + area2 - carea + return carea / uarea + + +def bbox_ious(boxes1, boxes2, x1y1x2y2=True): + if x1y1x2y2: + mx = paddle.min(boxes1[0], boxes2[0]) + Mx = paddle.max(boxes1[2], boxes2[2]) + my = paddle.min(boxes1[1], boxes2[1]) + My = paddle.max(boxes1[3], boxes2[3]) + w1 = boxes1[2] - boxes1[0] + h1 = boxes1[3] - boxes1[1] + w2 = boxes2[2] - boxes2[0] + h2 = boxes2[3] - boxes2[1] + else: + mx = paddle.min(paddle.stack( + [boxes1[0] - boxes1[2] / 2.0, boxes2[0] - boxes2[2] / 2.0], axis=0), axis=0) + Mx = paddle.max(paddle.stack( + [boxes1[0] + boxes1[2] / 2.0, boxes2[0] + boxes2[2] / 2.0], axis=0), axis=0) + my = paddle.min(paddle.stack( + [boxes1[1] - boxes1[3] / 2.0, boxes2[1] - boxes2[3] / 2.0], axis=0), axis=0) + My = paddle.max(paddle.stack( + [boxes1[1] + boxes1[3] / 2.0, boxes2[1] + boxes2[3] / 2.0], axis=0), axis=0) + w1 = boxes1[2] + h1 = boxes1[3] + w2 = boxes2[2] + h2 = boxes2[3] + uw = Mx - mx + uh = My - my + cw = w1 + w2 - uw + ch = h1 + h2 - uh + mask = paddle.cast(cw <= 0, dtype="int32") + \ + paddle.cast(ch <= 0, dtype="int32") > 0 + area1 = w1 * h1 + area2 = w2 * h2 + carea = cw * ch + 
carea[mask] = 0 + uarea = area1 + area2 - carea + return carea / uarea + + +# this function works for building the groud truth +def build_targets(pred_boxes, target, anchors, num_anchors, num_classes, nH, nW, noobject_scale, object_scale, + sil_thresh): + # nH, nW here are number of grids in y and x directions (7, 7 here) + nB = target.shape[0] # batch size + nA = num_anchors # 5 for our case + nC = num_classes + anchor_step = len(anchors) // num_anchors + conf_mask = paddle.ones([nB, nA, nH, nW]) * noobject_scale + coord_mask = paddle.zeros([nB, nA, nH, nW]) + cls_mask = paddle.zeros([nB, nA, nH, nW]) + tx = paddle.zeros([nB, nA, nH, nW]) + ty = paddle.zeros([nB, nA, nH, nW]) + tw = paddle.zeros([nB, nA, nH, nW]) + th = paddle.zeros([nB, nA, nH, nW]) + tconf = paddle.zeros([nB, nA, nH, nW]) + tcls = paddle.zeros([nB, nA, nH, nW]) + + # for each grid there are nA anchors + # nAnchors is the number of anchor for one image + nAnchors = nA * nH * nW + nPixels = nH * nW + # for each image + for b in xrange(nB): + # get all anchor boxes in one image + # (4 * nAnchors) + cur_pred_boxes = pred_boxes[b * nAnchors:(b + 1) * nAnchors].t() + # initialize iou score for each anchor + cur_ious = paddle.zeros([nAnchors]) + for t in xrange(50): + # for each anchor 4 coordinate parameters, already in the coordinate system for the whole image + # this loop is for anchors in each image + # for each anchor 5 parameters are available (class, x, y, w, h) + if target[b][t * 5 + 1] == 0: + break + gx = target[b][t * 5 + 1] * nW + gy = target[b][t * 5 + 2] * nH + gw = target[b][t * 5 + 3] * nW + gh = target[b][t * 5 + 4] * nH + # groud truth boxes + cur_gt_boxes = paddle.tile(paddle.to_tensor( + [gx, gy, gw, gh], dtype='float32').t(), [nAnchors, 1]).t() + # bbox_ious is the iou value between orediction and groud truth + cur_ious = paddle.max( + paddle.stack([cur_ious, bbox_ious(cur_pred_boxes, cur_gt_boxes, x1y1x2y2=False)], axis=0), axis=0) + # if iou > a given threshold, it is seen as it includes an object + # conf_mask[b][cur_ious>sil_thresh] = 0 + conf_mask_t = paddle.reshape(conf_mask, [nB, -1]) + conf_mask_t[b, cur_ious > sil_thresh] = 0 + conf_mask_tt = paddle.reshape(conf_mask_t[b], [nA, nH, nW]) + conf_mask[b] = conf_mask_tt + + # number of ground truth + nGT = 0 + nCorrect = 0 + for b in xrange(nB): + # anchors for one batch (at least batch size, and for some specific classes, there might exist more than one anchor) + for t in xrange(50): + if target[b][t * 5 + 1] == 0: + break + nGT = nGT + 1 + best_iou = 0.0 + best_n = -1 + min_dist = 10000 + # the values saved in target is ratios + # times by the width and height of the output feature maps nW and nH + gx = target[b][t * 5 + 1] * nW + gy = target[b][t * 5 + 2] * nH + gi = int(gx) + gj = int(gy) + + gw = target[b][t * 5 + 3] * nW + gh = target[b][t * 5 + 4] * nH + gt_box = [0, 0, gw, gh] + for n in xrange(nA): + # get anchor parameters (2 values) + aw = anchors[anchor_step * n] + ah = anchors[anchor_step * n + 1] + anchor_box = [0, 0, aw, ah] + # only consider the size (width and height) of the anchor box + iou = bbox_iou(anchor_box, gt_box, x1y1x2y2=False) + # get the best anchor form with the highest iou + if iou > best_iou: + best_iou = iou + best_n = n + + # then we determine the parameters for an anchor (4 values together) + gt_box = [gx, gy, gw, gh] + # find corresponding prediction box + pred_box = pred_boxes[b * nAnchors + + best_n * nPixels + gj * nW + gi] + + # only consider the best anchor box, for each image + coord_mask[b, best_n, gj, gi] = 
1 + cls_mask[b, best_n, gj, gi] = 1 + + # in this cell of the output feature map, there exists an object + conf_mask[b, best_n, gj, gi] = object_scale + tx[b, best_n, gj, gi] = paddle.cast( + target[b][t * 5 + 1] * nW - gi, dtype='float32') + ty[b, best_n, gj, gi] = paddle.cast( + target[b][t * 5 + 2] * nH - gj, dtype='float32') + tw[b, best_n, gj, gi] = math.log( + gw / anchors[anchor_step * best_n]) + th[b, best_n, gj, gi] = math.log( + gh / anchors[anchor_step * best_n + 1]) + iou = bbox_iou(gt_box, pred_box, x1y1x2y2=False) # best_iou + # confidence equals to iou of the corresponding anchor + tconf[b, best_n, gj, gi] = paddle.cast(iou, dtype='float32') + tcls[b, best_n, gj, gi] = paddle.cast( + target[b][t * 5], dtype='float32') + # if ious larger than 0.5, we justify it as a correct prediction + if iou > 0.5: + nCorrect = nCorrect + 1 + # true values are returned + return nGT, nCorrect, coord_mask, conf_mask, cls_mask, tx, ty, tw, th, tconf, tcls diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/multimodal/__init__.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/multimodal/__init__.py new file mode 100644 index 0000000..e1efec3 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/multimodal/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
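# --- Illustrative sketch (not part of this diff) ---
# bbox_iou above (with x1y1x2y2=False) treats boxes as (cx, cy, w, h): it
# derives the union extent from the centres and sizes, infers the overlap
# width as w1 + w2 - union_width (likewise for height), and returns
# intersection / union. A plain-Python restatement of that computation,
# assuming the same centre-format boxes:

def iou_cxcywh(box1, box2):
    cx1, cy1, w1, h1 = box1
    cx2, cy2, w2, h2 = box2
    # width/height of the smallest box enclosing both
    uw = max(cx1 + w1 / 2, cx2 + w2 / 2) - min(cx1 - w1 / 2, cx2 - w2 / 2)
    uh = max(cy1 + h1 / 2, cy2 + h2 / 2) - min(cy1 - h1 / 2, cy2 - h2 / 2)
    cw, ch = w1 + w2 - uw, h1 + h2 - uh      # overlap width/height
    if cw <= 0 or ch <= 0:
        return 0.0
    inter = cw * ch
    union = w1 * h1 + w2 * h2 - inter
    return inter / union

# usage: two unit boxes whose centres are half a unit apart overlap with IoU = 1/3
print(iou_cxcywh((0.5, 0.5, 1, 1), (1.0, 0.5, 1, 1)))  # 0.333...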
+ +from .base import BaseMultimodal +from .actbert import ActBert + +__all__ = ['BaseMultimodal', 'ActBert'] diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/multimodal/__pycache__/__init__.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/multimodal/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..107d9c8 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/multimodal/__pycache__/__init__.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/multimodal/__pycache__/actbert.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/multimodal/__pycache__/actbert.cpython-310.pyc new file mode 100644 index 0000000..4f09ee5 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/multimodal/__pycache__/actbert.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/multimodal/__pycache__/base.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/multimodal/__pycache__/base.cpython-310.pyc new file mode 100644 index 0000000..a2e4620 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/multimodal/__pycache__/base.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/multimodal/actbert.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/multimodal/actbert.py new file mode 100644 index 0000000..4f2c074 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/multimodal/actbert.py @@ -0,0 +1,64 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +from ...registry import MULTIMODAL +from .base import BaseMultimodal +import paddle +from paddlevideo.utils import get_logger + +logger = get_logger("paddlevideo") + + +@MULTIMODAL.register() +class ActBert(BaseMultimodal): + """ActBert model framework.""" + def forward_net(self, text_ids, action_feat, image_feat, image_loc, + token_type_ids, text_mask, image_mask, action_mask): + pred = self.backbone(text_ids, action_feat, image_feat, image_loc, + token_type_ids, text_mask, image_mask, action_mask) + return pred + + def train_step(self, data_batch): + """For ActBert Dataset. Define how the model is going to train, from input to output. 
+ """ + text_ids, action_feat, image_feat, image_loc, \ + token_type_ids, text_mask, image_mask, action_mask, \ + text_labels, action_label, next_sentence_label, image_label, image_target = data_batch + loss_metrics = dict() + pred = self.backbone(text_ids, action_feat, image_feat, image_loc, + token_type_ids, text_mask, image_mask, action_mask) + prediction_scores_t, prediction_scores_v, prediction_scores_a, seq_relationship_score = pred + total_loss = self.loss(prediction_scores_t, prediction_scores_v, prediction_scores_a, seq_relationship_score, \ + text_labels, image_label, image_target, action_label, next_sentence_label) + loss_metrics['loss'] = paddle.mean(total_loss) + return loss_metrics + + def val_step(self, data_batch): + """For ActBert Dataset. Define how the model is going to val, from input to output. + """ + return self.train_step(data_batch) + + def test_step(self, data_batch): + """For MSR-VTT Dataset. Define how the model is going to test, from input to output.""" + text_ids, action_feat, image_feat, image_loc, token_type_ids, text_mask, image_mask, action_mask = data_batch[: + -1] + action_feat = action_feat.squeeze(0) + image_feat = image_feat.squeeze(0) + image_loc = image_loc.squeeze(0) + image_mask = image_mask.squeeze(0) + action_mask = action_mask.squeeze(0) + prediction_scores_t, prediction_scores_v, prediction_scores_a, seq_relationship_score = self.forward_net(text_ids, \ + action_feat, image_feat, image_loc, token_type_ids, text_mask, image_mask, action_mask) + return prediction_scores_t, prediction_scores_v, prediction_scores_a, seq_relationship_score + + def infer_step(self, data_batch): + pass diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/multimodal/base.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/multimodal/base.py new file mode 100644 index 0000000..bc57f97 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/multimodal/base.py @@ -0,0 +1,81 @@ +from abc import abstractmethod +from ... import builder +import paddle.nn as nn + + +class BaseMultimodal(nn.Layer): + """Base class for Multimodal. + + All Multimodal model should subclass it. + All subclass should overwrite: + + - Methods:``train_step``, supporting to forward when training. + - Methods:``valid_step``, supporting to forward when validating. + - Methods:``test_step``, supporting to forward when testing. + + Args: + backbone (dict): Backbone modules to extract feature. + head (dict): Head to process feature. + loss(dict): Loss function. + + """ + def __init__(self, backbone=None, head=None, loss=None): + super().__init__() + if backbone is not None: + self.backbone = builder.build_backbone(backbone) + if hasattr(self.backbone, 'init_weights'): + self.backbone.init_weights() + else: + self.backbone = None + if head is not None: + self.head_name = head.name + self.head = builder.build_head(head) + if hasattr(self.head, 'init_weights'): + self.head.init_weights() + else: + self.head = None + if loss is not None: + self.loss = builder.build_loss(loss) + else: + self.loss = None + + def forward(self, data_batch, mode='infer'): + """ + 1. Define how the model is going to run, from input to output. + 2. Console of train, valid, test or infer step + 3. 
Set mode='infer' is used for saving inference model, refer to tools/export_model.py + """ + if mode == 'train': + return self.train_step(data_batch) + elif mode == 'valid': + return self.val_step(data_batch) + elif mode == 'test': + return self.test_step(data_batch) + elif mode == 'infer': + return self.infer_step(data_batch) + else: + raise NotImplementedError + + @abstractmethod + def train_step(self, data_batch, **kwargs): + """Training step. + """ + raise NotImplementedError + + @abstractmethod + def val_step(self, data_batch, **kwargs): + """Validating step. + """ + raise NotImplementedError + + @abstractmethod + def test_step(self, data_batch, **kwargs): + """Test step. + """ + raise NotImplementedError + + @abstractmethod + def infer_step(self, data_batch, **kwargs): + """Infer step. + """ + raise NotImplementedError diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/partitioners/__init__.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/partitioners/__init__.py new file mode 100644 index 0000000..0c6de50 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/partitioners/__init__.py @@ -0,0 +1,18 @@ +# copyright (c) 2020 paddlepaddle authors. all rights reserved. +# +# licensed under the apache license, version 2.0 (the "license" +# you may not use this file except in compliance with the license. +# you may obtain a copy of the license at +# +# http://www.apache.org/licenses/license-2.0 +# +# unless required by applicable law or agreed to in writing, software +# distributed under the license is distributed on an "as is" basis, +# without warranties or conditions of any kind, either express or implied. +# see the license for the specific language governing permissions and +# limitations under the license. 
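# --- Illustrative sketch (not part of this diff) ---
# Every framework base class in this diff (detector, estimator, localizer,
# multimodal, partitioner, recognizer) shares the same forward() convention:
# a single 'mode' string routes one call into train/valid/test/infer steps,
# and callers invoke the model as model(data_batch, mode=...). A minimal,
# hypothetical model showing that caller-side usage:

import paddle.nn as nn

class ModeDispatchModel(nn.Layer):
    def forward(self, data_batch, mode='infer'):
        if mode == 'train':
            return self.train_step(data_batch)
        elif mode == 'valid':
            return self.val_step(data_batch)
        elif mode == 'test':
            return self.test_step(data_batch)
        elif mode == 'infer':
            return self.infer_step(data_batch)
        raise NotImplementedError(f"unknown mode: {mode}")

    def train_step(self, data_batch):
        return {'loss': sum(data_batch)}

    def val_step(self, data_batch):
        return self.train_step(data_batch)

    def test_step(self, data_batch):
        return data_batch

    def infer_step(self, data_batch):
        return data_batch

# usage: the trainer calls model(batch, mode='train'); export uses mode='infer'
model = ModeDispatchModel()
print(model([1.0, 2.0], mode='train'))   # {'loss': 3.0}
print(model([1.0, 2.0], mode='infer'))   # [1.0, 2.0]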
+ +from .base import BasePartitioner +from .transnetv2_partitioner import TransNetV2Partitioner + +__all__ = ['BasePartitioner', 'TransNetV2Partitioner'] diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/partitioners/__pycache__/__init__.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/partitioners/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..ac5d9bc Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/partitioners/__pycache__/__init__.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/partitioners/__pycache__/base.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/partitioners/__pycache__/base.cpython-310.pyc new file mode 100644 index 0000000..8194832 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/partitioners/__pycache__/base.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/partitioners/__pycache__/transnetv2_partitioner.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/partitioners/__pycache__/transnetv2_partitioner.cpython-310.pyc new file mode 100644 index 0000000..d9faf4a Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/partitioners/__pycache__/transnetv2_partitioner.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/partitioners/base.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/partitioners/base.py new file mode 100644 index 0000000..a7c9259 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/partitioners/base.py @@ -0,0 +1,84 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import abstractmethod +import paddle.nn as nn +from ... import builder + + +class BasePartitioner(nn.Layer): + """Base class for Partition. + All partitioner should subclass it. + All subclass should overwrite: + - Methods:``train_step``, define your train step. + - Methods:``valid_step``, define your valid step, always the same as train_step. + - Methods:``test_step``, define your test step. + """ + def __init__(self, backbone=None, head=None): + super().__init__() + if backbone is not None: + self.backbone = builder.build_backbone(backbone) + if hasattr(self.backbone, 'init_weights'): + self.backbone.init_weights() + else: + self.backbone = None + if head is not None: + self.head_name = head.name + self.head = builder.build_head(head) + if hasattr(self.head, 'init_weights'): + self.head.init_weights() + else: + self.head = None + + def init_weights(self): + """Initialize the model network weights. """ + if getattr(self.backbone, 'init_weights'): + self.backbone.init_weights() + else: + pass + + def forward(self, data_batch, mode='infer'): + """ + 1. 
Define how the model is going to run, from input to output. + 2. Console of train, valid, test or infer step + 3. Set mode='infer' is used for saving inference model, refer to tools/export_model.py + """ + if mode == 'train': + return self.train_step(data_batch) + elif mode == 'valid': + return self.val_step(data_batch) + elif mode == 'test': + return self.test_step(data_batch) + elif mode == 'infer': + return self.infer_step(data_batch) + else: + raise NotImplementedError + + @abstractmethod + def train_step(self, data_batch, **kwargs): + """Training step. input_data_batch -> loss_metric + """ + raise NotImplementedError + + @abstractmethod + def val_step(self, data_batch, **kwargs): + """Validating setp. input_data_batch -> loss_metric + """ + raise NotImplementedError + + @abstractmethod + def test_step(self, data_batch, **kwargs): + """Tets setp. to get acc in test data. input_data_batch -> output + """ + raise NotImplementedError diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/partitioners/transnetv2_partitioner.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/partitioners/transnetv2_partitioner.py new file mode 100644 index 0000000..c329506 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/partitioners/transnetv2_partitioner.py @@ -0,0 +1,68 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +from ...registry import PARTITIONERS +from .base import BasePartitioner + +import paddle + + +@PARTITIONERS.register() +class TransNetV2Partitioner(BasePartitioner): + """TransNetV2 Partitioner framework + """ + def forward_net(self, imgs): + one_hot_pred = self.backbone(imgs) + return one_hot_pred + + def train_step(self, data_batch): + """Define how the model is going to train, from input to output. 
+ """ + frame_sequence = data_batch[0] + one_hot_gt, many_hot_gt = data_batch[1:] + one_hot_pred = self.forward_net(frame_sequence) + dict_ = {} + if isinstance(one_hot_pred, tuple): + one_hot_pred, dict_ = one_hot_pred + many_hot_pred = dict_.get("many_hot", None) + comb_reg_loss = dict_.get("comb_reg_loss", None) + loss_metrics = self.head.loss(one_hot_pred, one_hot_gt, + many_hot_pred, many_hot_gt, + reg_losses={"comb_reg": comb_reg_loss}) + return loss_metrics + + def val_step(self, data_batch): + frame_sequence = data_batch[0] + one_hot_gt, many_hot_gt = data_batch[1:] + one_hot_pred = self.forward_net(frame_sequence) + dict_ = {} + if isinstance(one_hot_pred, tuple): + one_hot_pred, dict_ = one_hot_pred + many_hot_pred = dict_.get("many_hot", None) + comb_reg_loss = dict_.get("comb_reg_loss", None) + loss_metrics = self.head.loss(one_hot_pred, one_hot_gt, + many_hot_pred, many_hot_gt, + reg_losses={"comb_reg": comb_reg_loss}) + return loss_metrics + + def test_step(self, data_batch): + """Define how the model is going to test, from input to output.""" + # NOTE: (shipping) when testing, the net won't call head.loss, we deal with the test processing in /paddlevideo/metrics + frame_sequence = data_batch[0] + one_hot_pred = self.forward_net(frame_sequence) + return one_hot_pred + + def infer_step(self, data_batch): + """Define how the model is going to test, from input to output.""" + frame_sequence = data_batch[0] + one_hot_pred = self.forward_net(frame_sequence) + return one_hot_pred diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__init__.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__init__.py new file mode 100644 index 0000000..764b37f --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__init__.py @@ -0,0 +1,30 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ +from .base import BaseRecognizer +from .recognizer1d import Recognizer1D, RecognizerAction +from .recognizer2d import Recognizer2D +from .recognizer3d import Recognizer3D +from .recognizer_transformer import RecognizerTransformer +from .recognizer_gcn import RecognizerGCN +from .recognizerMRI import RecognizerMRI +from .recognizer3dMRI import Recognizer3DMRI +from .recognizer_transformer_MRI import RecognizerTransformer_MRI +from .recognizer_movinet_frame import MoViNetRecognizerFrame +from .recognizerDistillation import RecognizerDistillation + +__all__ = [ + 'BaseRecognizer', 'Recognizer1D', 'Recognizer2D', 'Recognizer3D', + 'RecognizerTransformer', 'RecognizerGCN', 'RecognizerMRI', + 'Recognizer3DMRI', 'RecognizerTransformer_MRI', 'MoViNetRecognizerFrame', + 'RecognizerAction', 'RecognizerDistillation' +] diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/__init__.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..f671541 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/__init__.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/base.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/base.cpython-310.pyc new file mode 100644 index 0000000..fec0174 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/base.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/recognizer1d.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/recognizer1d.cpython-310.pyc new file mode 100644 index 0000000..57a4caf Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/recognizer1d.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/recognizer2d.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/recognizer2d.cpython-310.pyc new file mode 100644 index 0000000..6181368 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/recognizer2d.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/recognizer3d.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/recognizer3d.cpython-310.pyc new file mode 100644 index 0000000..cd733c4 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/recognizer3d.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/recognizer3dMRI.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/recognizer3dMRI.cpython-310.pyc new file mode 100644 index 0000000..7e1f46b Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/recognizer3dMRI.cpython-310.pyc differ diff --git 
a/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/recognizerDistillation.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/recognizerDistillation.cpython-310.pyc new file mode 100644 index 0000000..af8ba91 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/recognizerDistillation.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/recognizerMRI.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/recognizerMRI.cpython-310.pyc new file mode 100644 index 0000000..efac30d Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/recognizerMRI.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/recognizer_gcn.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/recognizer_gcn.cpython-310.pyc new file mode 100644 index 0000000..57257fd Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/recognizer_gcn.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/recognizer_movinet_frame.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/recognizer_movinet_frame.cpython-310.pyc new file mode 100644 index 0000000..0f82b4f Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/recognizer_movinet_frame.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/recognizer_transformer.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/recognizer_transformer.cpython-310.pyc new file mode 100644 index 0000000..0ad9fce Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/recognizer_transformer.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/recognizer_transformer_MRI.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/recognizer_transformer_MRI.cpython-310.pyc new file mode 100644 index 0000000..197d673 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/__pycache__/recognizer_transformer_MRI.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/base.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/base.py new file mode 100644 index 0000000..bf31caf --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/base.py @@ -0,0 +1,81 @@ +from abc import abstractmethod +from ... import builder +import paddle.nn as nn + + +class BaseRecognizer(nn.Layer): + """Base class for recognizers. + + All recognizers should subclass it. + All subclass should overwrite: + + - Methods:``train_step``, supporting to forward when training. + - Methods:``valid_step``, supporting to forward when validating. 
+ - Methods:``test_step``, supporting to forward when testing. + + Args: + backbone (dict): Backbone modules to extract feature. + head (dict): Classification head to process feature. + + """ + def __init__(self, backbone=None, head=None, runtime_cfg=None): + + super().__init__() + if backbone is not None: + self.backbone = builder.build_backbone(backbone) + if hasattr(self.backbone, 'init_weights'): + self.backbone.init_weights() + else: + self.backbone = None + if head is not None: + self.head_name = head.name + self.head = builder.build_head(head) + if hasattr(self.head, 'init_weights'): + self.head.init_weights() + else: + self.head = None + + # Settings when the model is running, + # such as 'avg_type' + self.runtime_cfg = runtime_cfg + + def forward(self, data_batch, mode='infer'): + """ + 1. Define how the model is going to run, from input to output. + 2. Console of train, valid, test or infer step + 3. Set mode='infer' is used for saving inference model, refer to tools/export_model.py + """ + if mode == 'train': + return self.train_step(data_batch) + elif mode == 'valid': + return self.val_step(data_batch) + elif mode == 'test': + return self.test_step(data_batch) + elif mode == 'infer': + return self.infer_step(data_batch) + else: + raise NotImplementedError + + @abstractmethod + def train_step(self, data_batch, **kwargs): + """Training step. + """ + raise NotImplementedError + + @abstractmethod + def val_step(self, data_batch, **kwargs): + """Validating step. + """ + raise NotImplementedError + + @abstractmethod + def test_step(self, data_batch, **kwargs): + """Test step. + """ + raise NotImplementedError + + @abstractmethod + def infer_step(self, data_batch, **kwargs): + """Infer step. + """ + raise NotImplementedError diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/recognizer1d.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/recognizer1d.py new file mode 100644 index 0000000..2c7fa94 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/recognizer1d.py @@ -0,0 +1,111 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +from ...registry import RECOGNIZERS +from .base import BaseRecognizer + + +@RECOGNIZERS.register() +class Recognizer1D(BaseRecognizer): + """1D recognizer model framework.""" + def forward_net(self, imgs): + """Define how the model is going to train, from input to output. + """ + lstm_logit, lstm_output = self.head(imgs) + return lstm_logit, lstm_output + + def train_step(self, data_batch): + """Training step. 
+ """ + rgb_data, rgb_len, rgb_mask, audio_data, audio_len, audio_mask, labels = data_batch + imgs = [(rgb_data, rgb_len, rgb_mask), + (audio_data, audio_len, audio_mask)] + + # call forward + lstm_logit, lstm_output = self.forward_net(imgs) + loss = self.head.loss(lstm_logit, labels) + hit_at_one, perr, gap = self.head.metric(lstm_output, labels) + loss_metrics = dict() + loss_metrics['loss'] = loss + loss_metrics['hit_at_one'] = hit_at_one + loss_metrics['perr'] = perr + loss_metrics['gap'] = gap + + return loss_metrics + + def val_step(self, data_batch): + """Validating setp. + """ + return self.train_step(data_batch) + + def test_step(self, data_batch): + """Testing setp. + """ + return self.train_step(data_batch) + + def infer_step(self, data_batch): + """Infering setp. + """ + rgb_data, rgb_len, rgb_mask, audio_data, audio_len, audio_mask = data_batch + imgs = [(rgb_data, rgb_len, rgb_mask), + (audio_data, audio_len, audio_mask)] + # call forward + lstm_logit, _ = self.forward_net(imgs) + return lstm_logit + + +@RECOGNIZERS.register() +class RecognizerAction(BaseRecognizer): + """1D recognizer model framework.""" + def forward_net(self, imgs): + """Define how the model is going to train, from input to output. + """ + lstm_logit, lstm_output = self.head(imgs) + return lstm_logit, lstm_output + + def train_step(self, data_batch): + """Training step. + """ + rgb_data, rgb_len, rgb_mask, audio_data, audio_len, audio_mask, labels, labels_iou = data_batch + imgs = [(rgb_data, rgb_len, rgb_mask), + (audio_data, audio_len, audio_mask)] + + # call forward + output_logit, output_iou = self.forward_net(imgs) + loss = self.head.loss(output_logit, output_iou, labels, labels_iou) + top1, top5 = self.head.metric(output_logit, labels) + loss_metrics = dict() + loss_metrics['loss'] = loss + loss_metrics['top1'] = top1 + loss_metrics['top5'] = top5 + + return loss_metrics + + def val_step(self, data_batch): + """Validating setp. + """ + return self.train_step(data_batch) + + def test_step(self, data_batch): + """Testing setp. + """ + return self.train_step(data_batch) + + def infer_step(self, data_batch): + """Infering setp. + """ + rgb_data, rgb_len, rgb_mask, audio_data, audio_len, audio_mask = data_batch + imgs = [(rgb_data, rgb_len, rgb_mask), + (audio_data, audio_len, audio_mask)] + # call forward + output_logit, output_iou = self.forward_net(imgs) + return output_logit, output_iou diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/recognizer2d.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/recognizer2d.py new file mode 100644 index 0000000..d8aa661 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/recognizer2d.py @@ -0,0 +1,69 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ +from ...registry import RECOGNIZERS +from .base import BaseRecognizer +import paddle +from paddlevideo.utils import get_logger + +logger = get_logger("paddlevideo") + + +@RECOGNIZERS.register() +class Recognizer2D(BaseRecognizer): + """2D recognizer model framework.""" + def forward_net(self, imgs): + # NOTE: As the num_segs is an attribute of dataset phase, and didn't pass to build_head phase, should obtain it from imgs(paddle.Tensor) now, then call self.head method. + num_segs = imgs.shape[ + 1] # imgs.shape=[N,T,C,H,W], for most commonly case + imgs = paddle.reshape_(imgs, [-1] + list(imgs.shape[2:])) + + if self.backbone is not None: + feature = self.backbone(imgs) + else: + feature = imgs + + if self.head is not None: + cls_score = self.head(feature, num_segs) + else: + cls_score = None + + return cls_score + + def train_step(self, data_batch): + """Define how the model is going to train, from input to output. + """ + imgs = data_batch[0] + labels = data_batch[1:] + cls_score = self.forward_net(imgs) + loss_metrics = self.head.loss(cls_score, labels) + return loss_metrics + + def val_step(self, data_batch): + imgs = data_batch[0] + labels = data_batch[1:] + cls_score = self.forward_net(imgs) + loss_metrics = self.head.loss(cls_score, labels, valid_mode=True) + return loss_metrics + + def test_step(self, data_batch): + """Define how the model is going to test, from input to output.""" + # NOTE: (shipping) when testing, the net won't call head.loss, we deal with the test processing in /paddlevideo/metrics + imgs = data_batch[0] + cls_score = self.forward_net(imgs) + return cls_score + + def infer_step(self, data_batch): + """Define how the model is going to test, from input to output.""" + imgs = data_batch[0] + cls_score = self.forward_net(imgs) + return cls_score diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/recognizer3d.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/recognizer3d.py new file mode 100644 index 0000000..f0ecff1 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/recognizer3d.py @@ -0,0 +1,93 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +from ...registry import RECOGNIZERS +from .base import BaseRecognizer +from paddlevideo.utils import get_logger + +logger = get_logger("paddlevideo") + + +@RECOGNIZERS.register() +class Recognizer3D(BaseRecognizer): + """3D Recognizer model framework. + """ + + def forward_net(self, imgs): + """Define how the model is going to run, from input to output. + """ + feature = self.backbone(imgs) + cls_score = self.head(feature) + return cls_score + + def train_step(self, data_batch): + """Training step. 
+ """ + if self.backbone.__class__.__name__ == 'ResNet3dSlowOnly': + imgs = data_batch[0] + labels = data_batch[1:] + if imgs.dim() == 6: + imgs = imgs.reshape([-1] + imgs.shape[2:]) + else: + imgs = data_batch[0:2] + labels = data_batch[2:] + + # call forward + cls_score = self.forward_net(imgs) + loss_metrics = self.head.loss(cls_score, labels) + return loss_metrics + + def val_step(self, data_batch): + """Validating setp. + """ + if self.backbone.__class__.__name__ == 'ResNet3dSlowOnly': + imgs = data_batch[0] + labels = data_batch[1:] + if imgs.dim() == 6: + imgs = imgs.reshape([-1] + imgs.shape[2:]) + else: + imgs = data_batch[0:2] + labels = data_batch[2:] + + # call forward + cls_score = self.forward_net(imgs) + loss_metrics = self.head.loss(cls_score, labels, valid_mode=True) + return loss_metrics + + def test_step(self, data_batch): + """Test step. + """ + if self.backbone.__class__.__name__ == 'ResNet3dSlowOnly': + imgs = data_batch[0] + if imgs.dim() == 6: + imgs = imgs.reshape([-1] + imgs.shape[2:]) + else: + imgs = data_batch[0:2] + # call forward + cls_score = self.forward_net(imgs) + + return cls_score + + def infer_step(self, data_batch): + """Infer step. + """ + if self.backbone.__class__.__name__ == 'ResNet3dSlowOnly': + imgs = data_batch[0] + # call forward + imgs = imgs.reshape([-1] + imgs.shape[2:]) + cls_score = self.forward_net(imgs) + else: + imgs = data_batch[0:2] + # call forward + cls_score = self.forward_net(imgs) + + return cls_score diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/recognizer3dMRI.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/recognizer3dMRI.py new file mode 100644 index 0000000..9298491 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/recognizer3dMRI.py @@ -0,0 +1,81 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +from ...registry import RECOGNIZERS +from .base import BaseRecognizer +from paddlevideo.utils import get_logger +import paddle + +logger = get_logger("paddlevideo") + + +@RECOGNIZERS.register() +class Recognizer3DMRI(BaseRecognizer): + """3D Recognizer model framework. + """ + def forward_net(self, imgs): + """Define how the model is going to run, from input to output. + """ + + imgs[0] = paddle.cast(imgs[0], "float32") + imgs[1] = paddle.cast(imgs[1], "float32") + imgs[0] = imgs[0].unsqueeze(1) + imgs[1] = imgs[1].unsqueeze(1) + + feature = self.backbone(imgs) + cls_score = self.head(feature) + return cls_score + + def train_step(self, data_batch): + """Training step. + """ + imgs = data_batch[0:2] + labels = data_batch[2:] + + # call forward + cls_score = self.forward_net(imgs) + cls_score = paddle.nn.functional.sigmoid(cls_score) + loss_metrics = self.head.loss(cls_score, labels, if_top5=False) + return loss_metrics + + def val_step(self, data_batch): + """Validating setp. 
+ """ + imgs = data_batch[0:2] + labels = data_batch[2:] + + # call forward + cls_score = self.forward_net(imgs) + cls_score = paddle.nn.functional.sigmoid(cls_score) + loss_metrics = self.head.loss(cls_score, + labels, + valid_mode=True, + if_top5=False) + return loss_metrics + + def test_step(self, data_batch): + """Test step. + """ + imgs = data_batch[0:2] + # call forward + cls_score = self.forward_net(imgs) + + return cls_score + + def infer_step(self, data_batch): + """Infer step. + """ + imgs = data_batch[0:2] + # call forward + cls_score = self.forward_net(imgs) + + return cls_score diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/recognizerDistillation.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/recognizerDistillation.py new file mode 100644 index 0000000..6f48a08 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/recognizerDistillation.py @@ -0,0 +1,231 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +from abc import abstractmethod +import paddle +import paddle.nn as nn + +from ...registry import RECOGNIZERS +from ... import builder +from paddlevideo.utils import get_logger, get_dist_info + +logger = get_logger("paddlevideo") + + +@RECOGNIZERS.register() +class RecognizerDistillation(nn.Layer): + """recognizer Distillation framework.""" + def __init__(self, + freeze_params_list=None, + models=None, + loss=None, + **kargs): + """ + Args: + freeze_params_list: list, set each model is trainable or not + models: config of distillaciton model. 
+ loss: config of loss list + """ + super().__init__() + self.model_list = [] + self.model_name_list = [] + self.loss_cfgs = loss + + if freeze_params_list is None: + freeze_params_list = [False] * len(models) + assert len(freeze_params_list) == len(models) + + # build Teacher and Student model + for idx, model_config in enumerate(models): + assert len(model_config) == 1 + key = list(model_config.keys())[0] #Teacher or Student + model_config = model_config[key] + model_name = model_config['backbone']['name'] + + backbone, head = None, None + if model_config.get('backbone'): + backbone = builder.build_backbone(model_config['backbone']) + if hasattr(backbone, 'init_weights'): + backbone.init_weights() + if model_config.get('head'): + head = builder.build_head(model_config['head']) + if hasattr(head, 'init_weights'): + head.init_weights() + + model = nn.Sequential(backbone, head) + logger.info('build distillation {} model done'.format(key)) + # for add all parameters in nn.Layer class + self.model_list.append(self.add_sublayer(key, model)) + self.model_name_list.append({model_name: key}) + + # set model trainable or not + if freeze_params_list[idx]: + for param in model.parameters(): + param.trainable = False + + # build loss: support for loss list + self.loss_func_list = [] + mode_keys = list(loss.keys()) + for mode in mode_keys: + loss_cfgs = loss[mode] + for loss_cfg in loss_cfgs: + loss_func_dict = {} + model_name_pairs = loss_cfg.pop('model_name_pairs') + loss_func = builder.build_loss(loss_cfg) + loss_func_dict['mode'] = mode + loss_func_dict['loss_func'] = loss_func + loss_func_dict['model_name_pairs'] = model_name_pairs + self.loss_func_list.append(loss_func_dict) + + def forward(self, data_batch, mode='infer'): + """ + 1. Define how the model is going to run, from input to output. + 2. Console of train, valid, test or infer step + 3. 
Set mode='infer' is used for saving inference model, refer to tools/export_model.py + """ + if mode == 'train': + return self.train_step(data_batch) + elif mode == 'valid': + return self.val_step(data_batch) + elif mode == 'test': + return self.test_step(data_batch) + elif mode == 'infer': + return self.infer_step(data_batch) + else: + raise NotImplementedError + + def get_loss(self, output, labels, mode): + """ + Args: + output: dict, output name and its value + labels: label of data + mode: str, 'Train' or 'Val' + """ + output['GroundTruth'] = labels + loss_list = [] + + for loss_func_dict in self.loss_func_list: + if mode == loss_func_dict['mode']: + model_name_pairs = loss_func_dict['model_name_pairs'] + loss_func = loss_func_dict['loss_func'] + loss_val = loss_func(output[model_name_pairs[0]], + output[model_name_pairs[1]]) + loss_list.append(loss_val) + + total_loss = paddle.add_n(loss_list) + return total_loss + + def get_acc(self, scores, labels, mode='Train'): + def _get_acc(score, label, mode='Train'): + top1 = paddle.metric.accuracy(input=score, label=label, k=1) + top5 = paddle.metric.accuracy(input=score, label=label, k=5) + _, world_size = get_dist_info() + # Deal with multi cards validate + if world_size > 1 and mode == 'Val': #reduce sum when valid + top1 = paddle.distributed.all_reduce( + top1, op=paddle.distributed.ReduceOp.SUM) / world_size + top5 = paddle.distributed.all_reduce( + top5, op=paddle.distributed.ReduceOp.SUM) / world_size + return top1, top5 + + if len(labels) == 1: + label = labels[0] + return _get_acc(scores, label) + # Deal with VideoMix + elif len(labels) == 3: + label_a, label_b, lam = labels + top1a, top5a = _get_acc(scores, label_a, mode) + top1b, top5b = _get_acc(scores, label_b, mode) + top1 = lam * top1a + (1 - lam) * top1b + top5 = lam * top5a + (1 - lam) * top5b + return top1, top5 + + def forward_model(self, imgs, model_name, model): + if model_name in ['PPTSM_v2', 'ResNetTweaksTSM']: + # [N,T,C,H,W] -> [N*T,C,H,W] + imgs = paddle.reshape(imgs, [-1] + list(imgs.shape[2:])) + + return model(imgs) + + def train_step(self, data_batch): + """Define how the model is going to train, from input to output. 
+ """ + out = {} + loss_metrics = {} + imgs = data_batch[0] + labels = data_batch[1:] + + for idx, item in enumerate(self.model_name_list): + model = self.model_list[idx] + model_name = list(item.keys())[0] + model_type = item[model_name] # Teacher or Student + out[model_type] = self.forward_model(imgs, model_name, model) + + # out_student, out_teacher + loss = self.get_loss(out, labels, 'Train') + loss_metrics['loss'] = loss + # calculate acc with student output + top1, top5 = self.get_acc(out['Student'], labels) + loss_metrics['top1'] = top1 + loss_metrics['top5'] = top5 + + return loss_metrics + + def val_step(self, data_batch): + out = {} + loss_metrics = {} + imgs = data_batch[0] + labels = data_batch[1:] + + for idx, item in enumerate(self.model_name_list): + model = self.model_list[idx] + model_name = list(item.keys())[0] + model_type = item[model_name] # Teacher or Student + out[model_type] = self.forward_model(imgs, model_name, model) + + # Loss of student with gt: out_student, label + loss = self.get_loss(out, labels, 'Val') + loss_metrics['loss'] = loss + + top1, top5 = self.get_acc(out['Student'], labels, 'Val') + loss_metrics['top1'] = top1 + loss_metrics['top5'] = top5 + + return loss_metrics + + def test_step(self, data_batch): + """Define how the model is going to test, from input to output.""" + imgs = data_batch[0] + + # Use Student to test + for idx, item in enumerate(self.model_name_list): + model = self.model_list[idx] + model_name = list(item.keys())[0] + model_type = item[model_name] # Teacher or Student + if model_type == "Student": + out = self.forward_model(imgs, model_name, model) + + return out + + def infer_step(self, data_batch): + """Define how the model is going to test, from input to output.""" + imgs = data_batch[0] + + # Use Student to infer + for idx, item in enumerate(self.model_name_list): + model = self.model_list[idx] + model_name = list(item.keys())[0] + model_type = item[model_name] # Teacher or Student + if model_type == "Student": + out = self.forward_model(imgs, model_name, model) + + return out diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/recognizerMRI.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/recognizerMRI.py new file mode 100644 index 0000000..4b1713e --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/recognizerMRI.py @@ -0,0 +1,76 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +from ...registry import RECOGNIZERS +from .base import BaseRecognizer +import paddle +from paddlevideo.utils import get_logger + +logger = get_logger("paddlevideo") + + +@RECOGNIZERS.register() +class RecognizerMRI(BaseRecognizer): + """2D recognizer model framework.""" + def forward_net(self, imgs): + # NOTE: As the num_segs is an attribute of dataset phase, and didn't pass to build_head phase, should obtain it from imgs(paddle.Tensor) now, then call self.head method. 
+ num_segs = imgs.shape[ + 1] # imgs.shape=[N,T,C,H,W], for most commonly case + imgs = paddle.reshape_(imgs, [-1] + list(imgs.shape[2:])) + imgs = paddle.cast(imgs, "float32") ############# + imgs = imgs.unsqueeze(1) + + if self.backbone != None: + feature = self.backbone(imgs) + else: + feature = imgs + + if self.head != None: + cls_score = self.head(feature, num_segs) + else: + cls_score = None + + return cls_score + + def train_step(self, data_batch): + """Define how the model is going to train, from input to output. + """ + imgs = data_batch[0] + labels = data_batch[1:] + cls_score = self.forward_net(imgs) + cls_score = paddle.nn.functional.sigmoid(cls_score) + loss_metrics = self.head.loss(cls_score, labels, if_top5=False) + return loss_metrics + + def val_step(self, data_batch): + imgs = data_batch[0] + labels = data_batch[1:] + cls_score = self.forward_net(imgs) + cls_score = paddle.nn.functional.sigmoid(cls_score) + loss_metrics = self.head.loss(cls_score, + labels, + valid_mode=True, + if_top5=False) + return loss_metrics + + def test_step(self, data_batch): + """Define how the model is going to test, from input to output.""" + # NOTE: (shipping) when testing, the net won't call head.loss, we deal with the test processing in /paddlevideo/metrics + imgs = data_batch[0] + cls_score = self.forward_net(imgs) + return cls_score + + def infer_step(self, data_batch): + """Define how the model is going to test, from input to output.""" + imgs = data_batch[0] + cls_score = self.forward_net(imgs) + return cls_score diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/recognizer_gcn.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/recognizer_gcn.py new file mode 100644 index 0000000..281c5ac --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/recognizer_gcn.py @@ -0,0 +1,87 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +from ...registry import RECOGNIZERS +from .base import BaseRecognizer +from paddlevideo.utils import get_logger + +logger = get_logger("paddlevideo") + + +@RECOGNIZERS.register() +class RecognizerGCN(BaseRecognizer): + """GCN Recognizer model framework. + """ + + def __init__(self, + backbone=None, + head=None, + runtime_cfg=None, + if_top5=True): + """ + Args: + backbone (dict): Backbone modules to extract feature. + head (dict): Classification head to process feature. + is_top5 (bool): Whether to display top-5 accuracy during training/validation steps. + """ + super(RecognizerGCN, self).__init__(backbone, head, runtime_cfg) + self.if_top5 = if_top5 + + def forward_net(self, data): + """Define how the model is going to run, from input to output. + """ + feature = self.backbone(data) + cls_score = self.head(feature) + return cls_score + + def train_step(self, data_batch): + """Training step. 
+ """ + data = data_batch[0] + label = data_batch[1:] + + # call forward + cls_score = self.forward_net(data) + loss_metrics = self.head.loss(cls_score, label, if_top5=self.if_top5) + return loss_metrics + + def val_step(self, data_batch): + """Validating setp. + """ + data = data_batch[0] + label = data_batch[1:] + + # call forward + cls_score = self.forward_net(data) + loss_metrics = self.head.loss(cls_score, + label, + valid_mode=True, + if_top5=self.if_top5) + return loss_metrics + + def test_step(self, data_batch): + """Test step. + """ + data = data_batch[0] + + # call forward + cls_score = self.forward_net(data) + return cls_score + + def infer_step(self, data_batch): + """Infer step. + """ + data = data_batch[0] + + # call forward + cls_score = self.forward_net(data) + return cls_score diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/recognizer_movinet_frame.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/recognizer_movinet_frame.py new file mode 100644 index 0000000..1ad2e14 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/recognizer_movinet_frame.py @@ -0,0 +1,78 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +import paddle + +from paddlevideo.utils import get_logger +from .base import BaseRecognizer +from ...registry import RECOGNIZERS + +logger = get_logger("paddlevideo") + + +@RECOGNIZERS.register() +class MoViNetRecognizerFrame(BaseRecognizer): + + def forward_net(self, imgs): + """Define how the model is going to run, from input to output. + """ + self.backbone.clean_activation_buffers() + outputs = self.backbone(imgs) + cls_score = self.head(outputs) + return cls_score + + def train_step(self, data_batch): + """Training step. + """ + imgs = data_batch[0] + labels = data_batch[1] #.astype("int64") + data = paddle.transpose(imgs, perm=[0, 2, 1, 3, 4]) + # call forward + cls_score = self.forward_net(data) + loss_metrics = self.head.loss_func(cls_score, labels) + top1 = paddle.metric.accuracy(input=cls_score, label=labels, k=1) + top5 = paddle.metric.accuracy(input=cls_score, label=labels, k=5) + output = {'loss': loss_metrics, 'top1': top1, 'top5': top5} + return output + + def val_step(self, data_batch): + """Validating setp. + """ + imgs = data_batch[0] + labels = data_batch[1] #.astype("int64") + data = paddle.transpose(imgs, perm=[0, 2, 1, 3, 4]) + # call forward + cls_score = self.forward_net(data) + loss_metrics = self.head.loss_func(cls_score, labels) + top1 = paddle.metric.accuracy(input=cls_score, label=labels, k=1) + top5 = paddle.metric.accuracy(input=cls_score, label=labels, k=5) + output = {'loss': loss_metrics, 'top1': top1, 'top5': top5} + return output + + def test_step(self, data_batch): + """Test step. + """ + imgs = data_batch[0] + data = paddle.transpose(imgs, perm=[0, 2, 1, 3, 4]) + # call forward + cls_score = self.forward_net(data) + return cls_score + + def infer_step(self, data_batch): + """Infer step. 
+ """ + imgs = data_batch[0] + # call forward + data = paddle.transpose(imgs, perm=[0, 2, 1, 3, 4]) + cls_score = self.forward_net(data) + + return cls_score diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/recognizer_transformer.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/recognizer_transformer.py new file mode 100644 index 0000000..4144eda --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/recognizer_transformer.py @@ -0,0 +1,98 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +import paddle +import paddle.nn.functional as F +from paddlevideo.utils import get_logger + +from ...registry import RECOGNIZERS +from .base import BaseRecognizer + +logger = get_logger("paddlevideo") + + +@RECOGNIZERS.register() +class RecognizerTransformer(BaseRecognizer): + """Transformer's recognizer model framework.""" + def forward_net(self, imgs): + # imgs.shape=[N,C,T,H,W], for transformer case + if self.backbone is not None: + feature = self.backbone(imgs) + else: + feature = imgs + + if self.head is not None: + cls_score = self.head(feature) + else: + cls_score = None + + return cls_score + + def train_step(self, data_batch): + """Define how the model is going to train, from input to output. + """ + imgs = data_batch[0] + labels = data_batch[1:] + cls_score = self.forward_net(imgs) + loss_metrics = self.head.loss(cls_score, labels) + return loss_metrics + + def val_step(self, data_batch): + imgs = data_batch[0] + labels = data_batch[1:] + cls_score = self.forward_net(imgs) + loss_metrics = self.head.loss(cls_score, labels, valid_mode=True) + return loss_metrics + + def test_step(self, data_batch): + """Define how the model is going to infer, from input to output.""" + imgs = data_batch[0] + num_views = imgs.shape[2] // self.runtime_cfg.test.num_seg + cls_score = [] + for i in range(num_views): + view = imgs[:, :, i * self.runtime_cfg.test.num_seg:(i + 1) * + self.runtime_cfg.test.num_seg] + cls_score.append(self.forward_net(view)) + cls_score = self._average_view(cls_score, + self.runtime_cfg.test.avg_type) + return cls_score + + def infer_step(self, data_batch): + """Define how the model is going to infer, from input to output.""" + imgs = data_batch[0] + num_views = imgs.shape[2] // self.runtime_cfg.test.num_seg + cls_score = [] + for i in range(num_views): + view = imgs[:, :, i * self.runtime_cfg.test.num_seg:(i + 1) * + self.runtime_cfg.test.num_seg] + cls_score.append(self.forward_net(view)) + cls_score = self._average_view(cls_score, + self.runtime_cfg.test.avg_type) + return cls_score + + def _average_view(self, cls_score, avg_type='score'): + """Combine the predicted results of different views + + Args: + cls_score (list): results of multiple views + avg_type (str, optional): Average calculation method. Defaults to 'score'. 
+ """ + assert avg_type in ['score', 'prob'], \ + f"Currently only the average of 'score' or 'prob' is supported, but got {avg_type}" + if avg_type == 'score': + return paddle.add_n(cls_score) / len(cls_score) + elif avg_type == 'prob': + return paddle.add_n( + [F.softmax(score, axis=-1) + for score in cls_score]) / len(cls_score) + else: + raise NotImplementedError diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/recognizer_transformer_MRI.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/recognizer_transformer_MRI.py new file mode 100644 index 0000000..e8696b4 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/recognizers/recognizer_transformer_MRI.py @@ -0,0 +1,104 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +import paddle +import paddle.nn.functional as F +from paddlevideo.utils import get_logger + +from ...registry import RECOGNIZERS +from .base import BaseRecognizer + +logger = get_logger("paddlevideo") + + +@RECOGNIZERS.register() +class RecognizerTransformer_MRI(BaseRecognizer): + """Transformer's recognizer model framework.""" + def forward_net(self, imgs): + # imgs.shape=[N,C,T,H,W], for transformer case + + imgs = paddle.cast(imgs, "float32") ############# + imgs = imgs.unsqueeze(1) + + if self.backbone != None: + feature = self.backbone(imgs) + else: + feature = imgs + + if self.head != None: + cls_score = self.head(feature) + else: + cls_score = None + + return cls_score + + def train_step(self, data_batch): + """Define how the model is going to train, from input to output. 
+ """ + imgs = data_batch[0] + labels = data_batch[1:] + cls_score = self.forward_net(imgs) + cls_score = paddle.nn.functional.sigmoid(cls_score) + loss_metrics = self.head.loss(cls_score, labels, if_top5=False) + return loss_metrics + + def val_step(self, data_batch): + imgs = data_batch[0] + labels = data_batch[1:] + cls_score = self.forward_net(imgs) + cls_score = paddle.nn.functional.sigmoid(cls_score) + loss_metrics = self.head.loss(cls_score, + labels, + valid_mode=True, + if_top5=False) + return loss_metrics + + def test_step(self, data_batch): + """Define how the model is going to infer, from input to output.""" + imgs = data_batch[0] + num_views = imgs.shape[2] // self.backbone.seg_num + cls_score = [] + for i in range(num_views): + view = imgs[:, :, i * self.backbone.seg_num:(i + 1) * + self.backbone.seg_num] + cls_score.append(self.forward_net(view)) + cls_score = self.average_view(cls_score) + return cls_score + + def infer_step(self, data_batch): + """Define how the model is going to infer, from input to output.""" + imgs = data_batch[0] + num_views = imgs.shape[2] // self.backbone.seg_num + cls_score = [] + for i in range(num_views): + view = imgs[:, :, i * self.backbone.seg_num:(i + 1) * + self.backbone.seg_num] + cls_score.append(self.forward_net(view)) + cls_score = self.average_view(cls_score) + return cls_score + + def average_view(self, cls_score, average_type='score'): + """Combine the scores of different views + + Args: + cls_score (list): Scores of multiple views + average_type (str, optional): Average calculation method. Defaults to 'score'. + """ + assert average_type in ['score', 'prob'], \ + f"Currently only the average of 'score' or 'prob' is supported, but got {average_type}" + if average_type == 'score': + return paddle.add_n(cls_score) / len(cls_score) + elif average_type == 'avg': + return paddle.add_n([F.softmax(score) + for score in cls_score]) / len(cls_score) + else: + raise NotImplementedError diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/segment/__init__.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segment/__init__.py new file mode 100644 index 0000000..28a1d2e --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segment/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ +from .base import BaseSegment +from .cfbi import CFBI + +__all__ = ['BaseSegment', 'CFBI'] diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/segment/__pycache__/__init__.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segment/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..eadd905 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segment/__pycache__/__init__.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/segment/__pycache__/base.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segment/__pycache__/base.cpython-310.pyc new file mode 100644 index 0000000..416bb39 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segment/__pycache__/base.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/segment/__pycache__/cfbi.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segment/__pycache__/cfbi.cpython-310.pyc new file mode 100644 index 0000000..4d91c88 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segment/__pycache__/cfbi.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/segment/__pycache__/utils.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segment/__pycache__/utils.cpython-310.pyc new file mode 100644 index 0000000..740259a Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segment/__pycache__/utils.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/segment/base.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segment/base.py new file mode 100644 index 0000000..0c5cb07 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segment/base.py @@ -0,0 +1,90 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +from abc import abstractmethod +from ... import builder +import paddle.nn as nn + + +class BaseSegment(nn.Layer): + """Base class for semi-Video Object Segmentation. + All subclass should overwrite: + + - Methods:``train_step``, supporting to forward when training. + - Methods:``valid_step``, supporting to forward when validating. + - Methods:``test_step``, supporting to forward when testing. + + Args: + backbone (dict): Backbone modules to extract feature. + head (dict): Head to process feature. + loss(dict): Loss function. 
+ """ + def __init__(self, backbone=None, head=None, loss=None): + super().__init__() + if backbone is not None: + self.backbone = builder.build_backbone(backbone) + if hasattr(self.backbone, 'init_weights'): + self.backbone.init_weights() + else: + self.backbone = None + if head is not None: + self.head_name = head.name + self.head = builder.build_head(head) + if hasattr(self.head, 'init_weights'): + self.head.init_weights() + else: + self.head = None + if loss is not None: + self.loss = builder.build_loss(loss) + else: + self.loss = None + + def forward(self, data_batch, mode='infer'): + """ + 1. Define how the model is going to run, from input to output. + 2. Console of train, valid, test or infer step + 3. Set mode='infer' is used for saving inference model, refer to tools/export_model.py + """ + if mode == 'train': + return self.train_step(data_batch) + elif mode == 'valid': + return self.val_step(data_batch) + elif mode == 'test': + return self.test_step(data_batch) + elif mode == 'infer': + return self.infer_step(data_batch) + else: + raise NotImplementedError + + @abstractmethod + def train_step(self, data_batch, **kwargs): + """Training step. + """ + raise NotImplementedError + + @abstractmethod + def val_step(self, data_batch, **kwargs): + """Validating step. + """ + raise NotImplementedError + + @abstractmethod + def test_step(self, data_batch, **kwargs): + """Test step. + """ + raise NotImplementedError + + @abstractmethod + def infer_step(self, data_batch, **kwargs): + """Infer step. + """ + raise NotImplementedError diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/segment/cfbi.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segment/cfbi.py new file mode 100644 index 0000000..dcdc512 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segment/cfbi.py @@ -0,0 +1,286 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import numpy as np + +from .utils import foreground2background, global_matching_for_eval, local_matching, calculate_attention_head_for_eval +from ...registry import SEGMENT +from .base import BaseSegment +from paddlevideo.utils import get_logger + +logger = get_logger("paddlevideo") + + +@SEGMENT.register() +class CFBI(BaseSegment): + """CFBI model framework.""" + def __init__(self, backbone=None, head=None, loss=None): + super().__init__(backbone, head, loss) + x1 = paddle.zeros([3, 1, 1, 1]) + self.bg_bias = paddle.create_parameter( + shape=x1.shape, + dtype=x1.dtype, + default_initializer=nn.initializer.Assign(x1)) + self.fg_bias = paddle.create_parameter( + shape=x1.shape, + dtype=x1.dtype, + default_initializer=nn.initializer.Assign(x1)) + self.epsilon = 1e-05 + + def test_step(self, data_batch): + """Define how the model is going to test, from input to output. 
+ """ + self.test_mode = True + ref_embeddings, ref_masks, prev_embedding, prev_mask, current_frame, pred_size, gt_ids = data_batch + current_frame_embedding_4x, current_frame_embedding_8x, current_frame_embedding_16x, \ + current_low_level = self.backbone(current_frame) + + current_frame_embedding = [ + current_frame_embedding_4x, current_frame_embedding_8x, + current_frame_embedding_16x + ] + + if prev_embedding is None: + return None, current_frame_embedding + else: + bs, c, h, w = current_frame_embedding_4x.shape + + tmp_dic, _ = self.before_seghead_process( + ref_embeddings, + prev_embedding, + current_frame_embedding, + ref_masks, + prev_mask, + gt_ids, + current_low_level=current_low_level, + ) + all_pred = [] + for i in range(bs): + pred = tmp_dic[i] + + pred = F.interpolate(pred, + size=[pred_size[0], pred_size[1]], + mode='bilinear', + align_corners=True) + all_pred.append(pred) + all_pred = paddle.concat(all_pred, axis=0) + all_pred = F.softmax(all_pred, axis=1) + return all_pred, current_frame_embedding + + def before_seghead_process(self, + ref_frame_embeddings=None, + previous_frame_embeddings=None, + current_frame_embeddings=None, + ref_frame_labels=None, + previous_frame_mask=None, + gt_ids=None, + current_low_level=None): + """ process befor segmentation head""" + TEST_GLOBAL_MATCHING_CHUNK = [4, 1, 1] + TEST_GLOBAL_ATROUS_RATE = [2, 1, 1] + TRAIN_LOCAL_ATROUS_RATE = [2, 1, 1] + TEST_LOCAL_ATROUS_RATE = [2, 1, 1] + MODEL_FLOAT16_MATCHING = False + TEST_GLOBAL_MATCHING_MIN_PIXEL = 100 + MODEL_MULTI_LOCAL_DISTANCE = [[4, 8, 12, 16, 20, 24], + [2, 4, 6, 8, 10, 12], [2, 4, 6, 8, 10]] + TRAIN_LOCAL_PARALLEL = True + TEST_LOCAL_PARALLEL = True + MODEL_MATCHING_BACKGROUND = True + MODEL_SEMANTIC_MATCHING_DIM = [32, 64, 128] + + dic_tmp = [] + boards = {} + scale_ref_frame_labels = [] + scale_previous_frame_labels = [] + for current_frame_embedding in current_frame_embeddings: + bs, c, h, w = current_frame_embedding.shape + if not self.test_mode: + raise NotImplementedError + else: + ref_frame_embeddings = list(zip(*ref_frame_embeddings)) + all_scale_ref_frame_label = [] + for ref_frame_label in ref_frame_labels: + scale_ref_frame_label = paddle.cast(F.interpolate( + paddle.cast(ref_frame_label, dtype="float32"), + size=(h, w), + mode='nearest'), + dtype="int32") + all_scale_ref_frame_label.append(scale_ref_frame_label) + scale_ref_frame_labels.append(all_scale_ref_frame_label) + scale_previous_frame_label = paddle.cast(F.interpolate( + paddle.cast(previous_frame_mask, dtype="float32"), + size=(h, w), + mode='nearest'), + dtype="int32") + scale_previous_frame_labels.append(scale_previous_frame_label) + for n in range(bs): + ref_obj_ids = paddle.reshape( + paddle.cast(paddle.arange(0, + np.array(gt_ids)[n] + 1), + dtype="int32"), [-1, 1, 1, 1]) + obj_num = ref_obj_ids.shape[0] + low_level_feat = paddle.unsqueeze(current_low_level[n], axis=0) + all_CE_input = [] + all_attention_head = [] + for scale_idx, current_frame_embedding, ref_frame_embedding, previous_frame_embedding, \ + scale_ref_frame_label, scale_previous_frame_label in zip(range(3), \ + current_frame_embeddings, ref_frame_embeddings, previous_frame_embeddings, \ + scale_ref_frame_labels, scale_previous_frame_labels): + #Prepare + seq_current_frame_embedding = current_frame_embedding[n] + seq_prev_frame_embedding = previous_frame_embedding[n] + seq_previous_frame_label = paddle.cast( + (paddle.cast(scale_previous_frame_label[n], dtype="int32") + == ref_obj_ids), + dtype="float32") + if np.array(gt_ids)[n] > 0: + dis_bias 
= paddle.concat([ + paddle.unsqueeze(self.bg_bias[scale_idx], axis=0), + paddle.expand( + paddle.unsqueeze(self.fg_bias[scale_idx], axis=0), + [np.array(gt_ids)[n], -1, -1, -1]) + ], + axis=0) + else: + dis_bias = paddle.unsqueeze(self.bg_bias[scale_idx], axis=0) + #Global FG map + matching_dim = MODEL_SEMANTIC_MATCHING_DIM[scale_idx] + seq_current_frame_embedding_for_matching = paddle.transpose( + seq_current_frame_embedding[:matching_dim], [1, 2, 0]) + + if not self.test_mode: + raise NotImplementedError + else: + all_scale_ref_frame_label = scale_ref_frame_label + all_ref_frame_embedding = ref_frame_embedding + all_reference_embeddings = [] + all_reference_labels = [] + seq_ref_frame_labels = [] + count = 0 + for idx in range(len(all_scale_ref_frame_label)): + + ref_frame_embedding = all_ref_frame_embedding[idx] + scale_ref_frame_label = all_scale_ref_frame_label[idx] + + seq_ref_frame_embedding = ref_frame_embedding[n] + seq_ref_frame_embedding = paddle.transpose( + seq_ref_frame_embedding, [1, 2, 0]) + seq_ref_frame_label = paddle.cast( + (paddle.cast(scale_ref_frame_label[n], + dtype="int32") == ref_obj_ids), + dtype="float32") + seq_ref_frame_labels.append(seq_ref_frame_label) + seq_ref_frame_label = paddle.transpose( + paddle.squeeze(seq_ref_frame_label, axis=1), + [1, 2, 0]) + all_reference_embeddings.append( + seq_ref_frame_embedding[:, :, :matching_dim]) + all_reference_labels.append(seq_ref_frame_label) + global_matching_fg = global_matching_for_eval( + all_reference_embeddings=all_reference_embeddings, + query_embeddings= + seq_current_frame_embedding_for_matching, + all_reference_labels=all_reference_labels, + n_chunks=TEST_GLOBAL_MATCHING_CHUNK[scale_idx], + dis_bias=dis_bias, + atrous_rate=TEST_GLOBAL_ATROUS_RATE[scale_idx], + use_float16=MODEL_FLOAT16_MATCHING, + atrous_obj_pixel_num=TEST_GLOBAL_MATCHING_MIN_PIXEL) + + # Local FG map + seq_prev_frame_embedding_for_matching = paddle.transpose( + seq_prev_frame_embedding[:matching_dim], [1, 2, 0]) + seq_previous_frame_label_for_matching = paddle.transpose( + paddle.squeeze(seq_previous_frame_label, axis=1), [1, 2, 0]) + local_matching_fg = local_matching( + prev_frame_embedding=seq_prev_frame_embedding_for_matching, + query_embedding=seq_current_frame_embedding_for_matching, + prev_frame_labels=seq_previous_frame_label_for_matching, + multi_local_distance=MODEL_MULTI_LOCAL_DISTANCE[scale_idx], + dis_bias=dis_bias, + atrous_rate=TRAIN_LOCAL_ATROUS_RATE[scale_idx] if + not self.test_mode else TEST_LOCAL_ATROUS_RATE[scale_idx], + use_float16=MODEL_FLOAT16_MATCHING, + allow_downsample=False, + allow_parallel=TRAIN_LOCAL_PARALLEL + if not self.test_mode else TEST_LOCAL_PARALLEL) + + #Aggregate Pixel-level Matching + to_cat_global_matching_fg = paddle.transpose( + paddle.squeeze(global_matching_fg, axis=0), [2, 3, 0, 1]) + to_cat_local_matching_fg = paddle.transpose( + paddle.squeeze(local_matching_fg, axis=0), [2, 3, 0, 1]) + all_to_cat = [ + to_cat_global_matching_fg, to_cat_local_matching_fg, + seq_previous_frame_label + ] + + #Global and Local BG map + if MODEL_MATCHING_BACKGROUND: + to_cat_global_matching_bg = foreground2background( + to_cat_global_matching_fg, + np.array(gt_ids)[n] + 1) + reshaped_prev_nn_feature_n = paddle.unsqueeze( + paddle.transpose(to_cat_local_matching_fg, + [0, 2, 3, 1]), + axis=1) + to_cat_local_matching_bg = foreground2background( + reshaped_prev_nn_feature_n, + np.array(gt_ids)[n] + 1) + to_cat_local_matching_bg = paddle.squeeze(paddle.transpose( + to_cat_local_matching_bg, [0, 4, 2, 3, 1]), + 
axis=-1) + all_to_cat += [ + to_cat_local_matching_bg, to_cat_global_matching_bg + ] + + to_cat_current_frame_embedding = paddle.expand( + paddle.unsqueeze(current_frame_embedding[n], axis=0), + [obj_num, -1, -1, -1]) + to_cat_prev_frame_embedding = paddle.expand( + paddle.unsqueeze(previous_frame_embedding[n], axis=0), + [obj_num, -1, -1, -1]) + to_cat_prev_frame_embedding_fg = to_cat_prev_frame_embedding * seq_previous_frame_label + to_cat_prev_frame_embedding_bg = to_cat_prev_frame_embedding * ( + 1 - seq_previous_frame_label) + all_to_cat += [ + to_cat_current_frame_embedding, + to_cat_prev_frame_embedding_fg, + to_cat_prev_frame_embedding_bg + ] + + CE_input = paddle.concat(all_to_cat, axis=1) + #Instance-level Attention + if not self.test_mode: + raise NotImplementedError + else: + attention_head = calculate_attention_head_for_eval( + all_ref_frame_embedding, + seq_ref_frame_labels, + paddle.expand( + paddle.unsqueeze(previous_frame_embedding[n], + axis=0), [obj_num, -1, -1, -1]), + seq_previous_frame_label, + epsilon=self.epsilon) + + all_CE_input.append(CE_input) + all_attention_head.append(attention_head) + + #Collaborative Ensembler + pred = self.head(all_CE_input, all_attention_head, low_level_feat) + dic_tmp.append(pred) + + return dic_tmp, boards diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/segment/utils.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segment/utils.py new file mode 100644 index 0000000..1ec3be4 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segment/utils.py @@ -0,0 +1,754 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +def foreground2background(dis, obj_num): + if obj_num == 1: + return dis + bg_dis = [] + for i in range(obj_num): + obj_back = [] + for j in range(obj_num): + if i == j: + continue + obj_back.append(paddle.unsqueeze(dis[j], axis=0)) + obj_back = paddle.concat(x=obj_back, axis=1) + obj_back = paddle.min(x=obj_back, axis=1, keepdim=True) + bg_dis.append(obj_back) + bg_dis = paddle.concat(x=bg_dis, axis=0) + return bg_dis + + +WRONG_LABEL_PADDING_DISTANCE = 5e4 + + +#GLOBAL_DIST_MAP +def _pairwise_distances(x, x2, y, y2): + """ + Computes pairwise squared l2 distances between tensors x and y. + Args: + x: [n, feature_dim]. + y: [m, feature_dim]. + Returns: + d: [n, m]. + """ + xs = x2 + ys = y2 + + xs = paddle.unsqueeze(xs, axis=1) + ys = paddle.unsqueeze(ys, axis=0) + d = xs + ys - 2. * paddle.matmul(x, y, transpose_y=True) + return d + + +def _flattened_pairwise_distances(reference_embeddings, ref_square, + query_embeddings, query_square): + """ + Calculates flattened tensor of pairwise distances between ref and query. 
+ Args: + reference_embeddings: [..., embedding_dim], + the embedding vectors for the reference frame + query_embeddings: [..., embedding_dim], + the embedding vectors for the query frames. + Returns: + dists: [reference_embeddings.size / embedding_dim, query_embeddings.size / embedding_dim] + """ + dists = _pairwise_distances(query_embeddings, query_square, + reference_embeddings, ref_square) + return dists + + +def _nn_features_per_object_for_chunk(reference_embeddings, ref_square, + query_embeddings, query_square, + wrong_label_mask): + """Extracts features for each object using nearest neighbor attention. + Args: + reference_embeddings: [n_chunk, embedding_dim], + the embedding vectors for the reference frame. + query_embeddings: [m_chunk, embedding_dim], + the embedding vectors for the query frames. + wrong_label_mask: [n_objects, n_chunk], + the mask for pixels not used for matching. + Returns: + nn_features: A float32 tensor of nearest neighbor features of shape + [m_chunk, n_objects, n_chunk]. + """ + if reference_embeddings.dtype == "float16": + wrong_label_mask = paddle.cast(wrong_label_mask, dtype="float16") + else: + wrong_label_mask = paddle.cast(wrong_label_mask, dtype="float32") + + reference_embeddings_key = reference_embeddings + query_embeddings_key = query_embeddings + dists = _flattened_pairwise_distances(reference_embeddings_key, ref_square, + query_embeddings_key, query_square) + dists = (paddle.unsqueeze(dists, axis=1) + + paddle.unsqueeze(wrong_label_mask, axis=0) * + WRONG_LABEL_PADDING_DISTANCE) + features = paddle.min(dists, axis=2, keepdim=True) + return features + + +def _nearest_neighbor_features_per_object_in_chunks(reference_embeddings_flat, + query_embeddings_flat, + reference_labels_flat, + n_chunks): + """Calculates the nearest neighbor features per object in chunks to save mem. + Uses chunking to bound the memory use. + Args: + reference_embeddings_flat: [n, embedding_dim], + the embedding vectors for the reference frame. + query_embeddings_flat: [m, embedding_dim], + the embedding vectors for the query frames. + reference_labels_flat: [n, n_objects], + the class labels of the reference frame. + n_chunks: Integer, the number of chunks to use to save memory + (set to 1 for no chunking). + Returns: + nn_features: [m, n_objects, n]. 
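+        Illustration (numbers assumed): with 10000 query pixels and
+        n_chunks=4, chunk_size = ceil(10000 / 4) = 2500, so the [m, n]
+        distance matrix is materialised 2500 query rows at a time rather
+        than all at once, bounding peak memory during matching.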
+ """ + + feature_dim, embedding_dim = query_embeddings_flat.shape + chunk_size = int(np.ceil(float(feature_dim) / n_chunks)) + wrong_label_mask = reference_labels_flat < 0.1 + + wrong_label_mask = paddle.transpose(x=wrong_label_mask, perm=[1, 0]) + ref_square = paddle.sum(paddle.pow(reference_embeddings_flat, 2), axis=1) + query_square = paddle.sum(paddle.pow(query_embeddings_flat, 2), axis=1) + + all_features = [] + for n in range(n_chunks): + if n_chunks == 1: + query_embeddings_flat_chunk = query_embeddings_flat + query_square_chunk = query_square + chunk_start = 0 + else: + chunk_start = n * chunk_size + chunk_end = (n + 1) * chunk_size + query_square_chunk = query_square[chunk_start:chunk_end] + if query_square_chunk.shape[0] == 0: + continue + query_embeddings_flat_chunk = query_embeddings_flat[ + chunk_start:chunk_end] + features = _nn_features_per_object_for_chunk( + reference_embeddings_flat, ref_square, query_embeddings_flat_chunk, + query_square_chunk, wrong_label_mask) + all_features.append(features) + if n_chunks == 1: + nn_features = all_features[0] + else: + nn_features = paddle.concat(all_features, axis=0) + + return nn_features + + +def global_matching(reference_embeddings, + query_embeddings, + reference_labels, + n_chunks=100, + dis_bias=0., + ori_size=None, + atrous_rate=1, + use_float16=True, + atrous_obj_pixel_num=0): + """ + Calculates the distance to the nearest neighbor per object. + For every pixel of query_embeddings calculate the distance to the + nearest neighbor in the (possibly subsampled) reference_embeddings per object. + Args: + reference_embeddings: [height, width, embedding_dim], + the embedding vectors for the reference frame. + query_embeddings: [height, width, + embedding_dim], the embedding vectors for the query frames. + reference_labels: [height, width, obj_nums], + the class labels of the reference frame. + n_chunks: Integer, the number of chunks to use to save memory + (set to 1 for no chunking). + dis_bias: [n_objects], foreground and background bias + ori_size: (ori_height, ori_width), + the original spatial size. If "None", (ori_height, ori_width) = (height, width). + atrous_rate: Integer, the atrous rate of reference_embeddings. + use_float16: Bool, if "True", use float16 type for matching. + Returns: + nn_features: [1, ori_height, ori_width, n_objects, feature_dim]. + """ + + assert (reference_embeddings.shape[:2] == reference_labels.shape[:2]) + if use_float16: + query_embeddings = paddle.cast(query_embeddings, dtype="float16") + reference_embeddings = paddle.cast(reference_embeddings, + dtype="float16") + h, w, embedding_dim = query_embeddings.shape + obj_nums = reference_labels.shape[2] + + if atrous_rate > 1: + h_pad = (atrous_rate - h % atrous_rate) % atrous_rate + w_pad = (atrous_rate - w % atrous_rate) % atrous_rate + selected_points = paddle.zeros([h + h_pad, w + w_pad]) + selected_points = selected_points.view( + (h + h_pad) // atrous_rate, atrous_rate, (w + w_pad) // atrous_rate, + atrous_rate) + selected_points[:, 0, :, 0] = 1. 
+ selected_points = paddle.reshape(selected_points, + [h + h_pad, w + w_pad, 1])[:h, :w] + is_big_obj = (paddle.sum( + reference_labels, + axis=(0, 1))) > (atrous_obj_pixel_num * atrous_rate**2) + reference_labels[:, :, + is_big_obj] = reference_labels[:, :, + is_big_obj] * selected_points + + reference_embeddings_flat = paddle.reshape(reference_embeddings, + [-1, embedding_dim]) + reference_labels_flat = paddle.reshape(reference_labels, [-1, obj_nums]) + query_embeddings_flat = paddle.reshape(query_embeddings, + [-1, embedding_dim]) + + all_ref_fg = paddle.sum(reference_labels_flat, axis=1, keepdim=True) > 0.9 + reference_labels_flat = paddle.reshape( + paddle.masked_select(reference_labels_flat, + paddle.expand(all_ref_fg, [-1, obj_nums])), + [-1, obj_nums]) + if reference_labels_flat.shape[0] == 0: + return paddle.ones([1, h, w, obj_nums, 1]) + reference_embeddings_flat = paddle.reshape( + paddle.masked_select(reference_embeddings_flat, + paddle.expand(all_ref_fg, [-1, embedding_dim])), + [-1, embedding_dim]) + + nn_features = _nearest_neighbor_features_per_object_in_chunks( + reference_embeddings_flat, query_embeddings_flat, reference_labels_flat, + n_chunks) + + nn_features_reshape = paddle.reshape(nn_features, [1, h, w, obj_nums, 1]) + nn_features_reshape = ( + F.sigmoid(nn_features_reshape + + paddle.reshape(dis_bias, [1, 1, 1, -1, 1])) - 0.5) * 2 + + #TODO: ori_size is not None + + if use_float16: + nn_features_reshape = paddle.cast(nn_features_reshape, dtype="float32") + return nn_features_reshape + + +def global_matching_for_eval(all_reference_embeddings, + query_embeddings, + all_reference_labels, + n_chunks=20, + dis_bias=0., + ori_size=None, + atrous_rate=1, + use_float16=True, + atrous_obj_pixel_num=0): + """ + Calculates the distance to the nearest neighbor per object. + For every pixel of query_embeddings calculate the distance to the + nearest neighbor in the (possibly subsampled) reference_embeddings per object. + Args: + all_reference_embeddings: A list of reference_embeddings, + each with size [height, width, embedding_dim], + the embedding vectors for the reference frame. + query_embeddings: [n_query_images, height, width, + embedding_dim], the embedding vectors for the query frames. + all_reference_labels: A list of reference_labels, + each with size [height, width, obj_nums], + the class labels of the reference frame. + n_chunks: Integer, the number of chunks to use to save memory + (set to 1 for no chunking). + dis_bias: [n_objects], foreground and background bias + ori_size: (ori_height, ori_width), + the original spatial size. If "None", (ori_height, ori_width) = (height, width). + atrous_rate: Integer, the atrous rate of reference_embeddings. + use_float16: Bool, if "True", use float16 type for matching. + Returns: + nn_features: [n_query_images, ori_height, ori_width, n_objects, feature_dim]. + """ + + h, w, embedding_dim = query_embeddings.shape + obj_nums = all_reference_labels[0].shape[2] + all_reference_embeddings_flat = [] + all_reference_labels_flat = [] + ref_num = len(all_reference_labels) + n_chunks *= ref_num + if atrous_obj_pixel_num > 0: + if atrous_rate > 1: + h_pad = (atrous_rate - h % atrous_rate) % atrous_rate + w_pad = (atrous_rate - w % atrous_rate) % atrous_rate + selected_points = paddle.zeros([h + h_pad, w + w_pad]) + selected_points = paddle.reshape( + selected_points, [(h + h_pad) // atrous_rate, atrous_rate, + (w + w_pad) // atrous_rate, atrous_rate]) + selected_points[:, 0, :, 0] = 1. 
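+            # In eval mode every reference frame gets the same stride-based
+            # subsampling; the per-frame embeddings and labels are then
+            # flattened and concatenated below, so matching runs once over the
+            # union of reference pixels (n_chunks was already scaled by
+            # ref_num above to keep chunk sizes comparable).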
+ selected_points = paddle.reshape(selected_points, + [h + h_pad, w + w_pad, 1])[:h, :w] + + for reference_embeddings, reference_labels, idx in zip( + all_reference_embeddings, all_reference_labels, range(ref_num)): + if atrous_rate > 1: + is_big_obj = paddle.sum( + reference_labels, + axis=(0, 1)) > (atrous_obj_pixel_num * atrous_rate**2) + is_big_obj = list(np.array(is_big_obj)) + for j in range(len(is_big_obj)): + if is_big_obj[j] == True: + reference_labels[:, :, j:j + + 1] = reference_labels[:, :, j:j + + 1] * selected_points + + reference_embeddings_flat = paddle.reshape(reference_embeddings, + [-1, embedding_dim]) + reference_labels_flat = paddle.reshape(reference_labels, + [-1, obj_nums]) + + all_reference_embeddings_flat.append(reference_embeddings_flat) + all_reference_labels_flat.append(reference_labels_flat) + + reference_embeddings_flat = paddle.concat( + x=all_reference_embeddings_flat, axis=0) + reference_labels_flat = paddle.concat(x=all_reference_labels_flat, + axis=0) + else: + if ref_num == 1: + reference_embeddings, reference_labels = all_reference_embeddings[ + 0], all_reference_labels[0] + if atrous_rate > 1: + h_pad = (atrous_rate - h % atrous_rate) % atrous_rate + w_pad = (atrous_rate - w % atrous_rate) % atrous_rate + if h_pad > 0 or w_pad > 0: + reference_embeddings = F.pad(reference_embeddings, + [0, h_pad, 0, w_pad, 0, 0]) + reference_labels = F.pad(reference_labels, + [0, h_pad, 0, w_pad, 0, 0]) + reference_embeddings = paddle.reshape( + reference_embeddings, + [(h + h_pad) // atrous_rate, atrous_rate, + (w + w_pad) // atrous_rate, atrous_rate, 32]) + reference_labels = paddle.reshape( + reference_labels, + [(h + h_pad) // atrous_rate, atrous_rate, + (w + w_pad) // atrous_rate, atrous_rate, -1]) + reference_embeddings = paddle.reshape( + reference_embeddings[:, 0, :, 0, :], + reference_embeddings[:, 0, :, 0, :].shape) + reference_labels = paddle.reshape( + reference_labels[:, 0, :, 0, :], + reference_labels[:, 0, :, 0, :].shape) + reference_embeddings_flat = paddle.reshape(reference_embeddings, + [-1, embedding_dim]) + reference_labels_flat = paddle.reshape(reference_labels, + [-1, obj_nums]) + else: + for reference_embeddings, reference_labels, idx in zip( + all_reference_embeddings, all_reference_labels, + range(ref_num)): + if atrous_rate > 1: + h_pad = (atrous_rate - h % atrous_rate) % atrous_rate + w_pad = (atrous_rate - w % atrous_rate) % atrous_rate + if h_pad > 0 or w_pad > 0: + reference_embeddings = F.pad(reference_embeddings, + [0, h_pad, 0, w_pad, 0, 0]) + reference_labels = F.pad(reference_labels, + [0, h_pad, 0, w_pad, 0, 0]) + + reference_embeddings = paddle.reshape( + reference_embeddings, + [(h + h_pad) // atrous_rate, atrous_rate, + (w + w_pad) // atrous_rate, atrous_rate, -1]) + reference_labels = paddle.reshape( + reference_labels, + [(h + h_pad) // atrous_rate, atrous_rate, + (w + w_pad) // atrous_rate, atrous_rate, -1]) + reference_embeddings = paddle.reshape( + reference_embeddings[:, 0, :, 0, :], + reference_embeddings[:, 0, :, 0, :].shape) + reference_labels = paddle.reshape( + reference_labels[:, 0, :, 0, :], + reference_labels[:, 0, :, 0, :].shape) + + reference_embeddings_flat = paddle.reshape( + reference_embeddings, [-1, embedding_dim]) + reference_labels_flat = paddle.reshape(reference_labels, + [-1, obj_nums]) + + all_reference_embeddings_flat.append(reference_embeddings_flat) + all_reference_labels_flat.append(reference_labels_flat) + + reference_embeddings_flat = paddle.concat( + all_reference_embeddings_flat, axis=0) + 
reference_labels_flat = paddle.concat(all_reference_labels_flat, + axis=0) + + query_embeddings_flat = paddle.reshape(query_embeddings, + [-1, embedding_dim]) + + all_ref_fg = paddle.sum(reference_labels_flat, axis=1, keepdim=True) > 0.9 + reference_labels_flat = paddle.reshape( + paddle.masked_select(reference_labels_flat, + paddle.expand(all_ref_fg, [-1, obj_nums])), + [-1, obj_nums]) + if reference_labels_flat.shape[0] == 0: + return paddle.ones([1, h, w, obj_nums, 1]) + reference_embeddings_flat = paddle.reshape( + paddle.masked_select(reference_embeddings_flat, + paddle.expand(all_ref_fg, [-1, embedding_dim])), + [-1, embedding_dim]) + if use_float16: + query_embeddings_flat = paddle.cast(query_embeddings_flat, + dtype="float16") + reference_embeddings_flat = paddle.cast(reference_embeddings_flat, + dtype="float16") + nn_features = _nearest_neighbor_features_per_object_in_chunks( + reference_embeddings_flat, query_embeddings_flat, reference_labels_flat, + n_chunks) + + nn_features_reshape = paddle.reshape(nn_features, [1, h, w, obj_nums, 1]) + nn_features_reshape = ( + F.sigmoid(nn_features_reshape + + paddle.reshape(dis_bias, [1, 1, 1, -1, 1])) - 0.5) * 2 + + # TODO: ori_size is not None + + if use_float16: + nn_features_reshape = paddle.cast(nn_features_reshape, dtype="float32") + return nn_features_reshape + + +#LOCAL_DIST_MAP +def local_pairwise_distances(x, + y, + max_distance=9, + atrous_rate=1, + allow_downsample=False): + """Computes pairwise squared l2 distances using a local search window. + Use for-loop for saving memory. + Args: + x: Float32 tensor of shape [height, width, feature_dim]. + y: Float32 tensor of shape [height, width, feature_dim]. + max_distance: Integer, the maximum distance in pixel coordinates + per dimension which is considered to be in the search window. + atrous_rate: Integer, the atrous rate of local matching. + allow_downsample: Bool, if "True", downsample x and y + with a stride of 2. + Returns: + Float32 distances tensor of shape [height, width, (2 * max_distance + 1) ** 2]. + """ + if allow_downsample: + ori_height = x.shape[0] + ori_width = x.shape[1] + x = paddle.unsqueeze(paddle.transpose(x, [2, 0, 1]), axis=0) + y = paddle.unsqueeze(paddle.transpose(y, [2, 0, 1]), axis=0) + down_size = (int(ori_height / 2) + 1, int(ori_width / 2) + 1) + x = F.interpolate(x, + size=down_size, + mode='bilinear', + align_corners=True) + y = F.interpolate(y, + size=down_size, + mode='bilinear', + align_corners=True) + x = paddle.unsqueeze(paddle.transpose(x, [1, 2, 0]), axis=0) + y = paddle.unsqueeze(paddle.transpose(y, [1, 2, 0]), axis=0) + + pad_max_distance = max_distance - max_distance % atrous_rate + # no change pad + padded_y = F.pad(y, (0, 0, pad_max_distance, pad_max_distance, + pad_max_distance, pad_max_distance), + value=WRONG_LABEL_PADDING_DISTANCE) + + height, width, _ = x.shape + dists = [] + for y in range(2 * pad_max_distance // atrous_rate + 1): + y_start = y * atrous_rate + y_end = y_start + height + y_slice = padded_y[y_start:y_end] + for x in range(2 * max_distance + 1): + x_start = x * atrous_rate + x_end = x_start + width + offset_y = y_slice[:, x_start:x_end] + dist = paddle.sum(paddle.pow((x - offset_y), 2), axis=2) + dists.append(dist) + dists = paddle.stack(dists, axis=2) + + return dists + + +def local_pairwise_distances_parallel(x, + y, + max_distance=9, + atrous_rate=1, + allow_downsample=True): + """Computes pairwise squared l2 distances using a local search window. + Args: + x: Float32 tensor of shape [height, width, feature_dim]. 
+ y: Float32 tensor of shape [height, width, feature_dim]. + max_distance: Integer, the maximum distance in pixel coordinates + per dimension which is considered to be in the search window. + atrous_rate: Integer, the atrous rate of local matching. + allow_downsample: Bool, if "True", downsample x and y + with a stride of 2. + Returns: + Float32 distances tensor of shape [height, width, (2 * max_distance + 1) ** 2]. + """ + + ori_height, ori_width, _ = x.shape + x = paddle.unsqueeze(paddle.transpose(x, [2, 0, 1]), axis=0) + y = paddle.unsqueeze(paddle.transpose(y, [2, 0, 1]), axis=0) + if allow_downsample: + down_size = (int(ori_height / 2) + 1, int(ori_width / 2) + 1) + x = F.interpolate(x, + size=down_size, + mode='bilinear', + align_corners=True) + y = F.interpolate(y, + size=down_size, + mode='bilinear', + align_corners=True) + + _, channels, height, width = x.shape + + x2 = paddle.reshape(paddle.sum(paddle.pow(x, 2), axis=1), + [height, width, 1]) + y2 = paddle.reshape(paddle.sum(paddle.pow(y, 2), axis=1), + [1, 1, height, width]) + + pad_max_distance = max_distance - max_distance % atrous_rate + # no change pad + padded_y = F.pad(y, (pad_max_distance, pad_max_distance, pad_max_distance, + pad_max_distance)) + padded_y2 = F.pad(y2, (pad_max_distance, pad_max_distance, pad_max_distance, + pad_max_distance), + value=WRONG_LABEL_PADDING_DISTANCE) + + offset_y = paddle.transpose( + paddle.reshape( + F.unfold(x=padded_y, + kernel_sizes=[height, width], + strides=[atrous_rate, atrous_rate]), + [channels, height * width, -1]), [1, 0, 2]) + offset_y2 = paddle.reshape( + F.unfold(padded_y2, + kernel_sizes=[height, width], + strides=[atrous_rate, atrous_rate]), [height, width, -1]) + x = paddle.transpose(paddle.reshape(x, [channels, height * width, -1]), + [1, 2, 0]) + + dists = x2 + offset_y2 - 2. * paddle.reshape(paddle.matmul(x, offset_y), + [height, width, -1]) + + return dists + + +def local_matching(prev_frame_embedding, + query_embedding, + prev_frame_labels, + dis_bias=0., + multi_local_distance=[15], + ori_size=None, + atrous_rate=1, + use_float16=True, + allow_downsample=True, + allow_parallel=True): + """Computes nearest neighbor features while only allowing local matches. + Args: + prev_frame_embedding: [height, width, embedding_dim], + the embedding vectors for the last frame. + query_embedding: [height, width, embedding_dim], + the embedding vectors for the query frames. + prev_frame_labels: [height, width, n_objects], + the class labels of the previous frame. + multi_local_distance: A list of Integer, + a list of maximum distance allowed for local matching. + ori_size: (ori_height, ori_width), + the original spatial size. If "None", (ori_height, ori_width) = (height, width). + atrous_rate: Integer, the atrous rate of local matching. + use_float16: Bool, if "True", use float16 type for matching. + allow_downsample: Bool, if "True", downsample prev_frame_embedding and query_embedding + with a stride of 2. + allow_parallel: Bool, if "True", do matching in a parallel way. If "False", do matching in + a for-loop way, which will save GPU memory. + Returns: + nn_features: A float32 np.array of nearest neighbor features of shape + [1, height, width, n_objects, 1]. 
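+        With the defaults assumed here (multi_local_distance=[15],
+        atrous_rate=1), max_distance=15 and every query pixel is compared
+        against a (2 * 15 + 1) ** 2 = 961 position window around its
+        previous-frame location.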
+ """ + max_distance = multi_local_distance[-1] + + if ori_size is None: + height, width = prev_frame_embedding.shape[:2] + ori_size = (height, width) + + obj_num = prev_frame_labels.shape[2] + pad = paddle.ones([1]) * WRONG_LABEL_PADDING_DISTANCE + if use_float16: + query_embedding = paddle.cast(query_embedding, dtype="float16") + prev_frame_embedding = paddle.cast(prev_frame_embedding, + dtype="float16") + pad = paddle.cast(pad, dtype="float16") + + if allow_parallel: + d = local_pairwise_distances_parallel(query_embedding, + prev_frame_embedding, + max_distance=max_distance, + atrous_rate=atrous_rate, + allow_downsample=allow_downsample) + else: + d = local_pairwise_distances(query_embedding, + prev_frame_embedding, + max_distance=max_distance, + atrous_rate=atrous_rate, + allow_downsample=allow_downsample) + + height, width = d.shape[:2] + + labels = paddle.unsqueeze(paddle.transpose(prev_frame_labels, [2, 0, 1]), 1) + labels = paddle.unsqueeze(paddle.transpose(prev_frame_labels, [2, 0, 1]), + axis=1) + if (height, width) != ori_size: + labels = F.interpolate(labels, size=(height, width), mode='nearest') + + pad_max_distance = max_distance - max_distance % atrous_rate + atrous_max_distance = pad_max_distance // atrous_rate + #no change pad + padded_labels = F.pad(labels, ( + pad_max_distance, + pad_max_distance, + pad_max_distance, + pad_max_distance, + ), + mode='constant', + value=0) + + offset_masks = paddle.transpose( + paddle.reshape( + F.unfold(padded_labels, + kernel_sizes=[height, width], + strides=[atrous_rate, atrous_rate]), + [obj_num, height, width, -1]), [1, 2, 3, 0]) > 0.9 + + d_tiled = paddle.expand(paddle.unsqueeze( + d, axis=-1), [-1, -1, -1, obj_num]) # h, w, num_local_pos, obj_num + + d_masked = paddle.where(offset_masks, d_tiled, pad) + dists = paddle.min(d_masked, axis=2) + multi_dists = [ + paddle.unsqueeze(paddle.transpose(dists, [2, 0, 1]), axis=1) + ] # n_objects, num_multi_local, h, w + + reshaped_d_masked = paddle.reshape(d_masked, [ + height, width, 2 * atrous_max_distance + 1, 2 * atrous_max_distance + 1, + obj_num + ]) + for local_dis in multi_local_distance[:-1]: + local_dis = local_dis // atrous_rate + start_idx = atrous_max_distance - local_dis + end_idx = atrous_max_distance + local_dis + 1 + new_d_masked = paddle.reshape( + reshaped_d_masked[:, :, start_idx:end_idx, start_idx:end_idx, :], + reshaped_d_masked[:, :, start_idx:end_idx, + start_idx:end_idx, :].shape) + new_d_masked = paddle.reshape(new_d_masked, + [height, width, -1, obj_num]) + new_dists = paddle.min(new_d_masked, axis=2) + new_dists = paddle.unsqueeze(paddle.transpose(new_dists, [2, 0, 1]), + axis=1) + multi_dists.append(new_dists) + + multi_dists = paddle.concat(multi_dists, axis=1) + multi_dists = (F.sigmoid(multi_dists + + paddle.reshape(dis_bias, [-1, 1, 1, 1])) - 0.5) * 2 + + if use_float16: + multi_dists = paddle.cast(multi_dists, dtype="float32") + + if (height, width) != ori_size: + multi_dists = F.interpolate(multi_dists, + size=ori_size, + mode='bilinear', + align_corners=True) + multi_dists = paddle.transpose(multi_dists, perm=[2, 3, 0, 1]) + multi_dists = paddle.reshape(multi_dists, + [1, ori_size[0], ori_size[1], obj_num, -1]) + + return multi_dists + + +def calculate_attention_head(ref_embedding, + ref_label, + prev_embedding, + prev_label, + epsilon=1e-5): + + ref_head = ref_embedding * ref_label + ref_head_pos = paddle.sum(ref_head, axis=(2, 3)) + ref_head_neg = paddle.sum(ref_embedding, axis=(2, 3)) - ref_head_pos + ref_pos_num = paddle.sum(ref_label, axis=(2, 3)) + 
ref_neg_num = paddle.sum(1. - ref_label, axis=(2, 3)) + ref_head_pos = ref_head_pos / (ref_pos_num + epsilon) + ref_head_neg = ref_head_neg / (ref_neg_num + epsilon) + + prev_head = prev_embedding * prev_label + prev_head_pos = paddle.sum(prev_head, axis=(2, 3)) + prev_head_neg = paddle.sum(prev_embedding, axis=(2, 3)) - prev_head_pos + prev_pos_num = paddle.sum(prev_label, axis=(2, 3)) + prev_neg_num = paddle.sum(1. - prev_label, axis=(2, 3)) + prev_head_pos = prev_head_pos / (prev_pos_num + epsilon) + prev_head_neg = prev_head_neg / (prev_neg_num + epsilon) + + total_head = paddle.concat( + x=[ref_head_pos, ref_head_neg, prev_head_pos, prev_head_neg], axis=1) + + return total_head + + +def calculate_attention_head_for_eval(ref_embeddings, + ref_labels, + prev_embedding, + prev_label, + epsilon=1e-5): + total_ref_head_pos = 0. + total_ref_head_neg = 0. + total_ref_pos_num = 0. + total_ref_neg_num = 0. + + for idx in range(len(ref_embeddings)): + ref_embedding = ref_embeddings[idx] + ref_label = ref_labels[idx] + ref_head = ref_embedding * ref_label + ref_head_pos = paddle.sum(ref_head, axis=(2, 3)) + ref_head_neg = paddle.sum(ref_embedding, axis=(2, 3)) - ref_head_pos + ref_pos_num = paddle.sum(ref_label, axis=(2, 3)) + ref_neg_num = paddle.sum(1. - ref_label, axis=(2, 3)) + total_ref_head_pos = total_ref_head_pos + ref_head_pos + total_ref_head_neg = total_ref_head_neg + ref_head_neg + total_ref_pos_num = total_ref_pos_num + ref_pos_num + total_ref_neg_num = total_ref_neg_num + ref_neg_num + ref_head_pos = total_ref_head_pos / (total_ref_pos_num + epsilon) + ref_head_neg = total_ref_head_neg / (total_ref_neg_num + epsilon) + + prev_head = prev_embedding * prev_label + prev_head_pos = paddle.sum(prev_head, axis=(2, 3)) + prev_head_neg = paddle.sum(prev_embedding, axis=(2, 3)) - prev_head_pos + prev_pos_num = paddle.sum(prev_label, axis=(2, 3)) + prev_neg_num = paddle.sum(1. - prev_label, axis=(2, 3)) + prev_head_pos = prev_head_pos / (prev_pos_num + epsilon) + prev_head_neg = prev_head_neg / (prev_neg_num + epsilon) + + total_head = paddle.concat( + x=[ref_head_pos, ref_head_neg, prev_head_pos, prev_head_neg], axis=1) + return total_head diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/segmenters/__init__.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segmenters/__init__.py new file mode 100644 index 0000000..de4bf57 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segmenters/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ +from .base import BaseSegmenter +from .ms_tcn import MSTCN +from .asrf import ASRF + +__all__ = ['BaseSegmenter', 'MSTCN', 'ASRF'] diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/segmenters/__pycache__/__init__.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segmenters/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..c427464 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segmenters/__pycache__/__init__.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/segmenters/__pycache__/asrf.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segmenters/__pycache__/asrf.cpython-310.pyc new file mode 100644 index 0000000..a035c2c Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segmenters/__pycache__/asrf.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/segmenters/__pycache__/base.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segmenters/__pycache__/base.cpython-310.pyc new file mode 100644 index 0000000..abe08a4 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segmenters/__pycache__/base.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/segmenters/__pycache__/ms_tcn.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segmenters/__pycache__/ms_tcn.cpython-310.pyc new file mode 100644 index 0000000..8bcaf14 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segmenters/__pycache__/ms_tcn.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/segmenters/__pycache__/utils.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segmenters/__pycache__/utils.cpython-310.pyc new file mode 100644 index 0000000..0acc81f Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segmenters/__pycache__/utils.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/segmenters/asrf.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segmenters/asrf.py new file mode 100644 index 0000000..3d962c7 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segmenters/asrf.py @@ -0,0 +1,143 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
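+# ASRF pairs a frame-level classification branch with an action-boundary
+# branch; the raw outputs of both are refined by ASRFPostProcessing
+# (defined in segmenters/utils.py) before metrics are computed.
+# Minimal usage sketch (config dicts and data shapes assumed, not part of
+# this file):
+#   model = ASRF(postprocessing_method="refinement_with_boundary",
+#                boundary_threshold=0.7,
+#                backbone=backbone_cfg, head=head_cfg, loss=loss_cfg)
+#   metrics = model(data_batch, mode='train')  # -> {'loss', 'F1@0.50'}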
+ +from ...registry import SEGMENTERS +from .base import BaseSegmenter + +import paddle +import paddle.nn.functional as F +from .utils import ASRFPostProcessing + + +@SEGMENTERS.register() +class ASRF(BaseSegmenter): + """ASRF model framework.""" + + def __init__(self, + postprocessing_method, + boundary_threshold, + backbone=None, + head=None, + loss=None): + + super().__init__(backbone=backbone, head=head, loss=loss) + self.postprocessing_method = postprocessing_method + self.boundary_threshold = boundary_threshold + + def forward_net(self, video_feature): + """Define how the model is going to train, from input to output. + """ + if self.backbone is not None: + feature = self.backbone(video_feature) + else: + feature = video_feature + + if self.head is not None: + network_outputs = self.head(feature) + else: + network_outputs = None + + return network_outputs + + def train_step(self, data_batch): + """Training step. + """ + feature, label, boundary = data_batch + # call forward + outputs_cls, outputs_boundary = self.forward_net(feature) + + # transfer data + outputs_cls_np = outputs_cls[-1].numpy() + outputs_boundary_np = outputs_boundary[-1].numpy() + + # caculate loss + if self.loss is not None: + output_loss = self.loss(feature, outputs_cls, label, + outputs_boundary, boundary) + else: + output_loss = None + + # predict post process + predicted = ASRFPostProcessing(outputs_cls_np, outputs_boundary_np, + self.postprocessing_method) + predicted = paddle.squeeze(predicted) + + loss_metrics = dict() + loss_metrics['loss'] = output_loss + loss_metrics['F1@0.50'] = self.head.get_F1_score(predicted, label) + + return loss_metrics + + def val_step(self, data_batch): + """Validating setp. + """ + feature, label, boundary = data_batch + + # call forward + outputs_cls, outputs_boundary = self.forward_net(feature) + + # transfer data + outputs_cls_np = outputs_cls[-1].numpy() + outputs_boundary_np = outputs_boundary[-1].numpy() + + ## caculate loss + if self.loss is not None: + output_loss = self.loss(feature, outputs_cls, label, + outputs_boundary, boundary) + else: + output_loss = None + + # predict post process + predicted = ASRFPostProcessing(outputs_cls_np, outputs_boundary_np, + self.postprocessing_method) + predicted = paddle.squeeze(predicted) + + outputs_dict = dict() + outputs_dict['loss'] = output_loss + outputs_dict['F1@0.50'] = self.head.get_F1_score(predicted, label) + return outputs_dict + + def test_step(self, data_batch): + """Testing setp. + """ + feature, _, _ = data_batch + + outputs_dict = dict() + # call forward + outputs_cls, outputs_boundary = self.forward_net(feature) + # transfer data + outputs_cls_np = outputs_cls[-1].numpy() + outputs_boundary_np = outputs_boundary[-1].numpy() + + # predict post process + predicted = ASRFPostProcessing(outputs_cls_np, outputs_boundary_np, + self.postprocessing_method) + outputs_dict['predict'] = paddle.to_tensor(predicted[0, :]) + outputs_dict['output_np'] = F.sigmoid(outputs_cls[-1]) + return outputs_dict + + def infer_step(self, data_batch): + """Infering setp. 
+ """ + feature = data_batch[0] + + # call forward + outputs_cls, outputs_boundary = self.forward_net(feature) + # transfer data + outputs_cls_np = outputs_cls[-1] + outputs_boundary_np = outputs_boundary[-1] + + outputs = [ + outputs_cls_np, outputs_boundary_np, + F.sigmoid(outputs_cls[-1]) + ] + return outputs diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/segmenters/base.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segmenters/base.py new file mode 100644 index 0000000..e0856d9 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segmenters/base.py @@ -0,0 +1,100 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +from abc import abstractmethod +from ... import builder +import paddle.nn as nn + + +class BaseSegmenter(nn.Layer): + """Base class for segementers. + + All segementers should subclass it. + All subclass should overwrite: + + - Methods:``train_step``, supporting to forward when training. + - Methods:``valid_step``, supporting to forward when validating. + - Methods:``test_step``, supporting to forward when testing. + + Args: + backbone (dict): Backbone modules to extract feature. + head (dict): Classification head to process feature. + + """ + + def __init__(self, backbone=None, head=None, loss=None): + + super().__init__() + # build backbone + if backbone is not None: + self.backbone = builder.build_backbone(backbone) + if hasattr(self.backbone, 'init_weights'): + self.backbone.init_weights() + else: + self.backbone = None + # build head + if head is not None: + self.head_name = head.name + self.head = builder.build_head(head) + if hasattr(self.head, 'init_weights'): + self.head.init_weights() + else: + self.head = None + # build loss + if loss is not None: + self.loss_name = loss.name + self.loss = builder.build_loss(loss) + if hasattr(self.loss, 'init_weights'): + self.loss.init_weights() + else: + self.loss = None + + def forward(self, data_batch, mode='infer'): + """ + 1. Define how the model is going to run, from input to output. + 2. Console of train, valid, test or infer step + 3. Set mode='infer' is used for saving inference model, refer to tools/export_model.py + """ + if mode == 'train': + return self.train_step(data_batch) + elif mode == 'valid': + return self.val_step(data_batch) + elif mode == 'test': + return self.test_step(data_batch) + elif mode == 'infer': + return self.infer_step(data_batch) + else: + raise NotImplementedError + + @abstractmethod + def train_step(self, data_batch, **kwargs): + """Training step. + """ + raise NotImplementedError + + @abstractmethod + def val_step(self, data_batch, **kwargs): + """Validating step. + """ + raise NotImplementedError + + @abstractmethod + def test_step(self, data_batch, **kwargs): + """Test step. + """ + raise NotImplementedError + + @abstractmethod + def infer_step(self, data_batch, **kwargs): + """Infer step. 
+ """ + raise NotImplementedError diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/segmenters/ms_tcn.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segmenters/ms_tcn.py new file mode 100644 index 0000000..a5982a7 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segmenters/ms_tcn.py @@ -0,0 +1,101 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +from ...registry import SEGMENTERS +from .base import BaseSegmenter + +import paddle +import paddle.nn.functional as F + + +@SEGMENTERS.register() +class MSTCN(BaseSegmenter): + """MS-TCN model framework.""" + + def forward_net(self, video_feature): + """Define how the model is going to train, from input to output. + """ + if self.backbone is not None: + feature = self.backbone(video_feature) + else: + feature = video_feature + + if self.head is not None: + cls_score = self.head(feature) + else: + cls_score = None + + return cls_score + + def train_step(self, data_batch): + """Training step. + """ + video_feat, video_gt = data_batch + + # call forward + output = self.forward_net(video_feat) + loss = 0. + for i in range(len(output)): + loss += self.head.loss(output[i], video_gt) + + predicted = paddle.argmax(output[-1], axis=1) + predicted = paddle.squeeze(predicted) + + loss_metrics = dict() + loss_metrics['loss'] = loss + loss_metrics['F1@0.50'] = self.head.get_F1_score(predicted, video_gt) + return loss_metrics + + def val_step(self, data_batch): + """Validating setp. + """ + video_feat, video_gt = data_batch + + # call forward + output = self.forward_net(video_feat) + loss = 0. + for i in range(len(output)): + loss += self.head.loss(output[i], video_gt) + + predicted = paddle.argmax(output[-1], axis=1) + predicted = paddle.squeeze(predicted) + + outputs_dict = dict() + outputs_dict['loss'] = loss + outputs_dict['F1@0.50'] = self.head.get_F1_score(predicted, video_gt) + return outputs_dict + + def test_step(self, data_batch): + """Testing setp. + """ + video_feat, _ = data_batch + + outputs_dict = dict() + # call forward + output = self.forward_net(video_feat) + predicted = paddle.argmax(output[-1], axis=1) + predicted = paddle.squeeze(predicted) + outputs_dict['predict'] = predicted + outputs_dict['output_np'] = F.sigmoid(output[-1]) + return outputs_dict + + def infer_step(self, data_batch): + """Infering setp. + """ + video_feat = data_batch[0] + + # call forward + output = self.forward_net(video_feat) + predicted = paddle.argmax(output[-1], axis=1) + predicted = paddle.squeeze(predicted) + output_np = F.sigmoid(output[-1]) + return predicted, output_np diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/framework/segmenters/utils.py b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segmenters/utils.py new file mode 100644 index 0000000..9c21cbb --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/framework/segmenters/utils.py @@ -0,0 +1,343 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +# https://github.com/yiskw713/asrf/libs/postprocess.py + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import numpy as np +import math + + +class GaussianSmoothing(nn.Layer): + """ + Apply gaussian smoothing on a 1d tensor. + Filtering is performed seperately for each channel + in the input using a depthwise convolution. + Arguments: + channels (int, sequence): Number of channels of the input tensors. Output will + have this number of channels as well. + kernel_size (int, sequence): Size of the gaussian kernel. + sigma (float, sequence): Standard deviation of the gaussian kernel. + """ + + def __init__(self, kernel_size=15, sigma=1.0): + super().__init__() + self.kernel_size = kernel_size + + # The gaussian kernel is the product of the + # gaussian function of each dimension. + kernel = 1 + meshgrid = paddle.arange(kernel_size) + + meshgrid = paddle.cast(meshgrid, dtype='float32') + + mean = (kernel_size - 1) / 2 + kernel = kernel / (sigma * math.sqrt(2 * math.pi)) + kernel = kernel * paddle.exp(-(((meshgrid - mean) / sigma)**2) / 2) + + # Make sure sum of values in gaussian kernel equals 1. + # kernel = kernel / paddle.max(kernel) + + self.kernel = paddle.reshape(kernel, [1, 1, -1]) + + def forward(self, inputs): + """ + Apply gaussian filter to input. + Arguments: + input (paddle.Tensor): Input to apply gaussian filter on. + Returns: + filtered (paddle.Tensor): Filtered output. + """ + _, c, _ = inputs.shape + inputs = F.pad(inputs, + pad=((self.kernel_size - 1) // 2, + (self.kernel_size - 1) // 2), + mode="reflect", + data_format='NCL') + + kernel = paddle.expand(self.kernel, shape=[c, 1, self.kernel_size]) + return F.conv1d(inputs, weight=kernel, groups=c) + + +def argrelmax(prob, threshold=0.7): + """ + Calculate arguments of relative maxima. + prob: np.array. 
boundary probability maps distributerd in [0, 1] + prob shape is (T) + ignore the peak whose value is under threshold + + Return: + Index of peaks for each batch + """ + # ignore the values under threshold + prob[prob < threshold] = 0.0 + + # calculate the relative maxima of boundary maps + # treat the first frame as boundary + peak = np.concatenate( + [ + np.ones((1), dtype=np.bool), + (prob[:-2] < prob[1:-1]) & (prob[2:] < prob[1:-1]), + np.zeros((1), dtype=np.bool), + ], + axis=0, + ) + + peak_idx = np.where(peak)[0].tolist() + + return peak_idx + + +def is_probability(x): + assert x.ndim == 3 + + if x.shape[1] == 1: + # sigmoid + if x.min() >= 0 and x.max() <= 1: + return True + else: + return False + else: + # softmax + _sum = np.sum(x, axis=1).astype(np.float32) + _ones = np.ones_like(_sum, dtype=np.float32) + return np.allclose(_sum, _ones) + + +def convert2probability(x): + """ + Args: x (N, C, T) + """ + assert x.ndim == 3 + + if is_probability(x): + return x + else: + if x.shape[1] == 1: + # sigmoid + prob = 1 / (1 + np.exp(-x)) + else: + # softmax + prob = np.exp(x) / np.sum(np.exp(x), axis=1) + return prob.astype(np.float32) + + +def convert2label(x): + assert x.ndim == 2 or x.ndim == 3 + + if x.ndim == 2: + return x.astype(np.int64) + else: + if not is_probability(x): + x = convert2probability(x) + + label = np.argmax(x, axis=1) + return label.astype(np.int64) + + +def refinement_with_boundary(outputs, boundaries, boundary_threshold): + """ + Get segments which is defined as the span b/w two boundaries, + and decide their classes by majority vote. + Args: + outputs: numpy array. shape (N, C, T) + the model output for frame-level class prediction. + boundaries: numpy array. shape (N, 1, T) + boundary prediction. + boundary_threshold: the threshold of the size of action segments. float(default=0.7) + Return: + preds: np.array. shape (N, T) + final class prediction considering boundaries. + """ + + preds = convert2label(outputs) + boundaries = convert2probability(boundaries) + + for i, (output, pred, boundary) in enumerate(zip(outputs, preds, + boundaries)): + idx = argrelmax(boundary[0, :], threshold=boundary_threshold) + + # add the index of the last action ending + T = pred.shape[0] + idx.append(T) + + # majority vote + for j in range(len(idx) - 1): + count = np.bincount(pred[idx[j]:idx[j + 1]]) + modes = np.where(count == count.max())[0] + if len(modes) == 1: + mode = modes + else: + if outputs.ndim == 3: + # if more than one majority class exist + prob_sum_max = 0 + for m in modes: + prob_sum = output[m, idx[j]:idx[j + 1]].sum() + if prob_sum_max < prob_sum: + mode = m + prob_sum_max = prob_sum + else: + # decide first mode when more than one majority class + # have the same number during oracle experiment + mode = modes[0] + + preds[i, idx[j]:idx[j + 1]] = mode + return preds + + +def relabeling(outputs, theta_t): + """ + Relabeling small action segments with their previous action segment + Args: + output: the results of action segmentation. (N, T) or (N, C, T) + theta_t: the threshold of the size of action segments. + Return: + relabeled output. 
(N, T) + """ + + preds = convert2label(outputs) + + for i in range(preds.shape[0]): + # shape (T,) + last = preds[i][0] + cnt = 1 + for j in range(1, preds.shape[1]): + if last == preds[i][j]: + cnt += 1 + else: + if cnt > theta_t: + cnt = 1 + last = preds[i][j] + else: + preds[i][j - cnt:j] = preds[i][j - cnt - 1] + cnt = 1 + last = preds[i][j] + + if cnt <= theta_t: + preds[i][j - cnt:j] = preds[i][j - cnt - 1] + + return preds + + +def smoothing(outputs, filter_func): + """ + Smoothing action probabilities with gaussian filter. + Args: + outputs: frame-wise action probabilities. (N, C, T) + Return: + predictions: final prediction. (N, T) + """ + + outputs = convert2probability(outputs) + outputs = filter_func(paddle.to_tensor(outputs)).numpy() + + preds = convert2label(outputs) + return preds + + +def ASRFPostProcessing(outputs_cls, + outputs_boundary, + refinement_method, + boundary_threshold=0.7, + theta_t=15, + kernel_size=15): + """ + ASRF post processing is to refine action boundary + Args: + outputs_cls: the results of action segmentation. (N, T) or (N, C, T) + outputs_boundary: action boundary probability. (N, 1, T) + refinement_method: the way of refine predict boundary and classification. str + boundary_threshold: the threshold of the size of action segments. float(default=0.7) + theta_t: the threshold of the size of action segments. int(default=15) + kernel_size: Size of the gaussian kernel. int(default=15) + Return: + preds output. (N, T) + """ + func = [ + "refinement_with_boundary", + "relabeling", + "smoothing", + ] + + if refinement_method == "smoothing": + filter_func = GaussianSmoothing(kernel_size) + preds = smoothing(outputs_cls, filter_func) + elif refinement_method == "relabeling": + preds = relabeling(outputs_cls, theta_t) + elif refinement_method == "refinement_with_boundary": + preds = refinement_with_boundary(outputs_cls, outputs_boundary, + boundary_threshold) + else: + preds = np.zeros((1, 1)) + assert refinement_method in func + + return paddle.to_tensor(preds) + + +def _calculate_fan_in_and_fan_out(tensor): + dimensions = len(tensor.shape) + if dimensions < 2: + raise ValueError("Fan in and fan out can not be computed \ + for tensor with fewer than 2 dimensions") + + if dimensions == 2: # Linear + fan_in = tensor.shape[1] + fan_out = tensor.shape[0] + else: + num_input_fmaps = tensor.shape[1] + num_output_fmaps = tensor.shape[0] + receptive_field_size = 1 + if tensor.dim() > 2: + receptive_field_size = tensor[0][0].numel() + fan_in = num_input_fmaps * receptive_field_size + fan_out = num_output_fmaps * receptive_field_size + + return fan_in, fan_out + + +def calculate_gain(nonlinearity=None, a=None): + if nonlinearity == 'tanh': + return 5.0 / 3 + elif nonlinearity == 'relu': + return math.sqrt(2.0) + elif nonlinearity == 'leaky_relu': + if a is not None: + return math.sqrt(2.0 / (1 + a**2)) + else: + return math.sqrt(2.0 / (1 + 0.01**2)) + elif nonlinearity == 'selu': + return 3.0 / 4 + else: + return 1 + + +def KaimingUniform_like_torch(weight_npy, + mode='fan_in', + nonlinearity='leaky_relu'): + fan_in, fan_out = _calculate_fan_in_and_fan_out(weight_npy) + if mode == 'fan_in': + fan_mode = fan_in + else: + fan_mode = fan_out + a = math.sqrt(5.0) + gain = calculate_gain(nonlinearity=nonlinearity, a=a) + std = gain / math.sqrt(fan_mode) + bound = math.sqrt(3.0) * std + return np.random.uniform(-bound, bound, weight_npy.shape) + + +def init_bias(weight_npy, bias_npy): + fan_in, fan_out = _calculate_fan_in_and_fan_out(weight_npy) + bound = 1.0 / 
math.sqrt(fan_in) + return np.random.uniform(-bound, bound, bias_npy.shape) diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/__init__.py b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__init__.py new file mode 100644 index 0000000..49f71cc --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__init__.py @@ -0,0 +1,49 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .adds_head import AddsHead +from .asrf_head import ASRFHead +from .attention_lstm_head import AttentionLstmHead, ActionAttentionLstmHead +from .base import BaseHead +from .bbox_head import BBoxHeadAVA +from .cfbi_head import CollaborativeEnsemblerMS +from .i3d_head import I3DHead +from .movinet_head import MoViNetHead +from .ms_tcn_head import MSTCNHead +from .pptimesformer_head import ppTimeSformerHead +from .pptsm_head import ppTSMHead +from .pptsn_head import ppTSNHead +from .roi_head import AVARoIHead +from .single_straight3d import SingleRoIExtractor3D +from .slowfast_head import SlowFastHead +from .stgcn_head import STGCNHead +from .timesformer_head import TimeSformerHead +from .transnetv2_head import TransNetV2Head +from .tsm_head import TSMHead +from .tsn_head import TSNHead +from .ms_tcn_head import MSTCNHead +from .asrf_head import ASRFHead +from .ctrgcn_head import CTRGCNHead +from .movinet_head import MoViNetHead +from .agcn2s_head import AGCN2sHead +from .token_shift_head import TokenShiftHead + +__all__ = [ + 'BaseHead', 'TSNHead', 'TSMHead', 'ppTSMHead', 'ppTSNHead', 'SlowFastHead', + 'AttentionLstmHead', 'TimeSformerHead', 'STGCNHead', 'TransNetV2Head', + 'I3DHead', 'SingleRoIExtractor3D', 'AVARoIHead', 'BBoxHeadAVA', 'AddsHead', + 'ppTimeSformerHead', 'CollaborativeEnsemblerMS', 'MSTCNHead', 'ASRFHead', + 'MoViNetHead', 'CTRGCNHead', 'TokenShiftHead', 'ActionAttentionLstmHead', + 'AGCN2sHead' +] diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/__init__.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..b559ad2 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/__init__.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/adds_head.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/adds_head.cpython-310.pyc new file mode 100644 index 0000000..2227f8f Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/adds_head.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/agcn2s_head.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/agcn2s_head.cpython-310.pyc new file mode 100644 index 0000000..6fc341a Binary files /dev/null and 
b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/agcn2s_head.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/asrf_head.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/asrf_head.cpython-310.pyc new file mode 100644 index 0000000..77d72d8 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/asrf_head.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/attention_lstm_head.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/attention_lstm_head.cpython-310.pyc new file mode 100644 index 0000000..ec74413 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/attention_lstm_head.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/base.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/base.cpython-310.pyc new file mode 100644 index 0000000..8ca7372 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/base.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/bbox_head.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/bbox_head.cpython-310.pyc new file mode 100644 index 0000000..e862576 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/bbox_head.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/cfbi_head.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/cfbi_head.cpython-310.pyc new file mode 100644 index 0000000..e0af3e0 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/cfbi_head.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/ctrgcn_head.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/ctrgcn_head.cpython-310.pyc new file mode 100644 index 0000000..77001ca Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/ctrgcn_head.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/i3d_head.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/i3d_head.cpython-310.pyc new file mode 100644 index 0000000..78535f4 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/i3d_head.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/movinet_head.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/movinet_head.cpython-310.pyc new file mode 100644 index 0000000..981fd7a Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/movinet_head.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/ms_tcn_head.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/ms_tcn_head.cpython-310.pyc new file mode 100644 index 0000000..69bf0c6 Binary files /dev/null and 
b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/ms_tcn_head.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/pptimesformer_head.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/pptimesformer_head.cpython-310.pyc new file mode 100644 index 0000000..966829f Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/pptimesformer_head.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/pptsm_head.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/pptsm_head.cpython-310.pyc new file mode 100644 index 0000000..b8382f0 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/pptsm_head.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/pptsn_head.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/pptsn_head.cpython-310.pyc new file mode 100644 index 0000000..90b5293 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/pptsn_head.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/roi_extractor.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/roi_extractor.cpython-310.pyc new file mode 100644 index 0000000..e70db29 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/roi_extractor.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/roi_head.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/roi_head.cpython-310.pyc new file mode 100644 index 0000000..194ca51 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/roi_head.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/single_straight3d.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/single_straight3d.cpython-310.pyc new file mode 100644 index 0000000..e3ab758 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/single_straight3d.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/slowfast_head.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/slowfast_head.cpython-310.pyc new file mode 100644 index 0000000..f7c5b61 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/slowfast_head.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/stgcn_head.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/stgcn_head.cpython-310.pyc new file mode 100644 index 0000000..6acc1c5 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/stgcn_head.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/timesformer_head.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/timesformer_head.cpython-310.pyc new file mode 100644 index 0000000..a8276cf Binary 
files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/timesformer_head.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/token_shift_head.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/token_shift_head.cpython-310.pyc new file mode 100644 index 0000000..141da04 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/token_shift_head.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/transnetv2_head.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/transnetv2_head.cpython-310.pyc new file mode 100644 index 0000000..c993c17 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/transnetv2_head.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/tsm_head.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/tsm_head.cpython-310.pyc new file mode 100644 index 0000000..991d912 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/tsm_head.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/tsn_head.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/tsn_head.cpython-310.pyc new file mode 100644 index 0000000..abb2d14 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/heads/__pycache__/tsn_head.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/adds_head.py b/Bank_second_part/detect_process/paddlevideo/modeling/heads/adds_head.py new file mode 100644 index 0000000..3b1cd24 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/heads/adds_head.py @@ -0,0 +1,146 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import cv2 +import numpy as np +import paddle.nn as nn +from paddlevideo.utils import get_dist_info +import paddle +from ..builder import build_loss +from ..registry import HEADS + +MIN_DEPTH = 1e-3 +MAX_DEPTH = 80 + + +@HEADS.register() +class AddsHead(nn.Layer): + """TimeSformerHead Head. + + Args: + num_classes (int): The number of classes to be classified. + in_channels (int): The number of channles in input feature. + loss_cfg (dict): Config for building config. Default: dict(name='CrossEntropyLoss'). + std(float): Std(Scale) value in normal initilizar. Default: 0.01. + kwargs (dict, optional): Any keyword argument to initialize. 
+ + """ + def __init__(self, + avg_reprojection, + disparity_smoothness, + no_ssim, + loss_cfg=dict(name='ADDSLoss'), + max_gt_depth=60, + pred_depth_scale_factor=1): + + super(AddsHead, self).__init__() + loss_cfg['avg_reprojection'] = avg_reprojection + loss_cfg['disparity_smoothness'] = disparity_smoothness + loss_cfg['no_ssim'] = no_ssim + self.max_gt_depth = max_gt_depth + self.pred_depth_scale_factor = pred_depth_scale_factor + self.loss_func = build_loss(loss_cfg) + + def forward(self): + raise NotImplemented + + def loss(self, inputs, outputs): + if self.training: + return self.loss_func(inputs, outputs) + else: + abs_rel, sq_rel, rmse, rmse_log, a1, a2, a3 = self.get_metrics( + outputs['pred_disp'], outputs['gt']) + outputs['abs_rel'] = abs_rel + outputs['sq_rel'] = sq_rel + outputs['rmse'] = rmse + outputs['rmse_log'] = rmse_log + outputs['a1'] = a1 + outputs['a2'] = a2 + outputs['a3'] = a3 + return outputs + + def get_metrics(self, pred_disp, gt_depth): + gt_height, gt_width = gt_depth.shape[:2] + + pred_disp = cv2.resize(pred_disp, (gt_width, gt_height)) + pred_depth = 1 / pred_disp + + mask = gt_depth > 0 + + pred_depth = pred_depth[mask] + gt_depth = gt_depth[mask] + + pred_depth *= self.pred_depth_scale_factor + ratio = np.median(gt_depth) / np.median(pred_depth) + pred_depth *= ratio + + pred_depth[pred_depth < MIN_DEPTH] = MIN_DEPTH + pred_depth[pred_depth > MAX_DEPTH] = MAX_DEPTH + + mask2 = gt_depth <= self.max_gt_depth + pred_depth = pred_depth[mask2] + gt_depth = gt_depth[mask2] + + abs_rel, sq_rel, rmse, rmse_log, a1, a2, a3 = self.compute_errors( + gt_depth, pred_depth) + + _, world_size = get_dist_info() + if world_size > 1: + # educe sum when valid + # TODO: there are some problems with multi gpu gather code. + abs_rel = paddle.to_tensor(abs_rel) + sq_rel = paddle.to_tensor(sq_rel) + rmse = paddle.to_tensor(rmse) + rmse_log = paddle.to_tensor(rmse_log) + a1 = paddle.to_tensor(a1) + a2 = paddle.to_tensor(a2) + a3 = paddle.to_tensor(a3) + abs_rel = paddle.distributed.all_reduce( + abs_rel, op=paddle.distributed.ReduceOp.SUM) / world_size + sq_rel = paddle.distributed.all_reduce( + sq_rel, op=paddle.distributed.ReduceOp.SUM) / world_size + rmse = paddle.distributed.all_reduce( + rmse, op=paddle.distributed.ReduceOp.SUM) / world_size + rmse_log = paddle.distributed.all_reduce( + rmse_log, op=paddle.distributed.ReduceOp.SUM) / world_size + a1 = paddle.distributed.all_reduce( + a1, op=paddle.distributed.ReduceOp.SUM) / world_size + a2 = paddle.distributed.all_reduce( + a2, op=paddle.distributed.ReduceOp.SUM) / world_size + a3 = paddle.distributed.all_reduce( + a3, op=paddle.distributed.ReduceOp.SUM) / world_size + return abs_rel.item(), sq_rel.item(), rmse.item(), rmse_log.item( + ), a1.item(), a2.item(), a3.item() + + return abs_rel, sq_rel, rmse, rmse_log, a1, a2, a3 + + def compute_errors(self, gt, pred): + """Computation of error metrics between predicted and ground truth depths + """ + thresh = np.maximum((gt / pred), (pred / gt)) + a1 = (thresh < 1.25).mean() + a2 = (thresh < 1.25**2).mean() + a3 = (thresh < 1.25**3).mean() + + rmse = (gt - pred)**2 + rmse = np.sqrt(rmse.mean()) + + rmse_log = (np.log(gt) - np.log(pred))**2 + rmse_log = np.sqrt(rmse_log.mean()) + + abs_rel = np.mean(np.abs(gt - pred) / gt) + + sq_rel = np.mean(((gt - pred)**2) / gt) + + return abs_rel, sq_rel, rmse, rmse_log, a1, a2, a3 diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/agcn2s_head.py b/Bank_second_part/detect_process/paddlevideo/modeling/heads/agcn2s_head.py 
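Note: the error metrics computed by AddsHead.compute_errors above are the standard monocular-depth metrics (abs_rel, sq_rel, rmse, rmse_log and the delta < 1.25^k accuracies). A minimal NumPy-only sketch of that math, using hypothetical `gt`/`pred` arrays of matched, strictly positive depths clipped to [MIN_DEPTH, MAX_DEPTH] as the head does; this is an illustration, not the repo's code path:

import numpy as np

def depth_errors(gt, pred):
    # thresh-based accuracies: fraction of pixels within 1.25^k of the ground truth
    thresh = np.maximum(gt / pred, pred / gt)
    a1 = (thresh < 1.25).mean()
    a2 = (thresh < 1.25 ** 2).mean()
    a3 = (thresh < 1.25 ** 3).mean()
    rmse = np.sqrt(((gt - pred) ** 2).mean())
    rmse_log = np.sqrt(((np.log(gt) - np.log(pred)) ** 2).mean())
    abs_rel = np.mean(np.abs(gt - pred) / gt)
    sq_rel = np.mean(((gt - pred) ** 2) / gt)
    return abs_rel, sq_rel, rmse, rmse_log, a1, a2, a3

# hypothetical values, already median-scaled and clipped like get_metrics does
gt = np.array([2.0, 5.0, 10.0])
pred = np.clip(np.array([2.2, 4.5, 11.0]), 1e-3, 80.0)
print(depth_errors(gt, pred))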
new file mode 100644 index 0000000..92cb5e4 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/heads/agcn2s_head.py @@ -0,0 +1,59 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import paddle +import paddle.nn as nn + +from .base import BaseHead +from ..registry import HEADS +from ..weight_init import weight_init_ + + +@HEADS.register() +class AGCN2sHead(BaseHead): + """ + Head for AGCN2s model. + Args: + in_channels: int, input feature channels. Default: 64. + num_classes: int, output the number of classes. + M: int, number of people. + drop_out: float, dropout ratio of layer. Default: 0. + """ + def __init__(self, in_channels=64, num_classes=10, M=2, **kwargs): + super().__init__(num_classes, in_channels, **kwargs) + self.in_channels = in_channels + self.M = M + weight_attr = paddle.ParamAttr( + name="linear_weight", + initializer=paddle.nn.initializer.Normal(mean=0.0, + std=math.sqrt( + 2. / num_classes))) + + self.fc = nn.Linear(self.in_channels * 4, + self.num_classes, + weight_attr=weight_attr) + + def forward(self, x): + """Define how the head is going to run. + """ + assert x.shape[ + 0] % self.M == 0, f'The first dimension of the output must be an integer multiple of the number of people M, but recieved shape[0]={x.shape[0]}, M={self.M}' + # N*M,C,T,V + N = x.shape[0] // self.M + c_new = x.shape[1] + x = x.reshape([N, self.M, c_new, -1]) + x = x.mean(3).mean(1) + + return self.fc(x) diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/asrf_head.py b/Bank_second_part/detect_process/paddlevideo/modeling/heads/asrf_head.py new file mode 100644 index 0000000..c3aab77 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/heads/asrf_head.py @@ -0,0 +1,212 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
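Note: to make the reshape in AGCN2sHead.forward above concrete: the backbone emits one feature map per person, so the leading axis is N*M; the head averages over the flattened spatio-temporal axis and over the M persons before the final linear layer. A NumPy sketch under those assumptions (all shapes and the 10-class weight matrix are illustrative only):

import numpy as np

N, M, C, T, V = 2, 2, 256, 16, 25        # batch, persons, channels (in_channels*4), frames, joints
x = np.random.rand(N * M, C, T, V)       # stand-in for the backbone output, first dim is N*M

x = x.reshape(N, M, C, -1)               # [N, M, C, T*V]
x = x.mean(axis=3).mean(axis=1)          # pool joints/time, then persons -> [N, C]

W = np.random.rand(C, 10)                # stand-in for nn.Linear(C, num_classes)
logits = x @ W                           # [N, num_classes]
print(logits.shape)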
+ +# https://github.com/yiskw713/asrf/libs/models/tcn.py +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import numpy as np + +from paddle import ParamAttr + +from ..backbones.ms_tcn import SingleStageModel + +from .base import BaseHead +from ..registry import HEADS +from ..weight_init import weight_init_ +from ..framework.segmenters.utils import init_bias, KaimingUniform_like_torch + + +@HEADS.register() +class ASRFHead(BaseHead): + + def __init__(self, + num_classes, + num_features, + num_stages, + num_layers, + num_stages_asb=None, + num_stages_brb=None): + super().__init__(num_classes=num_classes, in_channels=num_features) + if not isinstance(num_stages_asb, int): + num_stages_asb = num_stages + + if not isinstance(num_stages_brb, int): + num_stages_brb = num_stages + + self.num_layers = num_layers + self.num_stages_asb = num_stages_asb + self.num_stages_brb = num_stages_brb + self.num_features = num_features + + # cls score + self.overlap = 0.5 + + self.conv_cls = nn.Conv1D(self.num_features, self.num_classes, 1) + self.conv_boundary = nn.Conv1D(self.num_features, 1, 1) + + # action segmentation branch + asb = [ + SingleStageModel(self.num_layers, self.num_features, + self.num_classes, self.num_classes) + for _ in range(self.num_stages_asb - 1) + ] + + # boundary regression branch + brb = [ + SingleStageModel(self.num_layers, self.num_features, 1, 1) + for _ in range(self.num_stages_brb - 1) + ] + self.brb = nn.LayerList(brb) + self.asb = nn.LayerList(asb) + + self.activation_asb = nn.Softmax(axis=1) + self.activation_brb = nn.Sigmoid() + + def init_weights(self): + """ + initialize model layers' weight + """ + # init weight + for layer in self.sublayers(): + if isinstance(layer, nn.Conv1D): + layer.weight.set_value( + KaimingUniform_like_torch(layer.weight).astype('float32')) + if layer.bias is not None: + layer.bias.set_value( + init_bias(layer.weight, layer.bias).astype('float32')) + + def forward(self, x): + """ + ASRF head + """ + out_cls = self.conv_cls(x) + out_boundary = self.conv_boundary(x) + + outputs_cls = [out_cls] + outputs_boundary = [out_boundary] + + for as_stage in self.asb: + out_cls = as_stage(self.activation_asb(out_cls)) + outputs_cls.append(out_cls) + + for br_stage in self.brb: + out_boundary = br_stage(self.activation_brb(out_boundary)) + outputs_boundary.append(out_boundary) + + return outputs_cls, outputs_boundary + + def get_F1_score(self, predicted, groundTruth): + recog_content = list(predicted.numpy()) + gt_content = list(groundTruth[0].numpy()) + + # cls score + correct = 0 + total = 0 + edit = 0 + + for i in range(len(gt_content)): + total += 1 + + if gt_content[i] == recog_content[i]: + correct += 1 + + edit_num = self.edit_score(recog_content, gt_content) + edit += edit_num + + tp, fp, fn = self.f_score(recog_content, gt_content, self.overlap) + + # cls metric + + precision = tp / float(tp + fp) + recall = tp / float(fp + fn) + + if precision + recall > 0.0: + f1 = 2.0 * (precision * recall) / (precision + recall) + else: + f1 = 0.0 + f1 = np.nan_to_num(f1) + return f1 + + def get_labels_start_end_time(self, frame_wise_labels): + labels = [] + starts = [] + ends = [] + last_label = frame_wise_labels[0] + labels.append(frame_wise_labels[0]) + starts.append(0) + for i in range(len(frame_wise_labels)): + if frame_wise_labels[i] != last_label: + labels.append(frame_wise_labels[i]) + starts.append(i) + ends.append(i) + last_label = frame_wise_labels[i] + ends.append(i + 1) + return labels, starts, ends + + def levenstein(self, p, 
y, norm=False): + m_row = len(p) + n_col = len(y) + D = np.zeros([m_row + 1, n_col + 1], float) + for i in range(m_row + 1): + D[i, 0] = i + for i in range(n_col + 1): + D[0, i] = i + + for j in range(1, n_col + 1): + for i in range(1, m_row + 1): + if y[j - 1] == p[i - 1]: + D[i, j] = D[i - 1, j - 1] + else: + D[i, j] = min(D[i - 1, j] + 1, D[i, j - 1] + 1, + D[i - 1, j - 1] + 1) + + if norm: + score = (1 - D[-1, -1] / max(m_row, n_col)) * 100 + else: + score = D[-1, -1] + + return score + + def edit_score(self, recognized, ground_truth, norm=True): + P, _, _ = self.get_labels_start_end_time(recognized) + Y, _, _ = self.get_labels_start_end_time(ground_truth) + return self.levenstein(P, Y, norm) + + def f_score(self, recognized, ground_truth, overlap): + p_label, p_start, p_end = self.get_labels_start_end_time(recognized) + y_label, y_start, y_end = self.get_labels_start_end_time(ground_truth) + + tp = 0 + fp = 0 + + hits = np.zeros(len(y_label)) + + for j in range(len(p_label)): + intersection = np.minimum(p_end[j], y_end) - np.maximum( + p_start[j], y_start) + union = np.maximum(p_end[j], y_end) - np.minimum( + p_start[j], y_start) + IoU = (1.0 * intersection / union) * ( + [p_label[j] == y_label[x] for x in range(len(y_label))]) + # Get the best scoring segment + idx = np.array(IoU).argmax() + + if IoU[idx] >= overlap and not hits[idx]: + tp += 1 + hits[idx] = 1 + else: + fp += 1 + fn = len(y_label) - sum(hits) + return float(tp), float(fp), float(fn) diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/attention_lstm_head.py b/Bank_second_part/detect_process/paddlevideo/modeling/heads/attention_lstm_head.py new file mode 100644 index 0000000..24c31ad --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/heads/attention_lstm_head.py @@ -0,0 +1,288 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddle import ParamAttr +from paddle.nn.initializer import Normal +from paddle.regularizer import L2Decay +import paddle.nn.functional as F + +from ...metrics.youtube8m import eval_util as youtube8m_metrics +from ..registry import HEADS +from ..weight_init import weight_init_ +from .base import BaseHead + + +@HEADS.register() +class AttentionLstmHead(BaseHead): + """AttentionLstmHead.
+ Args: TODO + """ + def __init__(self, + num_classes=3862, + feature_num=2, + feature_dims=[1024, 128], + embedding_size=512, + lstm_size=1024, + in_channels=2048, + loss_cfg=dict(name='CrossEntropyLoss')): + super(AttentionLstmHead, self).__init__(num_classes, in_channels, + loss_cfg) + self.num_classes = num_classes + self.feature_dims = feature_dims + self.embedding_size = embedding_size + self.lstm_size = lstm_size + self.feature_num = len(self.feature_dims) + for i in range(self.feature_num): # 0:rgb, 1:audio + fc_feature = paddle.nn.Linear(in_features=self.feature_dims[i], + out_features=self.embedding_size) + self.add_sublayer("fc_feature{}".format(i), fc_feature) + + bi_lstm = paddle.nn.LSTM(input_size=self.embedding_size, + hidden_size=self.lstm_size, + direction="bidirectional") + self.add_sublayer("bi_lstm{}".format(i), bi_lstm) + + drop_rate = 0.5 + self.dropout = paddle.nn.Dropout(drop_rate) + + att_fc = paddle.nn.Linear(in_features=self.lstm_size * 2, + out_features=1) + self.add_sublayer("att_fc{}".format(i), att_fc) + self.softmax = paddle.nn.Softmax() + + self.fc_out1 = paddle.nn.Linear(in_features=self.lstm_size * 4, + out_features=8192, + bias_attr=ParamAttr( + regularizer=L2Decay(0.0), + initializer=Normal())) + self.relu = paddle.nn.ReLU() + self.fc_out2 = paddle.nn.Linear(in_features=8192, + out_features=4096, + bias_attr=ParamAttr( + regularizer=L2Decay(0.0), + initializer=Normal())) + self.fc_logit = paddle.nn.Linear(in_features=4096, + out_features=self.num_classes, + bias_attr=ParamAttr( + regularizer=L2Decay(0.0), + initializer=Normal())) + self.sigmoid = paddle.nn.Sigmoid() + + def init_weights(self): + pass + + def forward(self, inputs): + # inputs = [(rgb_data, rgb_len, rgb_mask), (audio_data, audio_len, audio_mask)] + # deal with features with different length + # 1. padding to same lenght, make a tensor + # 2. make a mask tensor with the same shpae with 1 + # 3. compute output using mask tensor, s.t. output is nothing todo with padding + assert (len(inputs) == self.feature_num + ), "Input tensor does not contain {} features".format( + self.feature_num) + att_outs = [] + for i in range(len(inputs)): + # 1. fc + m = getattr(self, "fc_feature{}".format(i)) + output_fc = m(inputs[i][0]) + output_fc = paddle.tanh(output_fc) + + # 2. bi_lstm + m = getattr(self, "bi_lstm{}".format(i)) + lstm_out, _ = m(inputs=output_fc, sequence_length=inputs[i][1]) + + lstm_dropout = self.dropout(lstm_out) + + # 3. att_fc + m = getattr(self, "att_fc{}".format(i)) + lstm_weight = m(lstm_dropout) + + # 4. softmax replace start, for it's relevant to sum in time step + lstm_exp = paddle.exp(lstm_weight) + lstm_mask = paddle.mean(inputs[i][2], axis=2) + lstm_mask = paddle.unsqueeze(lstm_mask, axis=2) + lstm_exp_with_mask = paddle.multiply(x=lstm_exp, y=lstm_mask) + lstm_sum_with_mask = paddle.sum(lstm_exp_with_mask, axis=1) + exponent = -1 + lstm_denominator = paddle.pow(lstm_sum_with_mask, exponent) + lstm_denominator = paddle.unsqueeze(lstm_denominator, axis=2) + lstm_softmax = paddle.multiply(x=lstm_exp, y=lstm_denominator) + lstm_weight = lstm_softmax + # softmax replace end + + lstm_scale = paddle.multiply(x=lstm_dropout, y=lstm_weight) + + # 5. 
sequence_pool's replace start, for it's relevant to sum in time step + lstm_scale_with_mask = paddle.multiply(x=lstm_scale, y=lstm_mask) + fea_lens = inputs[i][1] + fea_len = int(fea_lens[0]) + lstm_pool = paddle.sum(lstm_scale_with_mask, axis=1) + # sequence_pool's replace end + att_outs.append(lstm_pool) + att_out = paddle.concat(att_outs, axis=1) + fc_out1 = self.fc_out1(att_out) + fc_out1_act = self.relu(fc_out1) + fc_out2 = self.fc_out2(fc_out1_act) + fc_out2_act = paddle.tanh(fc_out2) + fc_logit = self.fc_logit(fc_out2_act) + output = self.sigmoid(fc_logit) + return fc_logit, output + + def loss(self, lstm_logit, labels, **kwargs): + labels.stop_gradient = True + losses = dict() + bce_logit_loss = paddle.nn.BCEWithLogitsLoss(reduction='sum') + sum_cost = bce_logit_loss(lstm_logit, labels) + return sum_cost + + def metric(self, lstm_output, labels): + pred = lstm_output.numpy() + label = labels.numpy() + hit_at_one = youtube8m_metrics.calculate_hit_at_one(pred, label) + perr = youtube8m_metrics.calculate_precision_at_equal_recall_rate( + pred, label) + gap = youtube8m_metrics.calculate_gap(pred, label) + return hit_at_one, perr, gap + + +@HEADS.register() +class ActionAttentionLstmHead(BaseHead): + """AttentionLstmHead for FootballAction + Args: TODO + """ + def __init__(self, + num_classes=8, + feature_num=2, + feature_dims=[2048, 1024], + embedding_size=512, + lstm_size=1024, + in_channels=2048, + loss_cfg=dict(name='CrossEntropyLoss')): + super(ActionAttentionLstmHead, self).__init__(num_classes, in_channels, + loss_cfg) + self.num_classes = num_classes + self.feature_dims = feature_dims + self.embedding_size = embedding_size + self.lstm_size = lstm_size + self.feature_num = len(self.feature_dims) + for i in range(self.feature_num): # 0:rgb, 1:audio + bi_lstm = paddle.nn.LSTM(input_size=self.feature_dims[i], + hidden_size=self.feature_dims[i], + direction="bidirectional") + self.add_sublayer("bi_lstm{}".format(i), bi_lstm) + + drop_rate = 0.5 + self.dropout = paddle.nn.Dropout(drop_rate) + + att_fc = paddle.nn.Linear(in_features=self.feature_dims[i] * 2, + out_features=1) + self.add_sublayer("att_fc{}".format(i), att_fc) + self.softmax = paddle.nn.Softmax() + + self.fc1 = paddle.nn.Linear(in_features=2 * sum(self.feature_dims), + out_features=8192, + bias_attr=ParamAttr( + regularizer=L2Decay(0.0), + initializer=Normal())) + self.bn1 = paddle.nn.BatchNorm(num_channels=8192) + self.dropout1 = paddle.nn.Dropout(0.5) + self.fc2 = paddle.nn.Linear(in_features=8192, + out_features=4096, + bias_attr=ParamAttr( + regularizer=L2Decay(0.0), + initializer=Normal())) + self.bn2 = paddle.nn.BatchNorm(num_channels=4096) + self.dropout2 = paddle.nn.Dropout(0.5) + self.fc3 = paddle.nn.Linear( + in_features=4096, + out_features=self.num_classes, + ) + self.fc4 = paddle.nn.Linear( + in_features=4096, + out_features=1, + ) + + def init_weights(self): + pass + + def forward(self, inputs): + # inputs = [(rgb_data, rgb_len, rgb_mask), (audio_data, audio_len, audio_mask)] + # deal with features with different length + # 1. padding to same lenght, make a tensor + # 2. make a mask tensor with the same shpae with 1 + # 3. compute output using mask tensor, s.t. 
output is nothing todo with padding + assert (len(inputs) == self.feature_num + ), "Input tensor does not contain {} features".format( + self.feature_num) + att_outs = [] + for i in range(len(inputs)): + m = getattr(self, "bi_lstm{}".format(i)) + lstm_out, _ = m(inputs=inputs[i][0], sequence_length=inputs[i][1]) + + lstm_dropout = self.dropout(lstm_out) + + # 3. att_fc + m = getattr(self, "att_fc{}".format(i)) + lstm_weight = m(lstm_dropout) + + # 4. softmax replace start, for it's relevant to sum in time step + lstm_exp = paddle.exp(lstm_weight) + lstm_mask = paddle.mean(inputs[i][2], axis=2) + lstm_mask = paddle.unsqueeze(lstm_mask, axis=2) + lstm_exp_with_mask = paddle.multiply(x=lstm_exp, y=lstm_mask) + lstm_sum_with_mask = paddle.sum(lstm_exp_with_mask, axis=1) + exponent = -1 + lstm_denominator = paddle.pow(lstm_sum_with_mask, exponent) + lstm_denominator = paddle.unsqueeze(lstm_denominator, axis=2) + lstm_softmax = paddle.multiply(x=lstm_exp, y=lstm_denominator) + lstm_weight = lstm_softmax + # softmax replace end + + lstm_scale = paddle.multiply(x=lstm_dropout, y=lstm_weight) + + # 5. sequence_pool's replace start, for it's relevant to sum in time step + lstm_scale_with_mask = paddle.multiply(x=lstm_scale, y=lstm_mask) + # fea_lens = inputs[i][1] + # fea_len = int(fea_lens[0]) + lstm_pool = paddle.sum(lstm_scale_with_mask, axis=1) + # sequence_pool's replace end + att_outs.append(lstm_pool) + att_out = paddle.concat(att_outs, axis=1) + y = self.fc1(att_out) + y = self.bn1(y) + y = F.relu(y) + y = self.dropout1(y) + y = self.fc2(y) + y = self.bn2(y) + y = F.relu(y) + y = self.dropout2(y) + out1 = self.fc3(y) + out1 = F.softmax(out1) + out2 = self.fc4(y) + out2 = F.sigmoid(out2) + return out1, out2 + + def loss(self, logits, iou, labels, labels_iou, **kwargs): + alpha = 10 + softmax_loss = F.cross_entropy(logits, labels) + labels_iou = labels_iou.astype('float32') + mse_loss = paddle.sum(F.square_error_cost(iou, labels_iou), axis=-1) + sum_loss = softmax_loss + alpha * mse_loss + return sum_loss + + def metric(self, scores, labels): + top1 = paddle.metric.accuracy(input=scores, label=labels, k=1) + top5 = paddle.metric.accuracy(input=scores, label=labels, k=5) + return top1, top5 diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/base.py b/Bank_second_part/detect_process/paddlevideo/modeling/heads/base.py new file mode 100644 index 0000000..99a1408 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/heads/base.py @@ -0,0 +1,178 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +from abc import abstractmethod + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from ..builder import build_loss +from paddlevideo.utils import get_logger, get_dist_info + +logger = get_logger("paddlevideo") + + +class BaseHead(nn.Layer): + """Base class for head part. + + All head should subclass it. 
+ All subclass should overwrite: + + - Methods: ```init_weights```, initializing weights. + - Methods: ```forward```, forward function. + + Args: + num_classes (int): The number of classes to be classified. + in_channels (int): The number of channels in input feature. + loss_cfg (dict): Config for building loss. Default: dict(type='CrossEntropyLoss'). + ls_eps (float): label smoothing epsilon. Default: 0. . + + """ + def __init__( + self, + num_classes=None, + in_channels=None, + loss_cfg=dict( + name="CrossEntropyLoss" + ), #TODO(shipping): only pass a name or standard build cfg format. + #multi_class=False, NOTE(shipping): not supported now. + ls_eps=0.): + + super().__init__() + self.num_classes = num_classes + self.in_channels = in_channels + self.loss_func = build_loss(loss_cfg) + #self.multi_class = multi_class NOTE(shipping): not supported now + self.ls_eps = ls_eps + + @abstractmethod + def forward(self, x): + """Define how the head is going to run. + """ + raise NotImplemented + + def loss(self, scores, labels, valid_mode=False, if_top5=True, **kwargs): + """Calculate the loss accroding to the model output ```scores```, + and the target ```labels```. + + Args: + scores (paddle.Tensor): The output of the model. + labels (paddle.Tensor): The target output of the model. + + Returns: + losses (dict): A dict containing field 'loss'(mandatory) and 'top1_acc', 'top5_acc'(optional). + + """ + if len(labels) == 1: #commonly case + labels = labels[0] + losses = dict() + if self.ls_eps != 0. and not valid_mode: # label_smooth + loss = self.label_smooth_loss(scores, labels, **kwargs) + else: + loss = self.loss_func(scores, labels, **kwargs) + if if_top5: + top1, top5 = self.get_acc(scores, labels, valid_mode) + losses['top1'] = top1 + losses['top5'] = top5 + losses['loss'] = loss + else: + top1 = self.get_acc(scores, labels, valid_mode, if_top5) + losses['top1'] = top1 + losses['loss'] = loss + return losses + # MRI目前二分类无top5 + elif len(labels) == 3: # mix_up + labels_a, labels_b, lam = labels + lam = lam[0] # get lam value + losses = dict() + if self.ls_eps != 0: + loss_a = self.label_smooth_loss(scores, labels_a, **kwargs) + loss_b = self.label_smooth_loss(scores, labels_b, **kwargs) + else: + loss_a = self.loss_func(scores, labels_a, **kwargs) + loss_b = self.loss_func(scores, labels_b, **kwargs) + loss = lam * loss_a + (1 - lam) * loss_b + + if if_top5: + top1a, top5a = self.get_acc(scores, labels_a, valid_mode) + top1b, top5b = self.get_acc(scores, labels_b, valid_mode) + top1 = lam * top1a + (1 - lam) * top1b + top5 = lam * top5a + (1 - lam) * top5b + losses['top1'] = top1 + losses['top5'] = top5 + losses['loss'] = loss + + else: + top1a = self.get_acc(scores, labels_a, valid_mode, if_top5) + top1b = self.get_acc(scores, labels_b, valid_mode, if_top5) + top1 = lam * top1a + (1 - lam) * top1b + losses['top1'] = top1 + losses['loss'] = loss + + return losses + else: + raise NotImplemented + + def label_smooth_loss(self, scores, labels, **kwargs): + """ + Args: + scores (paddle.Tensor): [N, num_classes] + labels (paddle.Tensor): [N, ] + Returns: + paddle.Tensor: [1,] + """ + if paddle.is_compiled_with_custom_device('npu'): + """ + Designed for the lack of temporary operators of NPU, + main idea is to split smooth loss into uniform distribution loss + and hard label calculation + """ + hard_loss = (1.0 - self.ls_eps) * F.cross_entropy(scores, labels) + uniform_loss = (self.ls_eps / self.num_classes) * ( + -F.log_softmax(scores, -1).sum(-1).mean(0)) + loss = hard_loss + uniform_loss + else: 
+ labels = F.one_hot(labels, self.num_classes) + labels = F.label_smooth(labels, epsilon=self.ls_eps) + labels = paddle.squeeze(labels, axis=1) + loss = self.loss_func(scores, labels, soft_label=True, **kwargs) + return loss + + def get_acc(self, scores, labels, valid_mode, if_top5=True): + if if_top5: + top1 = paddle.metric.accuracy(input=scores, label=labels, k=1) + top5 = paddle.metric.accuracy(input=scores, label=labels, k=5) + _, world_size = get_dist_info() + #NOTE(shipping): deal with multi cards validate + if world_size > 1 and valid_mode: #reduce sum when valid + paddle.distributed.all_reduce( + top1, op=paddle.distributed.ReduceOp.SUM) + top1 = top1 / world_size + paddle.distributed.all_reduce( + top5, op=paddle.distributed.ReduceOp.SUM) + top5 = top5 / world_size + + return top1, top5 + else: + top1 = paddle.metric.accuracy(input=scores, label=labels, k=1) + _, world_size = get_dist_info() + #NOTE(shipping): deal with multi cards validate + if world_size > 1 and valid_mode: #reduce sum when valid + paddle.distributed.all_reduce( + top1, op=paddle.distributed.ReduceOp.SUM) + top1 = top1 / world_size + + return top1 diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/bbox_head.py b/Bank_second_part/detect_process/paddlevideo/modeling/heads/bbox_head.py new file mode 100644 index 0000000..688251e --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/heads/bbox_head.py @@ -0,0 +1,225 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import numpy as np +from .. import builder + +from ..registry import HEADS + +@HEADS.register() +class BBoxHeadAVA(nn.Layer): + """Simplest RoI head, with only two fc layers for classification and + regression respectively. 
""" + + def __init__( + self, + temporal_pool_type='avg', + spatial_pool_type='max', + in_channels=2048, + num_classes=81,# The first class is reserved, to classify bbox as pos / neg + dropout_ratio=0, + dropout_before_pool=True, + topk=(3, 5), + multilabel=True): + + super(BBoxHeadAVA, self).__init__() + assert temporal_pool_type in ['max', 'avg'] + assert spatial_pool_type in ['max', 'avg'] + self.temporal_pool_type = temporal_pool_type + self.spatial_pool_type = spatial_pool_type + + self.in_channels = in_channels + self.num_classes = num_classes + + self.dropout_ratio = dropout_ratio + self.dropout_before_pool = dropout_before_pool + + self.multilabel = multilabel + if topk is None: + self.topk = () + elif isinstance(topk, int): + self.topk = (topk, ) + elif isinstance(topk, tuple): + assert all([isinstance(k, int) for k in topk]) + self.topk = topk + else: + raise TypeError('topk should be int or tuple[int], ' + f'but get {type(topk)}') + # Class 0 is ignored when calculaing multilabel accuracy, + # so topk cannot be equal to num_classes + assert all([k < num_classes for k in self.topk]) + assert self.multilabel + + in_channels = self.in_channels + if self.temporal_pool_type == 'avg': + self.temporal_pool = nn.AdaptiveAvgPool3D((1, None, None)) + else: + self.temporal_pool = nn.AdaptiveMaxPool3D((1, None, None)) + if self.spatial_pool_type == 'avg': + self.spatial_pool = nn.AdaptiveAvgPool3D((None, 1, 1)) + else: + self.spatial_pool = nn.AdaptiveMaxPool3D((None, 1, 1)) + + if dropout_ratio > 0: + self.dropout = nn.Dropout(dropout_ratio) + + weight_attr = paddle.framework.ParamAttr(name="weight", + initializer=paddle.nn.initializer.Normal(mean=0.0, std=0.01)) + bias_attr = paddle.ParamAttr(name="bias", + initializer=paddle.nn.initializer.Constant(value=0.0)) + + self.fc_cls = nn.Linear(in_channels, num_classes, weight_attr=weight_attr, bias_attr=bias_attr) + + self.debug_imgs = None + + def forward(self, x,rois, rois_num): + roi = paddle.concat(rois) + roi_x1 = paddle.index_select(roi, index=paddle.to_tensor(0), axis=1) + roi_x2 = paddle.index_select(roi, index=paddle.to_tensor(2), axis=1) + roi_w = roi_x2 - roi_x1 + roi_y1 = paddle.index_select(roi, index=paddle.to_tensor(1), axis=1) + roi_y2 = paddle.index_select(roi, index=paddle.to_tensor(3), axis=1) + roi_h = roi_y2 - roi_y1 + roi_area = paddle.multiply(roi_w, roi_h) + A = roi_area + A1 = paddle.full(A.shape, 1, dtype='int32') + A2 = paddle.where(A == 0, paddle.zeros_like(A1), A1) + AE = paddle.expand(A2, [A.shape[0], x.shape[1]]) + rois_num = paddle.to_tensor(rois_num, dtype='int32') + if self.dropout_before_pool and self.dropout_ratio > 0 : + x = self.dropout(x) + x = self.temporal_pool(x) + x = self.spatial_pool(x) + if not self.dropout_before_pool and self.dropout_ratio > 0 : + x = self.dropout(x) + x = paddle.reshape(x, [x.shape[0], -1]) + x = paddle.multiply(x, paddle.cast(AE,"float32")) + cls_score = self.fc_cls(x) + # We do not predict bbox, so return None + return cls_score, None + + def get_targets(self, sampling_results, gt_bboxes, gt_labels, pos_weight): + pos_proposals = [res.pos_bboxes for res in sampling_results] + neg_proposals = [res.neg_bboxes for res in sampling_results] + pos_gt_labels = [res.pos_gt_labels for res in sampling_results] + cls_reg_targets = self.bbox_target(pos_proposals, neg_proposals, + pos_gt_labels, pos_weight) + return cls_reg_targets + + def bbox_target(self, pos_bboxes_list, neg_bboxes_list, gt_labels, pos_weight): + """Generate classification targets for bboxes. 
""" + labels, label_weights = [], [] + pos_weight = 1.0 if pos_weight <= 0 else pos_weight + + assert len(pos_bboxes_list) == len(neg_bboxes_list) == len(gt_labels) + length = len(pos_bboxes_list) + + for i in range(length): + pos_bboxes = pos_bboxes_list[i] + neg_bboxes = neg_bboxes_list[i] + gt_label = gt_labels[i] + num_pos = pos_bboxes.shape[0] + if neg_bboxes is not None: + num_neg = neg_bboxes.shape[0] + else: + num_neg = 0 + num_samples = num_pos + num_neg + neg_label = paddle.zeros([num_neg, gt_label.shape[1]]) + label = paddle.concat([gt_label,neg_label]) + labels.append(label) + + labels = paddle.concat(labels, 0) + return labels + + def recall_prec(self, pred_vec, target_vec): + correct = paddle.to_tensor(np.logical_and(pred_vec.numpy(), target_vec.numpy())) + correct = paddle.where(correct, + paddle.full(correct.shape,1,dtype='int32'), + paddle.full(correct.shape,0,dtype='int32')) + recall_correct = paddle.cast(paddle.sum(correct, axis=1), 'float32') + target_vec = paddle.where(target_vec, + paddle.full(target_vec.shape,1,dtype='int32'), + paddle.full(target_vec.shape,0,dtype='int32')) + recall_target = paddle.cast(paddle.sum(target_vec, axis=1),'float32') + recall = recall_correct / recall_target + pred_vec = paddle.where(pred_vec, + paddle.full(pred_vec.shape,1,dtype='int32'), + paddle.full(pred_vec.shape,0,dtype='int32')) + prec_target = paddle.cast(paddle.sum(pred_vec, axis=1) + 1e-6, 'float32') + prec = recall_correct / prec_target + recall_mean = paddle.mean(recall) + prec_mean = paddle.mean(prec) + return recall_mean, prec_mean + + def multilabel_accuracy(self, pred, target, thr=0.5): + pred = paddle.nn.functional.sigmoid(pred) + pred_vec = pred > thr + target_vec = target > 0.5 + recall_thr, prec_thr = self.recall_prec(pred_vec, target_vec) + recalls, precs = [], [] + for k in self.topk: + _, pred_label = paddle.topk(pred, k, 1, True, True) + pred_vec = paddle.full(pred.shape,0,dtype='bool') + num_sample = pred.shape[0] + for i in range(num_sample): + pred_vec[i, pred_label[i].numpy()] = 1 + recall_k, prec_k = self.recall_prec(pred_vec, target_vec) + recalls.append(recall_k) + precs.append(prec_k) + return recall_thr, prec_thr, recalls, precs + + def loss(self, + cls_score, + labels): + losses = dict() + if cls_score is not None: + # Only use the cls_score + labels = labels[:, 1:] + pos_inds_bool = paddle.sum(labels, axis=-1) > 0 + pos_inds = paddle.where(paddle.sum(labels, axis=-1) > 0, + paddle.full([labels.shape[0]],1,dtype='int32'), + paddle.full([labels.shape[0]],0,dtype='int32')) + pos_inds = paddle.nonzero(pos_inds, as_tuple=False) + cls_score = paddle.index_select(cls_score, pos_inds, axis=0) + cls_score = cls_score[:, 1:] + labels = paddle.index_select(labels, pos_inds, axis=0) + bce_loss = F.binary_cross_entropy_with_logits + loss = bce_loss(cls_score, labels, reduction='none') + losses['loss'] = paddle.mean(loss) + recall_thr, prec_thr, recall_k, prec_k = self.multilabel_accuracy( + cls_score, labels, thr=0.5) + losses['recall@thr=0.5'] = recall_thr + losses['prec@thr=0.5'] = prec_thr + for i, k in enumerate(self.topk): + losses[f'recall@top{k}'] = recall_k[i] + losses[f'prec@top{k}'] = prec_k[i] + return losses + + def get_det_bboxes(self, + rois, + cls_score, + img_shape, + flip=False, + crop_quadruple=None, + cfg=None): + if isinstance(cls_score, list): + cls_score = sum(cls_score) / float(len(cls_score)) + assert self.multilabel + m = paddle.nn.Sigmoid() + scores = m(cls_score) + bboxes = rois + return bboxes, scores diff --git 
a/Bank_second_part/detect_process/paddlevideo/modeling/heads/cfbi_head.py b/Bank_second_part/detect_process/paddlevideo/modeling/heads/cfbi_head.py new file mode 100644 index 0000000..f7cbd91 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/heads/cfbi_head.py @@ -0,0 +1,448 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from .base import BaseHead +from ..registry import HEADS +from ..weight_init import weight_init_ + + +class IA_gate(nn.Layer): + def __init__(self, in_dim, out_dim): + super(IA_gate, self).__init__() + self.IA = nn.Linear(in_dim, out_dim) + + def forward(self, x, IA_head): + a = self.IA(IA_head) + a = 1. + paddle.tanh(a) + a = paddle.unsqueeze(paddle.unsqueeze(a, axis=-1), axis=-1) + x = a * x + return x + + +class GCT(nn.Layer): + def __init__(self, num_channels, epsilon=1e-5, mode='l2', after_relu=False): + super(GCT, self).__init__() + x1 = paddle.zeros([1, num_channels, 1, 1]) + x2 = paddle.ones([1, num_channels, 1, 1]) + self.alpha = paddle.create_parameter( + shape=x2.shape, + dtype=x2.dtype, + default_initializer=nn.initializer.Assign(x2)) + self.alpha.stop_gradient = False + self.gamma = paddle.create_parameter( + shape=x1.shape, + dtype=x1.dtype, + default_initializer=nn.initializer.Assign(x1)) + self.gamma.stop_gradient = False + self.beta = paddle.create_parameter( + shape=x1.shape, + dtype=x1.dtype, + default_initializer=nn.initializer.Assign(x1)) + self.beta.stop_gradient = False + + self.epsilon = epsilon + self.mode = mode + self.after_relu = after_relu + + def forward(self, x): + + if self.mode == 'l2': + embedding = paddle.pow( + paddle.sum(paddle.pow(x, 2), axis=[2, 3], keepdim=True) + + self.epsilon, 0.5) * self.alpha + norm = self.gamma / paddle.pow( + (paddle.mean(paddle.pow(embedding, 2), axis=1, keepdim=True) + + self.epsilon), 0.5) + elif self.mode == 'l1': + if not self.after_relu: + _x = paddle.abs(x) + else: + _x = x + embedding = paddle.sum(_x, axis=(2, 3), keepdim=True) * self.alpha + norm = self.gamma / (paddle.mean( + paddle.abs(embedding), axis=1, keepdim=True) + self.epsilon) + else: + print('Unknown mode!') + exit() + + gate = 1. 
+ paddle.tanh(embedding * norm + self.beta) + + return x * gate + + +class Bottleneck(nn.Layer): + def __init__(self, inplanes, outplanes, stride=1, dilation=1): + super(Bottleneck, self).__init__() + expansion = 4 + planes = int(outplanes / expansion) + + self.GCT1 = GCT(inplanes) + self.conv1 = nn.Conv2D(inplanes, planes, kernel_size=1, bias_attr=False) + self.bn1 = nn.GroupNorm(num_groups=32, num_channels=planes) + + self.conv2 = nn.Conv2D(planes, + planes, + kernel_size=3, + stride=stride, + dilation=dilation, + padding=dilation, + bias_attr=False) + self.bn2 = nn.GroupNorm(num_groups=32, num_channels=planes) + + self.conv3 = nn.Conv2D(planes, + planes * expansion, + kernel_size=1, + bias_attr=False) + self.bn3 = nn.GroupNorm(num_groups=32, num_channels=planes * expansion) + self.relu = nn.ReLU() + if stride != 1 or inplanes != planes * expansion: + downsample = nn.Sequential( + nn.Conv2D(inplanes, + planes * expansion, + kernel_size=1, + stride=stride, + bias_attr=False), + nn.GroupNorm(num_groups=32, num_channels=planes * expansion), + ) + else: + downsample = None + self.downsample = downsample + + self.stride = stride + self.dilation = dilation + + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + nn.initializer.KaimingNormal() + + def forward(self, x): + residual = x + + out = self.GCT1(x) + out = self.conv1(out) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class _ASPPModule(nn.Layer): + def __init__(self, inplanes, planes, kernel_size, padding, dilation): + super(_ASPPModule, self).__init__() + self.GCT = GCT(inplanes) + self.atrous_conv = nn.Conv2D(inplanes, + planes, + kernel_size=kernel_size, + stride=1, + padding=padding, + dilation=dilation, + bias_attr=False) + self.bn = nn.GroupNorm(num_groups=int(planes / 4), num_channels=planes) + self.relu = nn.ReLU() + + self._init_weight() + + def forward(self, x): + x = self.GCT(x) + x = self.atrous_conv(x) + x = self.bn(x) + + return self.relu(x) + + def _init_weight(self): + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + nn.initializer.KaimingNormal() + elif isinstance(m, nn.GroupNorm): + m.weight.data = nn.initializer.Constant(1) + m.bias.data = nn.initializer.Constant(0) + + +class ASPP(nn.Layer): + def __init__(self): + super(ASPP, self).__init__() + + inplanes = 512 + dilations = [1, 6, 12, 18] + + self.aspp1 = _ASPPModule(inplanes, + 128, + 1, + padding=0, + dilation=dilations[0]) + self.aspp2 = _ASPPModule(inplanes, + 128, + 3, + padding=dilations[1], + dilation=dilations[1]) + self.aspp3 = _ASPPModule(inplanes, + 128, + 3, + padding=dilations[2], + dilation=dilations[2]) + self.aspp4 = _ASPPModule(inplanes, + 128, + 3, + padding=dilations[3], + dilation=dilations[3]) + + self.global_avg_pool = nn.Sequential( + nn.AdaptiveAvgPool2D((1, 1)), + nn.Conv2D(inplanes, 128, 1, stride=1, bias_attr=False), nn.ReLU()) + + self.GCT = GCT(640) + self.conv1 = nn.Conv2D(640, 256, 1, bias_attr=False) + self.bn1 = nn.GroupNorm(num_groups=32, num_channels=256) + self.relu = nn.ReLU() + self._init_weight() + + def forward(self, x): + x1 = self.aspp1(x) + x2 = self.aspp2(x) + x3 = self.aspp3(x) + x4 = self.aspp4(x) + x5 = self.global_avg_pool(x) + x5 = F.interpolate(x5, + size=x4.shape[2:], + mode='bilinear', + align_corners=True) + x = paddle.concat([x1, x2, x3, x4, x5], 
axis=1) + + x = self.GCT(x) + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + + return x + + def _init_weight(self): + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + nn.initializer.KaimingNormal() + elif isinstance(m, nn.GroupNorm): + m.weight.data = nn.initializer.Constant(1) + m.bias.data = nn.initializer.Constant(0) + + +@HEADS.register() +class CollaborativeEnsemblerMS(nn.Layer): + def __init__( + self, + model_semantic_embedding_dim=256, + model_multi_local_distance=[[4, 8, 12, 16, 20, 24], + [2, 4, 6, 8, 10, 12], [2, 4, 6, 8, 10]], + model_head_embedding_dim=256, + model_refine_channels=64, + model_low_level_inplanes=256, + ): + super(CollaborativeEnsemblerMS, self).__init__() + in_dim_4x = model_semantic_embedding_dim * 3 + 3 + 2 * len( + model_multi_local_distance[0]) + in_dim_8x = model_semantic_embedding_dim * 3 + 3 + 2 * len( + model_multi_local_distance[1]) + in_dim_16x = model_semantic_embedding_dim * 3 + 3 + 2 * len( + model_multi_local_distance[2]) + attention_dim = model_semantic_embedding_dim * 4 + embed_dim = model_head_embedding_dim + refine_dim = model_refine_channels + low_level_dim = model_low_level_inplanes + + IA_in_dim = attention_dim + + self.relu = nn.ReLU() + + # stage 1 + + self.S1_IA1 = IA_gate(IA_in_dim, in_dim_4x) + self.S1_layer1 = Bottleneck(in_dim_4x, embed_dim) + + self.S1_IA2 = IA_gate(IA_in_dim, embed_dim) + self.S1_layer2 = Bottleneck(embed_dim, embed_dim, 1, 2) + + # stage2 + self.S2_IA1 = IA_gate(IA_in_dim, embed_dim) + self.S2_layer1 = Bottleneck(embed_dim, embed_dim * 2, 2) + + self.S2_IA2 = IA_gate(IA_in_dim, embed_dim * 2 + in_dim_8x) + self.S2_layer2 = Bottleneck(embed_dim * 2 + in_dim_8x, embed_dim * 2, 1, + 2) + + self.S2_IA3 = IA_gate(IA_in_dim, embed_dim * 2) + self.S2_layer3 = Bottleneck(embed_dim * 2, embed_dim * 2, 1, 4) + + # stage3 + self.S3_IA1 = IA_gate(IA_in_dim, embed_dim * 2) + self.S3_layer1 = Bottleneck(embed_dim * 2, embed_dim * 2, 2) + + self.S3_IA2 = IA_gate(IA_in_dim, embed_dim * 2 + in_dim_16x) + self.S3_layer2 = Bottleneck(embed_dim * 2 + in_dim_16x, embed_dim * 2, + 1, 2) + + self.S3_IA3 = IA_gate(IA_in_dim, embed_dim * 2) + self.S3_layer3 = Bottleneck(embed_dim * 2, embed_dim * 2, 1, 4) + + self.ASPP_IA = IA_gate(IA_in_dim, embed_dim * 2) + self.ASPP = ASPP() + + # Decoder + self.GCT_sc = GCT(low_level_dim + embed_dim) + self.conv_sc = nn.Conv2D(low_level_dim + embed_dim, + refine_dim, + 1, + bias_attr=False) + self.bn_sc = nn.GroupNorm(num_groups=int(refine_dim / 4), + num_channels=refine_dim) + self.relu = nn.ReLU() + + self.IA10 = IA_gate(IA_in_dim, embed_dim + refine_dim) + self.conv1 = nn.Conv2D(embed_dim + refine_dim, + int(embed_dim / 2), + kernel_size=3, + padding=1, + bias_attr=False) + self.bn1 = nn.GroupNorm(num_groups=32, num_channels=int(embed_dim / 2)) + + self.IA11 = IA_gate(IA_in_dim, int(embed_dim / 2)) + self.conv2 = nn.Conv2D(int(embed_dim / 2), + int(embed_dim / 2), + kernel_size=3, + padding=1, + bias_attr=False) + self.bn2 = nn.GroupNorm(num_groups=32, num_channels=int(embed_dim / 2)) + + # Output + self.IA_final_fg = nn.Linear(IA_in_dim, int(embed_dim / 2) + 1) + self.IA_final_bg = nn.Linear(IA_in_dim, int(embed_dim / 2) + 1) + + self.conv_sc.weight.data = nn.initializer.KaimingNormal() + self.conv1.weight.data = nn.initializer.KaimingNormal() + self.conv2.weight.data = nn.initializer.KaimingNormal() + + def forward(self, all_x, all_IA_head=None, low_level_feat=None): + x_4x, x_8x, x_16x = all_x + IA_head = all_IA_head[0] + + # stage 1 + x = self.S1_IA1(x_4x, IA_head) + x = 
self.S1_layer1(x) + + x = self.S1_IA2(x, IA_head) + x = self.S1_layer2(x) + + low_level_feat = paddle.concat( + [paddle.expand(low_level_feat, [x.shape[0], -1, -1, -1]), x], + axis=1) + + # stage 2 + x = self.S2_IA1(x, IA_head) + x = self.S2_layer1(x) + + x = paddle.concat([x, x_8x], axis=1) + x = self.S2_IA2(x, IA_head) + x = self.S2_layer2(x) + + x = self.S2_IA3(x, IA_head) + x = self.S2_layer3(x) + + # stage 3 + x = self.S3_IA1(x, IA_head) + x = self.S3_layer1(x) + + x = paddle.concat([x, x_16x], axis=1) + x = self.S3_IA2(x, IA_head) + x = self.S3_layer2(x) + + x = self.S3_IA3(x, IA_head) + x = self.S3_layer3(x) + + # ASPP + Decoder + x = self.ASPP_IA(x, IA_head) + x = self.ASPP(x) + + x = self.decoder(x, low_level_feat, IA_head) + + fg_logit = self.IA_logit(x, IA_head, self.IA_final_fg) + bg_logit = self.IA_logit(x, IA_head, self.IA_final_bg) + + pred = self.augment_background_logit(fg_logit, bg_logit) + + return pred + + def IA_logit(self, x, IA_head, IA_final): + n, c, h, w = x.shape + x = paddle.reshape(x, [1, n * c, h, w]) + IA_output = IA_final(IA_head) + IA_weight = IA_output[:, :c] + IA_bias = IA_output[:, -1] + IA_weight = paddle.reshape(IA_weight, [n, c, 1, 1]) + + IA_bias = paddle.reshape(IA_bias, [-1]) + logit = paddle.reshape( + F.conv2d(x, weight=IA_weight, bias=IA_bias, groups=n), [n, 1, h, w]) + return logit + + def decoder(self, x, low_level_feat, IA_head): + x = F.interpolate(x, + size=low_level_feat.shape[2:], + mode='bicubic', + align_corners=True) + + low_level_feat = self.GCT_sc(low_level_feat) + low_level_feat = self.conv_sc(low_level_feat) + low_level_feat = self.bn_sc(low_level_feat) + low_level_feat = self.relu(low_level_feat) + + x = paddle.concat([x, low_level_feat], axis=1) + x = self.IA10(x, IA_head) + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + + x = self.IA11(x, IA_head) + x = self.conv2(x) + x = self.bn2(x) + x = self.relu(x) + + return x + + def augment_background_logit(self, fg_logit, bg_logit): + # We augment the logit of absolute background by using the relative background logit of all the + # foreground objects. + obj_num = fg_logit.shape[0] + pred = fg_logit + if obj_num > 1: + bg_logit = bg_logit[1:obj_num, :, :, :] + aug_bg_logit = paddle.min(bg_logit, axis=0, keepdim=True) + pad = paddle.expand(paddle.zeros(aug_bg_logit.shape), + [obj_num - 1, -1, -1, -1]) + aug_bg_logit = paddle.concat([aug_bg_logit, pad], axis=0) + pred = pred + aug_bg_logit + pred = paddle.transpose(pred, [1, 0, 2, 3]) + return pred diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/ctrgcn_head.py b/Bank_second_part/detect_process/paddlevideo/modeling/heads/ctrgcn_head.py new file mode 100644 index 0000000..c551d0d --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/heads/ctrgcn_head.py @@ -0,0 +1,65 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
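Note: the background augmentation at the end of CollaborativeEnsemblerMS is the least obvious step: the absolute-background logit (object 0) is strengthened by the most background-like relative logit among the other objects. A NumPy sketch of that idea with hypothetical shapes; it mirrors augment_background_logit above but is only an illustration:

import numpy as np

def augment_background_logit(fg_logit, bg_logit):
    # fg_logit, bg_logit: [obj_num, 1, H, W]; index 0 is the absolute background
    obj_num = fg_logit.shape[0]
    pred = fg_logit.copy()
    if obj_num > 1:
        aug = bg_logit[1:obj_num].min(axis=0, keepdims=True)    # most background-like object
        pad = np.zeros((obj_num - 1,) + aug.shape[1:])          # other objects stay unchanged
        pred = pred + np.concatenate([aug, pad], axis=0)
    return pred.transpose(1, 0, 2, 3)                           # [1, obj_num, H, W]

print(augment_background_logit(np.zeros((3, 1, 4, 4)), np.random.rand(3, 1, 4, 4)).shape)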
+ +import math +import paddle +import paddle.nn as nn + +from .base import BaseHead +from ..registry import HEADS +from ..weight_init import weight_init_ + + +@HEADS.register() +class CTRGCNHead(BaseHead): + """ + Head for CTR-GCN model. + Args: + in_channels: int, input feature channels. Default: 64. + num_classes: int, output the number of classes. + drop_out: float, dropout ratio of layer. Default: 0. + """ + + def __init__(self, in_channels=64, num_classes=10, drop_out=0, **kwargs): + super().__init__(num_classes, in_channels, **kwargs) + self.in_channels = in_channels + self.drop_out = drop_out + + self.fc = nn.Linear(self.in_channels * 4, self.num_classes) + if drop_out: + self.drop_out = nn.Dropout(self.drop_out) + else: + self.drop_out = lambda x: x + + def init_weights(self): + """Initiate the parameters. + """ + for layer in self.sublayers(): + if isinstance(layer, nn.Conv2D): + weight_init_(layer.weight, + 'Normal', + mean=0.0, + std=math.sqrt(2. / self.num_classes)) + + def forward(self, output_patch): + """Define how the head is going to run. + """ + x, N, M = output_patch + # N*M,C,T,V + _, c_new, T, V = x.shape + x = paddle.reshape(x, shape=[N, M, c_new, T * V]) + x = x.mean(3).mean(1) + x = self.drop_out(x) + + return self.fc(x) diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/i3d_head.py b/Bank_second_part/detect_process/paddlevideo/modeling/heads/i3d_head.py new file mode 100644 index 0000000..269c818 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/heads/i3d_head.py @@ -0,0 +1,95 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +from paddle import ParamAttr + +from ..registry import HEADS +from ..weight_init import weight_init_ +from .base import BaseHead + + +@HEADS.register() +class I3DHead(BaseHead): + """Classification head for I3D. + + Args: + num_classes (int): Number of classes to be classified. + in_channels (int): Number of channels in input feature. + loss_cls (dict): Config for building loss. + Default: dict(name='CrossEntropyLoss') + spatial_type (str): Pooling type in spatial dimension. Default: 'avg'. + drop_ratio (float): Probability of dropout layer. Default: 0.5. + std (float): Std value for Initiation. Default: 0.01. + kwargs (dict, optional): Any keyword argument to be used to initialize + the head. 
+ """ + def __init__(self, + num_classes, + in_channels, + loss_cfg=dict(name='CrossEntropyLoss'), + spatial_type='avg', + drop_ratio=0.5, + std=0.01, + **kwargs): + + super().__init__(num_classes, in_channels, loss_cfg, **kwargs) + + self.spatial_type = spatial_type + self.drop_ratio = drop_ratio + self.stdv = std + if self.drop_ratio != 0: + self.dropout = nn.Dropout(p=self.drop_ratio) + else: + self.dropout = None + self.fc = nn.Linear( + self.in_channels, + self.num_classes, + weight_attr=ParamAttr(learning_rate=10.0), + bias_attr=ParamAttr(learning_rate=10.0), + ) + + if self.spatial_type == 'avg': + # use `nn.AdaptiveAvgPool3d` to adaptively match the in_channels. + self.avg_pool = nn.AdaptiveAvgPool3D((1, 1, 1)) + else: + self.avg_pool = None + + def init_weights(self): + """Initiate the parameters from scratch.""" + weight_init_(self.fc, 'Normal', 'fc_0.w_0', 'fc_0.b_0', std=self.stdv) + + def forward(self, x): + """Defines the computation performed at every call. + + Args: + x (torch.Tensor): The input data. + + Returns: + torch.Tensor: The classification scores for input samples. + """ + # [N, in_channels, 4, 7, 7] + if self.avg_pool is not None: + x = self.avg_pool(x) + # [N, in_channels, 1, 1, 1] + if self.dropout is not None: + x = self.dropout(x) + # [N, in_channels, 1, 1, 1] + N = paddle.shape(x)[0] + x = x.reshape([N, -1]) + # [N, in_channels] + cls_score = self.fc(x) + # [N, num_classes] + return cls_score diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/movinet_head.py b/Bank_second_part/detect_process/paddlevideo/modeling/heads/movinet_head.py new file mode 100644 index 0000000..924b014 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/heads/movinet_head.py @@ -0,0 +1,15 @@ +import collections.abc + +container_abcs = collections.abc +from ..registry import HEADS +from .base import BaseHead +from ..builder import build_loss + + +@HEADS.register() +class MoViNetHead(BaseHead): + def __init__(self): + super().__init__() + + def forward(self, x, *args): + return x diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/ms_tcn_head.py b/Bank_second_part/detect_process/paddlevideo/modeling/heads/ms_tcn_head.py new file mode 100644 index 0000000..e0f435f --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/heads/ms_tcn_head.py @@ -0,0 +1,165 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import numpy as np + +from paddle import ParamAttr + +from .base import BaseHead +from ..registry import HEADS +from ..weight_init import weight_init_ + + +@HEADS.register() +class MSTCNHead(BaseHead): + + def __init__(self, num_classes, in_channels): + super().__init__(num_classes, in_channels) + self.ce = nn.CrossEntropyLoss(ignore_index=-100) + self.mse = nn.MSELoss(reduction='none') + self.num_classes = num_classes + + # cls score + self.overlap = 0.5 + + def forward(self, x): + """MS-TCN no head + """ + return x + + def loss(self, output, video_gt): + """calculate loss + """ + output_transpose = paddle.transpose(output, [2, 0, 1]) + ce_x = paddle.reshape(output_transpose, + (output_transpose.shape[0] * + output_transpose.shape[1], self.num_classes)) + ce_y = video_gt[0, :] + ce_loss = self.ce(ce_x, ce_y) + loss = ce_loss + + mse = self.mse(F.log_softmax(output[:, :, 1:], axis=1), + F.log_softmax(output.detach()[:, :, :-1], axis=1)) + mse = paddle.clip(mse, min=0, max=16) + mse_loss = 0.15 * paddle.mean(mse) + loss += mse_loss + + return loss + + def get_F1_score(self, predicted, groundTruth): + recog_content = list(predicted.numpy()) + gt_content = list(groundTruth[0].numpy()) + + # cls score + correct = 0 + total = 0 + edit = 0 + + for i in range(len(gt_content)): + total += 1 + + if gt_content[i] == recog_content[i]: + correct += 1 + + edit_num = self.edit_score(recog_content, gt_content) + edit += edit_num + + tp, fp, fn = self.f_score(recog_content, gt_content, self.overlap) + + # cls metric + + precision = tp / float(tp + fp) + recall = tp / float(fp + fn) + + if precision + recall > 0.0: + f1 = 2.0 * (precision * recall) / (precision + recall) + else: + f1 = 0.0 + f1 = np.nan_to_num(f1) + return f1 + + def get_labels_start_end_time(self, frame_wise_labels): + labels = [] + starts = [] + ends = [] + last_label = frame_wise_labels[0] + labels.append(frame_wise_labels[0]) + starts.append(0) + for i in range(len(frame_wise_labels)): + if frame_wise_labels[i] != last_label: + labels.append(frame_wise_labels[i]) + starts.append(i) + ends.append(i) + last_label = frame_wise_labels[i] + ends.append(i + 1) + return labels, starts, ends + + def levenstein(self, p, y, norm=False): + m_row = len(p) + n_col = len(y) + D = np.zeros([m_row + 1, n_col + 1], np.float) + for i in range(m_row + 1): + D[i, 0] = i + for i in range(n_col + 1): + D[0, i] = i + + for j in range(1, n_col + 1): + for i in range(1, m_row + 1): + if y[j - 1] == p[i - 1]: + D[i, j] = D[i - 1, j - 1] + else: + D[i, j] = min(D[i - 1, j] + 1, D[i, j - 1] + 1, + D[i - 1, j - 1] + 1) + + if norm: + score = (1 - D[-1, -1] / max(m_row, n_col)) * 100 + else: + score = D[-1, -1] + + return score + + def edit_score(self, recognized, ground_truth, norm=True): + P, _, _ = self.get_labels_start_end_time(recognized) + Y, _, _ = self.get_labels_start_end_time(ground_truth) + return self.levenstein(P, Y, norm) + + def f_score(self, recognized, ground_truth, overlap): + p_label, p_start, p_end = self.get_labels_start_end_time(recognized) + y_label, y_start, y_end = self.get_labels_start_end_time(ground_truth) + + tp = 0 + fp = 0 + + hits = np.zeros(len(y_label)) + + for j in range(len(p_label)): + intersection = np.minimum(p_end[j], y_end) - np.maximum( + p_start[j], y_start) + union = np.maximum(p_end[j], y_end) - np.minimum( + p_start[j], y_start) + IoU = (1.0 * intersection / union) * ( + [p_label[j] == y_label[x] for x in range(len(y_label))]) + # Get the best 
scoring segment + idx = np.array(IoU).argmax() + + if IoU[idx] >= overlap and not hits[idx]: + tp += 1 + hits[idx] = 1 + else: + fp += 1 + fn = len(y_label) - sum(hits) + return float(tp), float(fp), float(fn) diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/pptimesformer_head.py b/Bank_second_part/detect_process/paddlevideo/modeling/heads/pptimesformer_head.py new file mode 100644 index 0000000..113bde8 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/heads/pptimesformer_head.py @@ -0,0 +1,74 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.nn import Linear + +from ..registry import HEADS +from ..weight_init import trunc_normal_, weight_init_ +from .base import BaseHead +from paddle import ParamAttr +from paddle.regularizer import L2Decay + + +@HEADS.register() +class ppTimeSformerHead(BaseHead): + """TimeSformerHead Head. + + Args: + num_classes (int): The number of classes to be classified. + in_channels (int): The number of channles in input feature. + loss_cfg (dict): Config for building config. Default: dict(name='CrossEntropyLoss'). + std(float): Std(Scale) value in normal initilizar. Default: 0.01. + kwargs (dict, optional): Any keyword argument to initialize. + + """ + def __init__(self, + num_classes, + in_channels, + loss_cfg=dict(name='CrossEntropyLoss'), + std=0.02, + **kwargs): + + super().__init__(num_classes, in_channels, loss_cfg, **kwargs) + self.std = std + self.fc = Linear(self.in_channels, + self.num_classes, + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + + def init_weights(self): + """Initiate the FC layer parameters""" + + weight_init_(self.fc, + 'TruncatedNormal', + 'fc_0.w_0', + 'fc_0.b_0', + mean=0.0, + std=self.std) + # NOTE: Temporarily use trunc_normal_ instead of TruncatedNormal + trunc_normal_(self.fc.weight, std=self.std) + + def forward(self, x): + """Define how the head is going to run. + Args: + x (paddle.Tensor): The input data. + Returns: + score: (paddle.Tensor) The classification scores for input samples. + """ + # XXX: check dropout location! + # x.shape = [N, embed_dim] + + score = self.fc(x) + # [N, num_class] + # x = F.softmax(x) # NOTE remove + return score diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/pptsm_head.py b/Bank_second_part/detect_process/paddlevideo/modeling/heads/pptsm_head.py new file mode 100644 index 0000000..45f50fd --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/heads/pptsm_head.py @@ -0,0 +1,92 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddle import ParamAttr +from paddle.nn import Linear +from paddle.regularizer import L2Decay +from .tsn_head import TSNHead +from ..registry import HEADS + +from ..weight_init import weight_init_ + + +@HEADS.register() +class ppTSMHead(TSNHead): + """ ppTSM Head + Args: + num_classes (int): The number of classes to be classified. + in_channels (int): The number of channles in input feature. + loss_cfg (dict): Config for building config. Default: dict(name='CrossEntropyLoss'). + drop_ratio(float): drop ratio. Default: 0.8. + std(float): Std(Scale) value in normal initilizar. Default: 0.001. + kwargs (dict, optional): Any keyword argument to initialize. + """ + def __init__( + self, + num_classes, + in_channels, # NOTE: 2048 for >= R50, 512 for <= R34 + drop_ratio=0.8, + std=0.01, + data_format="NCHW", + num_seg=8, + **kwargs): + + super().__init__(num_classes, + in_channels, + drop_ratio=drop_ratio, + std=std, + data_format=data_format, + **kwargs) + + self.fc = Linear(self.in_channels, + self.num_classes, + weight_attr=ParamAttr(learning_rate=5.0, + regularizer=L2Decay(1e-4)), + bias_attr=ParamAttr(learning_rate=10.0, + regularizer=L2Decay(0.0))) + self.stdv = std + self.num_seg = num_seg + + def init_weights(self): + """Initiate the FC layer parameters""" + weight_init_(self.fc, 'Normal', 'fc_0.w_0', 'fc_0.b_0', std=self.stdv) + + def forward(self, x, num_seg=None): + """Define how the head is going to run. + Args: + x (paddle.Tensor): The input data. + num_segs (int): Number of segments. + Returns: + score: (paddle.Tensor) The classification scores for input samples. + """ + + #XXX: check dropout location! + # [N * num_segs, in_channels, 7, 7] + x = self.avgpool2d(x) + # [N * num_segs, in_channels, 1, 1] + if self.dropout is not None: + x = self.dropout(x) + # [N * num_seg, in_channels, 1, 1] + num_seg = num_seg if num_seg is not None else self.num_seg + x = paddle.reshape(x, [-1, num_seg, x.shape[1]]) + # [N, num_seg, in_channels] + x = paddle.mean(x, axis=1) + # [N, in_channels] + x = paddle.reshape(x, shape=[-1, self.in_channels]) + # [N, in_channels] + score = self.fc(x) + # [N, num_class] + #x = F.softmax(x) #NOTE remove + return score diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/pptsn_head.py b/Bank_second_part/detect_process/paddlevideo/modeling/heads/pptsn_head.py new file mode 100644 index 0000000..2655c90 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/heads/pptsn_head.py @@ -0,0 +1,103 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
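# A small standalone illustration (not part of this diff) of the temporal consensus used by
# ppTSMHead above and ppTSNHead below: per-segment pooled features of shape [N * num_seg, C]
# are regrouped to [N, num_seg, C] and averaged over the segment axis, giving one feature
# vector per video before the FC classifier. Shapes here are arbitrary example values.
import paddle

N, num_seg, C = 2, 8, 512
seg_feats = paddle.randn([N * num_seg, C])                   # pooled per-segment features
video_feats = paddle.mean(paddle.reshape(seg_feats, [N, num_seg, C]), axis=1)
print(video_feats.shape)                                      # [2, 512] -> one vector per video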
+ +import paddle +from paddle import ParamAttr +from paddle.nn import AdaptiveAvgPool2D, Linear, Dropout +from paddle.regularizer import L2Decay +from .base import BaseHead +from ..registry import HEADS +from ..weight_init import weight_init_ + + +@HEADS.register() +class ppTSNHead(BaseHead): + """ppTSN Head. + + Args: + num_classes (int): The number of classes to be classified. + in_channels (int): The number of channles in input feature. + loss_cfg (dict): Config for building config. Default: dict(name='CrossEntropyLoss'). + drop_ratio(float): drop ratio. Default: 0.4. + std(float): Std(Scale) value in normal initilizar. Default: 0.01. + data_format(str): data format of input tensor in ['NCHW', 'NHWC']. Default: 'NCHW'. + fclr5(bool): Whether to increase the learning rate of the fully connected layer. Default: True + kwargs (dict, optional): Any keyword argument to initialize. + + """ + def __init__(self, + num_classes, + in_channels, + loss_cfg=dict(name='CrossEntropyLoss'), + drop_ratio=0.4, + std=0.01, + data_format="NCHW", + fclr5=True, + **kwargs): + + super().__init__(num_classes, in_channels, loss_cfg, **kwargs) + self.drop_ratio = drop_ratio + self.std = std + + # NOTE: global pool performance + self.avgpool2d = AdaptiveAvgPool2D((1, 1), data_format=data_format) + + if self.drop_ratio != 0: + self.dropout = Dropout(p=self.drop_ratio) + else: + self.dropout = None + self.fc = Linear( + self.in_channels, + self.num_classes, + weight_attr=ParamAttr(learning_rate=5.0 if fclr5 else 1.0, + regularizer=L2Decay(1e-4)), + bias_attr=ParamAttr(learning_rate=10.0 if fclr5 else 1.0, + regularizer=L2Decay(0.0))) + + def init_weights(self): + """Initiate the FC layer parameters""" + weight_init_(self.fc, + 'Normal', + 'fc_0.w_0', + 'fc_0.b_0', + mean=0., + std=self.std) + + def forward(self, x, num_seg=8): + """Define how the head is going to run. + + Args: + x (paddle.Tensor): The input data. + num_segs (int): Number of segments. + Returns: + score: (paddle.Tensor) The classification scores for input samples. + """ + + # XXX: check dropout location! + # [N * num_segs, in_channels, 7, 7] + x = self.avgpool2d(x) + # [N * num_segs, in_channels, 1, 1] + x = paddle.reshape(x, [-1, num_seg, x.shape[1]]) + # [N, num_seg, in_channels] + x = paddle.mean(x, axis=1) + # [N, in_channels] + if self.dropout is not None: + x = self.dropout(x) + # [N, in_channels] + x = paddle.reshape(x, shape=[-1, self.in_channels]) + # [N, in_channels] + score = self.fc(x) + # [N, num_class] + # x = F.softmax(x) # NOTE remove + return score diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/roi_extractor.py b/Bank_second_part/detect_process/paddlevideo/modeling/heads/roi_extractor.py new file mode 100644 index 0000000..3aaef23 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/heads/roi_extractor.py @@ -0,0 +1,53 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
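# A hedged usage sketch (not part of this diff) of paddle.vision.ops.roi_align, which the
# RoIAlign wrapper defined below delegates to: boxes are [x1, y1, x2, y2] in input-image
# coordinates and are projected onto the feature map by spatial_scale. Tensor sizes and the
# output_size value are example assumptions, not taken from the repo's configs.
import paddle

feat = paddle.randn([1, 256, 14, 14])                         # [N, C, H, W] feature map
boxes = paddle.to_tensor([[0., 0., 112., 112.]])              # one RoI in image coordinates
boxes_num = paddle.to_tensor([1], dtype='int32')              # number of RoIs per image
roi_feat = paddle.vision.ops.roi_align(
    feat, boxes, boxes_num, output_size=7, spatial_scale=1. / 16)
print(roi_feat.shape)                                          # [1, 256, 7, 7]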
+ +import paddle + +#@register +class RoIAlign(object): + + def __init__(self, + resolution=14, + spatial_scale=0.0625, + sampling_ratio=0, + aligned=False): + super(RoIAlign, self).__init__() + self.resolution = resolution + self.spatial_scale = spatial_scale + self.sampling_ratio = sampling_ratio + self.aligned = aligned + + def __call__(self, feats, roi, rois_num): + roi = paddle.concat(roi) if len(roi) > 1 else roi[0] + rois_num = paddle.to_tensor(rois_num, dtype='int32') + rois_num = paddle.cast(rois_num, dtype='int32') + if len(feats) == 1: + roi_feat = paddle.vision.ops.roi_align(feats, + roi, + rois_num, + self.resolution, + self.spatial_scale, + self.sampling_ratio, + self.aligned) + else: + rois_feat_list = [] + roi_feat = paddle.vision.ops.roi_align(feats, + roi, + rois_num, + self.resolution, + self.spatial_scale, + self.sampling_ratio, + self.aligned) + + return roi_feat diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/roi_head.py b/Bank_second_part/detect_process/paddlevideo/modeling/heads/roi_head.py new file mode 100644 index 0000000..be34a33 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/heads/roi_head.py @@ -0,0 +1,177 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import paddle +import paddle.nn as nn +from .. import builder +from ..registry import HEADS + + +def bbox2result(bboxes, labels, num_classes, img_shape, thr=0.01): + """Convert detection results to a list of numpy arrays. """ + if len(bboxes) == 0: + return list(np.zeros((num_classes - 1, 0, 5), dtype=np.float32)) + else: + bboxes = bboxes[0] + labels = labels + img_shape_np = img_shape + img_h, img_w = img_shape_np[0][0], img_shape_np[0][1] + + img_w = paddle.cast(img_w, dtype='int32') + img_h = paddle.cast(img_h, dtype='int32') + + bboxes[:, 0::2] /= img_w + bboxes[:, 1::2] /= img_h + + # We only handle multilabel now + assert labels.shape[-1] > 1 + + scores = labels # rename + thr = (thr, ) * num_classes if isinstance(thr, float) else thr + assert scores.shape[1] == num_classes + assert len(thr) == num_classes + + result = [] + for i in range(num_classes - 1): + #step1. 对该类, 每个bbox的得分是否大于阈值 + where = scores[:, i + 1] > thr[i + 1] + + where = paddle.nonzero(where) # index + bboxes_select = paddle.index_select(x=bboxes, index=where) + bboxes_select = bboxes_select[:, :4] + + scores_select = paddle.index_select(x=scores, index=where) + scores_select = scores_select[:, i + 1:i + 2] + + result.append( + #对于step1中得分大于阈值的bbox(可能为空), 将bbox及在该类的score放入result列表. 
+ paddle.concat((bboxes_select, scores_select), axis=1).numpy()) + + return result + + +@HEADS.register() +class AVARoIHead(nn.Layer): + + def __init__(self, + assigner, + sampler, + pos_weight=1.0, + action_thr=0.0, + bbox_roi_extractor=None, + bbox_head=None, + train_cfg=None, + test_cfg=None): + super().__init__() + self.assigner = assigner + self.sampler = sampler + self.pos_weight = pos_weight + self.action_thr = action_thr + self.init_assigner_sampler() + if bbox_head is not None: + self.init_bbox_head(bbox_roi_extractor, bbox_head) + + def init_assigner_sampler(self): + """Initialize assigner and sampler.""" + self.bbox_assigner = None + self.bbox_sampler = None + self.bbox_assigner = builder.build_assigner(self.assigner) + self.bbox_sampler = builder.build_sampler(self.sampler, context=self) + + def init_bbox_head(self, bbox_roi_extractor, bbox_head): + """Initialize ``bbox_head``""" + self.bbox_roi_extractor = builder.build_roi_extractor( + bbox_roi_extractor) + self.bbox_head = builder.build_head(bbox_head) + + def _bbox_forward(self, x, rois, rois_num): + bbox_feat = self.bbox_roi_extractor(x, rois, rois_num) + cls_score, bbox_pred = self.bbox_head( + bbox_feat, rois, rois_num + ) #deal with: when roi's width or height = 0 , roi_align is wrong + bbox_results = dict(cls_score=cls_score, + bbox_pred=bbox_pred, + bbox_feats=bbox_feat) + return bbox_results + + def _bbox_forward_train(self, x, sampling_results, gt_bboxes, gt_labels): + """Run forward function and calculate loss for box head in training.""" + rois = [res.bboxes for res in sampling_results] + rois_num = [res.bboxes.shape[0] for res in sampling_results] + bbox_results = self._bbox_forward(x, rois, rois_num) + bbox_targets = self.bbox_head.get_targets(sampling_results, gt_bboxes, + gt_labels, self.pos_weight) + loss_bbox = self.bbox_head.loss(bbox_results['cls_score'], bbox_targets) + bbox_results.update(loss_bbox=loss_bbox) + return bbox_results + + def train_step(self, x, img_metas, proposal_list, gt_bboxes, gt_labels): + #1. assign gts and sample proposals + num_imgs = len(img_metas[0]) + sampling_results = [] + for i in range(num_imgs): + assign_result = self.bbox_assigner.assign(proposal_list[i], + gt_bboxes[i], + gt_labels[i]) + sampling_result = self.bbox_sampler.sample(assign_result, + proposal_list[i], + gt_bboxes[i], + gt_labels[i]) + sampling_results.append(sampling_result) + + #2. 
forward and loss + bbox_results = self._bbox_forward_train(x, sampling_results, gt_bboxes, + gt_labels) + losses = dict() + losses.update(bbox_results['loss_bbox']) + + return losses + + def simple_test(self, x, proposal_list, img_shape, rescale=False): + x_shape = x[0].shape + #assert x_shape[0] == 1, 'only accept 1 sample at test mode' + + det_bboxes, det_labels = self.simple_test_bboxes(x, + img_shape, + proposal_list, + self.action_thr, + rescale=rescale) + + bbox_results = bbox2result(det_bboxes, det_labels, + self.bbox_head.num_classes, img_shape, + self.action_thr) + return [bbox_results] + + def simple_test_bboxes(self, + x, + img_shape, + proposals, + action_thr, + rescale=False): + """Test only det bboxes without augmentation.""" + rois = [proposals] + rois_num = [rois[0].shape[0]] + bbox_results = self._bbox_forward(x, rois, rois_num) + cls_score = bbox_results['cls_score'] + crop_quadruple = np.array([0, 0, 1, 1]) + flip = False + det_bboxes, det_labels = self.bbox_head.get_det_bboxes( + rois, + cls_score, + img_shape, + flip=flip, + crop_quadruple=crop_quadruple) + + return det_bboxes, det_labels diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/single_straight3d.py b/Bank_second_part/detect_process/paddlevideo/modeling/heads/single_straight3d.py new file mode 100644 index 0000000..805d93e --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/heads/single_straight3d.py @@ -0,0 +1,79 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import paddle +import paddle.nn as nn +import numpy as np +from ..registry import ROI_EXTRACTORS +from .roi_extractor import RoIAlign + + +@ROI_EXTRACTORS.register() +class SingleRoIExtractor3D(nn.Layer): + """Extract RoI features from a single level feature map. """ + def __init__(self, + roi_layer_type='RoIAlign', + featmap_stride=16, + output_size=16, + sampling_ratio=0, + pool_mode='avg', + aligned=True, + with_temporal_pool=True, + with_global=False): + super().__init__() + self.roi_layer_type = roi_layer_type + assert self.roi_layer_type in ['RoIPool', 'RoIAlign'] + self.featmap_stride = featmap_stride + self.spatial_scale = 1. 
/ self.featmap_stride + self.output_size = output_size + self.sampling_ratio = sampling_ratio + self.pool_mode = pool_mode + self.aligned = aligned + self.with_temporal_pool = with_temporal_pool + self.with_global = with_global + + self.roi_layer = RoIAlign(resolution=self.output_size, + spatial_scale=self.spatial_scale, + sampling_ratio=self.sampling_ratio, + aligned=self.aligned) + + def init_weights(self): + pass + + # The shape of feat is N, C, T, H, W + def forward(self, feat, rois, rois_num): + if len(feat) >= 2: + assert self.with_temporal_pool + if self.with_temporal_pool: + xi = 0 + for x in feat: + xi = xi + 1 + y = paddle.mean(x, 2, keepdim=True) + feat = [paddle.mean(x, 2, keepdim=True) for x in feat] + feat = paddle.concat(feat, axis=1) # merge slow and fast + roi_feats = [] + for t in range(feat.shape[2]): + if type(t) == paddle.static.Variable: + index = paddle.to_tensor(t) + else: + data_index = np.array([t]).astype('int32') + index = paddle.to_tensor(data_index) + + frame_feat = paddle.index_select(feat, index, axis=2) + frame_feat = paddle.squeeze(frame_feat, + axis=2) #axis=2,避免N=1时, 第一维度被删除. + roi_feat = self.roi_layer(frame_feat, rois, rois_num) + roi_feats.append(roi_feat) + + ret = paddle.stack(roi_feats, axis=2) + return ret diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/slowfast_head.py b/Bank_second_part/detect_process/paddlevideo/modeling/heads/slowfast_head.py new file mode 100644 index 0000000..bd18baf --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/heads/slowfast_head.py @@ -0,0 +1,137 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..registry import HEADS +from .base import BaseHead + +import paddle +import paddle.nn.functional as F + +from ..weight_init import weight_init_ + + +@HEADS.register() +class SlowFastHead(BaseHead): + """ + ResNe(X)t 3D head. + This layer performs a fully-connected projection during training, when the + input size is 1x1x1. It performs a convolutional projection during testing + when the input size is larger than 1x1x1. If the inputs are from multiple + different pathways, the inputs will be concatenated after pooling. + """ + def __init__(self, + width_per_group, + alpha, + beta, + num_classes, + num_frames, + crop_size, + dropout_rate, + pool_size_ratio=[[1, 1, 1], [1, 1, 1]], + loss_cfg=dict(name='CrossEntropyLoss'), + multigrid_short=False, + **kwargs): + """ + ResNetBasicHead takes p pathways as input where p in [1, infty]. + + Args: + dim_in (list): the list of channel dimensions of the p inputs to the + ResNetHead. + num_classes (int): the channel dimensions of the p outputs to the + ResNetHead. + pool_size (list): the list of kernel sizes of p spatial temporal + poolings, temporal pool kernel size, spatial pool kernel size, + spatial pool kernel size in order. + dropout_rate (float): dropout rate. If equal to 0.0, perform no + dropout. 
+ """ + super().__init__(num_classes, loss_cfg, **kwargs) + self.multigrid_short = multigrid_short + self.width_per_group = width_per_group + self.alpha = alpha + self.beta = beta + self.num_classes = num_classes + self.num_frames = num_frames + self.crop_size = crop_size + self.dropout_rate = dropout_rate + self.pool_size_ratio = pool_size_ratio + + self.dim_in = [ + self.width_per_group * 32, + self.width_per_group * 32 // self.beta, + ] + self.pool_size = [None, None] if self.multigrid_short else [ + [ + self.num_frames // self.alpha // self.pool_size_ratio[0][0], + self.crop_size // 32 // self.pool_size_ratio[0][1], + self.crop_size // 32 // self.pool_size_ratio[0][2], + ], + [ + self.num_frames // self.pool_size_ratio[1][0], + self.crop_size // 32 // self.pool_size_ratio[1][1], + self.crop_size // 32 // self.pool_size_ratio[1][2], + ], + ] + + assert (len({len(self.pool_size), len(self.dim_in) + }) == 1), "pathway dimensions are not consistent." + self.num_pathways = len(self.pool_size) + + self.dropout = paddle.nn.Dropout(p=self.dropout_rate) + + self.projection = paddle.nn.Linear( + in_features=sum(self.dim_in), + out_features=self.num_classes, + ) + + def init_weights(self): + weight_init_(self.projection, + "Normal", + bias_value=0.0, + mean=0.0, + std=0.01) + + def forward(self, inputs): + assert (len(inputs) == self.num_pathways + ), "Input tensor does not contain {} pathway".format( + self.num_pathways) + pool_out = [] + for pathway in range(self.num_pathways): + if self.pool_size[pathway] is None: + tmp_out = F.adaptive_avg_pool3d(x=inputs[pathway], + output_size=(1, 1, 1), + data_format="NCDHW") + else: + tmp_out = F.avg_pool3d(x=inputs[pathway], + kernel_size=self.pool_size[pathway], + stride=1, + data_format="NCDHW") + pool_out.append(tmp_out) + + x = paddle.concat(x=pool_out, axis=1) + x = paddle.transpose(x=x, perm=(0, 2, 3, 4, 1)) + + # Perform dropout. + if self.dropout_rate > 0.0: + x = self.dropout(x) + + x = self.projection(x) + + # Performs fully convlutional inference. + if not self.training: # attr of base class + x = F.softmax(x, axis=4) + x = paddle.mean(x, axis=[1, 2, 3]) + + x = paddle.reshape(x, shape=(x.shape[0], -1)) + return x diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/stgcn_head.py b/Bank_second_part/detect_process/paddlevideo/modeling/heads/stgcn_head.py new file mode 100644 index 0000000..fc80d66 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/heads/stgcn_head.py @@ -0,0 +1,50 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn + +from .base import BaseHead +from ..registry import HEADS +from ..weight_init import weight_init_ + + +@HEADS.register() +class STGCNHead(BaseHead): + """ + Head for ST-GCN model. + Args: + in_channels: int, input feature channels. Default: 256. + num_classes: int, number classes. Default: 10. 
+ """ + def __init__(self, in_channels=256, num_classes=10, **kwargs): + super().__init__(num_classes, in_channels, **kwargs) + self.fcn = nn.Conv2D(in_channels=in_channels, + out_channels=num_classes, + kernel_size=1) + + def init_weights(self): + """Initiate the parameters. + """ + for layer in self.sublayers(): + if isinstance(layer, nn.Conv2D): + weight_init_(layer, 'Normal', std=0.02) + + def forward(self, x): + """Define how the head is going to run. + """ + x = self.fcn(x) + x = paddle.reshape_(x, (x.shape[0], -1)) # N,C,1,1 --> N,C + + return x diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/timesformer_head.py b/Bank_second_part/detect_process/paddlevideo/modeling/heads/timesformer_head.py new file mode 100644 index 0000000..d02a3cc --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/heads/timesformer_head.py @@ -0,0 +1,70 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.nn import Linear + +from ..registry import HEADS +from ..weight_init import trunc_normal_, weight_init_ +from .base import BaseHead + + +@HEADS.register() +class TimeSformerHead(BaseHead): + """TimeSformerHead Head. + + Args: + num_classes (int): The number of classes to be classified. + in_channels (int): The number of channles in input feature. + loss_cfg (dict): Config for building config. Default: dict(name='CrossEntropyLoss'). + std(float): Std(Scale) value in normal initilizar. Default: 0.01. + kwargs (dict, optional): Any keyword argument to initialize. + + """ + def __init__(self, + num_classes, + in_channels, + loss_cfg=dict(name='CrossEntropyLoss'), + std=0.02, + **kwargs): + + super().__init__(num_classes, in_channels, loss_cfg, **kwargs) + self.std = std + self.fc = Linear(self.in_channels, self.num_classes) + + def init_weights(self): + """Initiate the FC layer parameters""" + + weight_init_(self.fc, + 'TruncatedNormal', + 'fc_0.w_0', + 'fc_0.b_0', + mean=0.0, + std=self.std) + # NOTE: Temporarily use trunc_normal_ instead of TruncatedNormal + trunc_normal_(self.fc.weight, std=self.std) + + def forward(self, x): + """Define how the head is going to run. + Args: + x (paddle.Tensor): The input data. + Returns: + score: (paddle.Tensor) The classification scores for input samples. + """ + # XXX: check dropout location! + # x.shape = [N, embed_dim] + + score = self.fc(x) + # [N, num_class] + # x = F.softmax(x) # NOTE remove + return score diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/token_shift_head.py b/Bank_second_part/detect_process/paddlevideo/modeling/heads/token_shift_head.py new file mode 100644 index 0000000..52e9309 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/heads/token_shift_head.py @@ -0,0 +1,79 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.nn import Linear +import paddle + +from ..registry import HEADS +from ..weight_init import trunc_normal_, weight_init_ +from .base import BaseHead + + +@HEADS.register() +class TokenShiftHead(BaseHead): + """TokenShift Transformer Head. + + Args: + num_classes (int): The number of classes to be classified. + in_channels (int): The number of channles in input feature. + num_seg(int): The number of segments. Default: 8. + loss_cfg (dict): Config for building config. Default: dict(name='CrossEntropyLoss'). + ls_eps (float): Label smoothing epsilon. Default: 0.01. + std (float): Std(Scale) Value in normal initilizar. Default: 0.02. + kwargs (dict, optional): Any keyword argument to initialize. + + """ + def __init__(self, + num_classes, + in_channels, + num_seg=8, + loss_cfg=dict(name='CrossEntropyLoss'), + ls_eps=0.01, + std=0.02, + **kwargs): + + super().__init__(num_classes, in_channels, loss_cfg, ls_eps) + self.num_seg = num_seg + self.std = std + self.fc = Linear(self.in_channels, self.num_classes) + + def init_weights(self): + """Initiate the FC layer parameters""" + + weight_init_(self.fc, + 'TruncatedNormal', + 'fc_0.w_0', + 'fc_0.b_0', + mean=0.0, + std=self.std) + # NOTE: Temporarily use trunc_normal_ instead of TruncatedNormal + trunc_normal_(self.fc.weight, std=self.std) + + def forward(self, x): + """Define how the head is going to run. + Args: + x (paddle.Tensor): The input data. + Returns: + score: (paddle.Tensor) The classification scores for input samples. + """ + # XXX: check dropout location! + # x.shape = [N, embed_dim] + score = self.fc(x) + # [N*T, num_class] + _, _m = score.shape + _t = self.num_seg + score = score.reshape([-1, _t, _m]) + score = paddle.mean(score, 1) # averaging predictions for every frame + score = paddle.squeeze(score, axis=1) + return score diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/transnetv2_head.py b/Bank_second_part/detect_process/paddlevideo/modeling/heads/transnetv2_head.py new file mode 100644 index 0000000..2ea67d4 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/heads/transnetv2_head.py @@ -0,0 +1,45 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .base import BaseHead +from ..registry import HEADS +from ..losses import TransNetV2Loss +from ...metrics.transnetv2_metric import create_scene_based_summaries + +@HEADS.register() +class TransNetV2Head(BaseHead): + """TransNetV2 Head. 
+    """
+    def __init__(self,
+                 num_classes,
+                 in_channels,
+                 loss_cfg=dict(name="TransNetV2Loss")
+                 ):
+        super().__init__(num_classes,
+                         in_channels,
+                         loss_cfg)
+
+    def loss(self, one_hot_pred, one_hot_gt,
+             many_hot_pred=None, many_hot_gt=None, reg_losses=None):
+        losses = dict()
+        # forward the head's own predictions and targets to the configured loss
+        loss = self.loss_func(one_hot_pred, one_hot_gt)
+
+        f1 = self.get_score(one_hot_pred, one_hot_gt)
+        losses['f1'] = f1
+        losses['loss'] = loss
+        return losses
+
+    def get_score(self, one_hot_pred, one_hot_gt):
+        f1 = create_scene_based_summaries(one_hot_pred, one_hot_gt)
+        return f1
diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/tsm_head.py b/Bank_second_part/detect_process/paddlevideo/modeling/heads/tsm_head.py
new file mode 100644
index 0000000..9559301
--- /dev/null
+++ b/Bank_second_part/detect_process/paddlevideo/modeling/heads/tsm_head.py
@@ -0,0 +1,99 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+import paddle
+from paddle import ParamAttr
+from paddle.nn import Linear
+import paddle.nn.functional as F
+from paddle.regularizer import L2Decay
+from .tsn_head import TSNHead
+from ..registry import HEADS
+
+from ..weight_init import weight_init_
+
+
+@HEADS.register()
+class TSMHead(TSNHead):
+    """ TSM Head
+
+    Args:
+        num_classes (int): The number of classes to be classified.
+        in_channels (int): The number of channels in input feature.
+        loss_cfg (dict): Config for building loss. Default: dict(name='CrossEntropyLoss').
+        drop_ratio(float): drop ratio. Default: 0.5.
+        std(float): Std(Scale) value in normal initializer. Default: 0.001.
+        kwargs (dict, optional): Any keyword argument to initialize.
+    """
+    def __init__(self,
+                 num_classes,
+                 in_channels,
+                 drop_ratio=0.5,
+                 std=0.001,
+                 data_format="NCHW",
+                 **kwargs):
+        super().__init__(num_classes,
+                         in_channels,
+                         drop_ratio=drop_ratio,
+                         std=std,
+                         data_format=data_format,
+                         **kwargs)
+
+        self.fc = Linear(self.in_channels,
+                         self.num_classes,
+                         weight_attr=ParamAttr(learning_rate=5.0,
+                                               regularizer=L2Decay(1e-4)),
+                         bias_attr=ParamAttr(learning_rate=10.0,
+                                             regularizer=L2Decay(0.0)))
+
+        assert (data_format in [
+            'NCHW', 'NHWC'
+        ]), f"data_format must be 'NCHW' or 'NHWC', but got {data_format}"
+
+        self.data_format = data_format
+
+        self.stdv = std
+
+    def init_weights(self):
+        """Initiate the FC layer parameters"""
+        weight_init_(self.fc, 'Normal', 'fc_0.w_0', 'fc_0.b_0', std=self.stdv)
+
+    def forward(self, x, num_seg):
+        """Define how the tsm-head is going to run.
+
+        Args:
+            x (paddle.Tensor): The input data.
+            num_seg (int): Number of segments.
+        Returns:
+            score: (paddle.Tensor) The classification scores for input samples.
+ """ + # x.shape = [N * num_segs, in_channels, 7, 7] + + x = self.avgpool2d(x) # [N * num_segs, in_channels, 1, 1] + + if self.dropout is not None: + x = self.dropout(x) # [N * num_seg, in_channels, 1, 1] + + if self.data_format == 'NCHW': + x = paddle.reshape(x, x.shape[:2]) + else: + x = paddle.reshape(x, x.shape[::3]) + score = self.fc(x) # [N * num_seg, num_class] + score = paddle.reshape( + score, [-1, num_seg, score.shape[1]]) # [N, num_seg, num_class] + score = paddle.mean(score, axis=1) # [N, num_class] + score = paddle.reshape(score, + shape=[-1, self.num_classes]) # [N, num_class] + # score = F.softmax(score) #NOTE remove + return score diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/heads/tsn_head.py b/Bank_second_part/detect_process/paddlevideo/modeling/heads/tsn_head.py new file mode 100644 index 0000000..f2f906b --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/heads/tsn_head.py @@ -0,0 +1,93 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddle.nn import AdaptiveAvgPool2D, Linear, Dropout + +from .base import BaseHead +from ..registry import HEADS +from ..weight_init import weight_init_ + + +@HEADS.register() +class TSNHead(BaseHead): + """TSN Head. + + Args: + num_classes (int): The number of classes to be classified. + in_channels (int): The number of channles in input feature. + loss_cfg (dict): Config for building config. Default: dict(name='CrossEntropyLoss'). + drop_ratio(float): drop ratio. Default: 0.4. + std(float): Std(Scale) value in normal initilizar. Default: 0.01. + kwargs (dict, optional): Any keyword argument to initialize. + + """ + def __init__(self, + num_classes, + in_channels, + loss_cfg=dict(name='CrossEntropyLoss'), + drop_ratio=0.4, + std=0.01, + data_format="NCHW", + **kwargs): + + super().__init__(num_classes, in_channels, loss_cfg, **kwargs) + self.drop_ratio = drop_ratio + self.std = std + + #NOTE: global pool performance + self.avgpool2d = AdaptiveAvgPool2D((1, 1), data_format=data_format) + + if self.drop_ratio != 0: + self.dropout = Dropout(p=self.drop_ratio) + else: + self.dropout = None + + self.fc = Linear(self.in_channels, self.num_classes) + + def init_weights(self): + """Initiate the FC layer parameters""" + + weight_init_(self.fc, + 'Normal', + 'fc_0.w_0', + 'fc_0.b_0', + mean=0., + std=self.std) + + def forward(self, x, num_seg): + """Define how the head is going to run. + Args: + x (paddle.Tensor): The input data. + num_segs (int): Number of segments. + Returns: + score: (paddle.Tensor) The classification scores for input samples. + """ + + #XXX: check dropout location! 
+ # [N * num_segs, in_channels, 7, 7] + + x = self.avgpool2d(x) + # [N * num_segs, in_channels, 1, 1] + x = paddle.reshape(x, [-1, num_seg, x.shape[1]]) + # [N, num_seg, in_channels] + x = paddle.mean(x, axis=1) + # [N, in_channels] + if self.dropout is not None: + x = self.dropout(x) + # [N, in_channels] + score = self.fc(x) + # [N, num_class] + #x = F.softmax(x) #NOTE remove + return score diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/losses/__init__.py b/Bank_second_part/detect_process/paddlevideo/modeling/losses/__init__.py new file mode 100644 index 0000000..d784c4c --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/losses/__init__.py @@ -0,0 +1,29 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .base import BaseWeightedLoss +from .bmn_loss import BMNLoss +from .cross_entropy_loss import CrossEntropyLoss +from .depth_loss import ADDSLoss +from .transnetv2_loss import TransNetV2Loss +from .actbert_loss import ActBertLoss +from .asrf_loss import ASRFLoss +from .distillation_loss import DistillationCELoss, DistillationDMLLoss +from .yowo_loss import RegionLoss + +__all__ = [ + 'CrossEntropyLoss', 'BMNLoss', 'TransNetV2Loss', 'ActBertLoss', 'ADDSLoss', + 'BaseWeightedLoss', 'ASRFLoss', 'DistillationCELoss', 'DistillationDMLLoss', + 'RegionLoss' +] diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/losses/__pycache__/__init__.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/losses/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..72125a0 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/losses/__pycache__/__init__.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/losses/__pycache__/actbert_loss.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/losses/__pycache__/actbert_loss.cpython-310.pyc new file mode 100644 index 0000000..f4ac760 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/losses/__pycache__/actbert_loss.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/losses/__pycache__/asrf_loss.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/losses/__pycache__/asrf_loss.cpython-310.pyc new file mode 100644 index 0000000..ad94ef8 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/losses/__pycache__/asrf_loss.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/losses/__pycache__/base.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/losses/__pycache__/base.cpython-310.pyc new file mode 100644 index 0000000..20b1111 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/losses/__pycache__/base.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/losses/__pycache__/bmn_loss.cpython-310.pyc 
b/Bank_second_part/detect_process/paddlevideo/modeling/losses/__pycache__/bmn_loss.cpython-310.pyc new file mode 100644 index 0000000..d22aa00 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/losses/__pycache__/bmn_loss.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/losses/__pycache__/cross_entropy_loss.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/losses/__pycache__/cross_entropy_loss.cpython-310.pyc new file mode 100644 index 0000000..f3d06b5 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/losses/__pycache__/cross_entropy_loss.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/losses/__pycache__/depth_loss.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/losses/__pycache__/depth_loss.cpython-310.pyc new file mode 100644 index 0000000..1f884a1 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/losses/__pycache__/depth_loss.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/losses/__pycache__/distillation_loss.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/losses/__pycache__/distillation_loss.cpython-310.pyc new file mode 100644 index 0000000..b766587 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/losses/__pycache__/distillation_loss.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/losses/__pycache__/transnetv2_loss.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/losses/__pycache__/transnetv2_loss.cpython-310.pyc new file mode 100644 index 0000000..6463772 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/losses/__pycache__/transnetv2_loss.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/losses/__pycache__/yowo_loss.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/losses/__pycache__/yowo_loss.cpython-310.pyc new file mode 100644 index 0000000..d69a0a5 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/losses/__pycache__/yowo_loss.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/losses/actbert_loss.py b/Bank_second_part/detect_process/paddlevideo/modeling/losses/actbert_loss.py new file mode 100644 index 0000000..10ffea6 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/losses/actbert_loss.py @@ -0,0 +1,75 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
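# A small standalone example (not part of this diff) of the masking convention the losses
# below rely on: paddle.nn.CrossEntropyLoss(ignore_index=-1) skips positions whose label is
# -1, which is how ActBertLoss handles masked-LM style text and action targets. The tensor
# sizes are illustrative only.
import paddle
import paddle.nn as nn

loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
logits = paddle.randn([4, 10])                      # 4 token positions, 10 classes
labels = paddle.to_tensor([3, -1, 5, -1])           # -1 marks positions to ignore
print(loss_fct(logits, labels))                     # averaged over the 2 labelled positions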
+ +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from ..registry import LOSSES +from .base import BaseWeightedLoss + + +@LOSSES.register() +class ActBertLoss(BaseWeightedLoss): + """Loss for ActBert model + """ + def __init__(self, vocab_size=30522, a_target_size=700): + super().__init__() + self.vocab_size = vocab_size + self.a_target_size = a_target_size + self.loss_fct = nn.CrossEntropyLoss(ignore_index=-1) + self.vis_criterion = nn.KLDivLoss(reduction="none") + + def forward(self, prediction_scores_t, prediction_scores_v, prediction_scores_a, seq_relationship_score, \ + text_labels, image_label, image_target, action_label, next_sentence_label): + """ + Args: + text_label: text label(with mask). Shape: [batch_size, seqence_length] + image_label: image label(with mask). Shape: [batch_size, region_length] + image_target: label of image feature distribution, + Shape: [batch_size, region_length-1, num_image_class](minus 1 for xxx). + action label: action label(with mask), Shape: [batch_size, action_length] + next_sentence_label: is next sentence or not. Shape: [batch_size] + """ + prediction_scores_v = prediction_scores_v[:, + 1:] #8,37,1601 --> 8,36,1601 + + img_loss = self.vis_criterion( + F.log_softmax(prediction_scores_v, axis=2), + image_target #8,36,1601 + ) + masked_img_loss = paddle.sum( + img_loss * (image_label == 1).unsqueeze(2).astype('float32')) / max( + paddle.sum((image_label == 1).astype('float32')), 1e-6) + + masked_text_loss = self.loss_fct( + prediction_scores_t.reshape([-1, self.vocab_size]), #8,36,30522 + text_labels.reshape([-1]), #8,36 # label -1 will be ignored + ) + + masked_action_loss = self.loss_fct( + prediction_scores_a.reshape([-1, self.a_target_size]), #8,5,700 + action_label.reshape([-1]), #8,5 + ) + + next_sentence_loss = self.loss_fct( + seq_relationship_score.reshape([-1, 2]), + next_sentence_label.reshape([-1]) #8,2 + ) + + total_loss = masked_text_loss.unsqueeze(0) + masked_img_loss.unsqueeze( + 0) + masked_action_loss.unsqueeze(0) + next_sentence_loss.unsqueeze( + 0) + return total_loss diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/losses/asrf_loss.py b/Bank_second_part/detect_process/paddlevideo/modeling/losses/asrf_loss.py new file mode 100644 index 0000000..ce5d6b1 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/losses/asrf_loss.py @@ -0,0 +1,401 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# https://github.com/yiskw713/asrf/libs/loss_fn/__init__.py + +import numpy as np +import pandas as pd +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import sys +import os + +from ..registry import LOSSES + + +class TMSE(nn.Layer): + """ + Temporal MSE Loss Function + Proposed in Y. A. Farha et al. 
MS-TCN: Multi-Stage Temporal Convolutional Network for ActionSegmentation in CVPR2019 + arXiv: https://arxiv.org/pdf/1903.01945.pdf + """ + + def __init__(self, threshold=4, ignore_index=255): + super().__init__() + self.threshold = threshold + self.ignore_index = ignore_index + self.mse = nn.MSELoss(reduction="none") + + def forward(self, preds, gts): + + total_loss = 0.0 + batch_size = preds.shape[0] + for pred, gt in zip(preds, gts): + pred = paddle.gather(pred, + paddle.nonzero(gt != self.ignore_index)[:, 0]) + + loss = self.mse(F.log_softmax(pred[:, 1:], axis=1), + F.log_softmax(pred[:, :-1], axis=1)) + + loss = paddle.clip(loss, min=0, max=self.threshold**2) + total_loss += paddle.mean(loss) + + return total_loss / batch_size + + +class GaussianSimilarityTMSE(nn.Layer): + """ + Temporal MSE Loss Function with Gaussian Similarity Weighting + """ + + def __init__(self, threshold=4, sigma=1.0, ignore_index=255): + super().__init__() + self.threshold = threshold + self.ignore_index = ignore_index + self.mse = nn.MSELoss(reduction="none") + self.sigma = sigma + + def forward(self, preds, gts, sim_index): + """ + Args: + preds: the output of model before softmax. (N, C, T) + gts: Ground Truth. (N, T) + sim_index: similarity index. (N, C, T) + Return: + the value of Temporal MSE weighted by Gaussian Similarity. + """ + total_loss = 0.0 + batch_size = preds.shape[0] + for pred, gt, sim in zip(preds, gts, sim_index): + pred = paddle.gather(pred, + paddle.nonzero(gt != self.ignore_index)[:, 0], + axis=1) + sim = paddle.gather(sim, + paddle.nonzero(gt != self.ignore_index)[:, 0], + axis=1) + + # calculate gaussian similarity + diff = sim[:, 1:] - sim[:, :-1] + similarity = paddle.exp( + (-1 * paddle.norm(diff, axis=0)) / (2 * self.sigma**2)) + + # calculate temporal mse + loss = self.mse(F.log_softmax(pred[:, 1:], axis=1), + F.log_softmax(pred[:, :-1], axis=1)) + loss = paddle.clip(loss, min=0, max=self.threshold**2) + + # gaussian similarity weighting + loss = similarity * loss + + total_loss += paddle.mean(loss) + + return total_loss / batch_size + + +class FocalLoss(nn.Layer): + + def __init__(self, + weight=None, + size_average=True, + batch_average=True, + ignore_index=255, + gamma=2.0, + alpha=0.25): + super().__init__() + + self.gamma = gamma + self.alpha = alpha + self.batch_average = batch_average + self.criterion = nn.CrossEntropyLoss(weight=weight, + ignore_index=ignore_index, + size_average=size_average) + + def forward(self, logit, target): + n, _, _ = logit.size() + + logpt = -self.criterion(logit, target.long()) + pt = paddle.exp(logpt) + + if self.alpha is not None: + logpt *= self.alpha + + loss = -((1 - pt)**self.gamma) * logpt + + if self.batch_average: + loss /= n + + return loss + + +class ActionSegmentationLoss(nn.Layer): + """ + Loss Function for Action Segmentation + You can choose the below loss functions and combine them. 
+ - Cross Entropy Loss (CE) + - Focal Loss + - Temporal MSE (TMSE) + - Gaussian Similarity TMSE (GSTMSE) + """ + + def __init__(self, + num_classes, + file_path, + label_path, + ce=True, + focal=True, + tmse=False, + gstmse=False, + weight=None, + threshold=4., + ignore_index=255, + ce_weight=1.0, + focal_weight=1.0, + tmse_weight=0.15, + gstmse_weight=0.15): + super().__init__() + self.criterions = [] + self.weights = [] + + self.num_classes = num_classes + self.file_path = file_path + self.label_path = label_path + if weight: + class_weight = self.get_class_weight() + else: + class_weight = None + + if ce: + self.criterions.append( + nn.CrossEntropyLoss(weight=class_weight, + ignore_index=ignore_index)) + self.weights.append(ce_weight) + + if focal: + self.criterions.append(FocalLoss(ignore_index=ignore_index)) + self.weights.append(focal_weight) + + if tmse: + self.criterions.append( + TMSE(threshold=threshold, ignore_index=ignore_index)) + self.weights.append(tmse_weight) + + if gstmse: + self.criterions.append( + GaussianSimilarityTMSE(threshold=threshold, + ignore_index=ignore_index)) + self.weights.append(gstmse_weight) + + if len(self.criterions) == 0: + print("You have to choose at least one loss function.") + sys.exit(1) + + def get_class_weight(self): + """ + Class weight for CrossEntropy + Class weight is calculated in the way described in: + D. Eigen and R. Fergus, “Predicting depth, surface normals and semantic labels with a common multi-scale convolutional architecture,” in ICCV, + openaccess: https://www.cv-foundation.org/openaccess/content_iccv_2015/papers/Eigen_Predicting_Depth_Surface_ICCV_2015_paper.pdf + """ + # load file list + file_ptr = open(self.file_path, 'r') + info = file_ptr.read().split('\n')[:-1] + file_ptr.close() + + nums = [0 for i in range(self.num_classes)] + for i in range(len(info)): + video_name = info[i] + file_name = video_name.split('.')[0] + ".npy" + label_file_path = os.path.join(self.label_path, file_name) + label = np.load(label_file_path).astype(np.int64) + num, cnt = np.unique(label, return_counts=True) + for n, c in zip(num, cnt): + nums[n] += c + + class_num = paddle.to_tensor(nums, dtype="float32") + total = class_num.sum().item() + frequency = class_num / total + median = paddle.median(frequency) + class_weight = median / frequency + return class_weight + + def forward(self, preds, gts, sim_index): + """ + Args: + preds: paddle.float (N, C, T). + gts: paddle.int64 (N, T). + sim_index: paddle.float (N, C', T). 
+ """ + loss = 0.0 + for criterion, weight in zip(self.criterions, self.weights): + if isinstance(criterion, GaussianSimilarityTMSE): + loss += weight * criterion(preds, gts, sim_index) + elif isinstance(criterion, nn.CrossEntropyLoss): + preds_t = paddle.transpose(preds, perm=[0, 2, 1]) + loss += weight * criterion(preds_t, gts) + else: + loss += weight * criterion(preds, gts) + + return loss + + +class BoundaryRegressionLoss(nn.Layer): + """ + Boundary Regression Loss + bce: Binary Cross Entropy Loss for Boundary Prediction + mse: Mean Squared Error + """ + + def __init__(self, + file_path, + label_path, + bce=True, + focal=False, + mse=False, + weight=None, + pos_weight=None): + super().__init__() + + self.criterions = [] + self.file_path = file_path + self.label_path = label_path + + pos_weight = self.get_pos_weight() + + if bce: + self.criterions.append( + nn.BCEWithLogitsLoss(weight=weight, pos_weight=pos_weight)) + + if focal: + self.criterions.append(FocalLoss()) + + if mse: + self.criterions.append(nn.MSELoss()) + + if len(self.criterions) == 0: + print("You have to choose at least one loss function.") + sys.exit(1) + + def get_pos_weight(self, norm=None): + """ + pos_weight for binary cross entropy with logits loss + pos_weight is defined as reciprocal of ratio of positive samples in the dataset + """ + # load file list + file_ptr = open(self.file_path, 'r') + info = file_ptr.read().split('\n')[:-1] + file_ptr.close() + + n_classes = 2 # boundary or not + nums = [0 for i in range(n_classes)] + for i in range(len(info)): + video_name = info[i] + file_name = video_name.split('.')[0] + ".npy" + label_file_path = os.path.join(self.label_path, file_name) + label = np.load(label_file_path).astype(np.int64) + num, cnt = np.unique(label, return_counts=True) + for n, c in zip(num, cnt): + nums[n] += c + + pos_ratio = nums[1] / sum(nums) + pos_weight = 1 / pos_ratio + + if norm is not None: + pos_weight /= norm + + return paddle.to_tensor(pos_weight, dtype="float32") + + def forward(self, preds, gts): + """ + Args: + preds: paddle.float (N, 1, T). + gts: paddle.float (N, 1, T). 
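+            Returns:
+                the summed criterion values, averaged over the batch (a scalar tensor).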
+ """ + loss = 0.0 + batch_size = float(preds.shape[0]) + + for criterion in self.criterions: + for pred, gt in zip(preds, gts): + loss += criterion(pred, gt) + + return loss / batch_size + + +@LOSSES.register() +class ASRFLoss(nn.Layer): + + def __init__(self, + lambda_bound_loss, + num_classes, + file_path, + label_path, + boundary_path, + ce=True, + asl_focal=True, + tmse=False, + gstmse=False, + asl_weight=None, + threshold=4., + ignore_index=255, + ce_weight=1.0, + focal_weight=1.0, + tmse_weight=0.15, + gstmse_weight=0.15, + bce=True, + brl_focal=False, + mse=False, + brl_weight=None): + super().__init__() + self.criterion_cls = ActionSegmentationLoss(ce=ce, + focal=asl_focal, + tmse=tmse, + gstmse=gstmse, + weight=asl_weight, + threshold=threshold, + ignore_index=ignore_index, + ce_weight=ce_weight, + focal_weight=focal_weight, + tmse_weight=tmse_weight, + gstmse_weight=gstmse_weight, + file_path=file_path, + label_path=label_path, + num_classes=num_classes) + self.criterion_boundary = BoundaryRegressionLoss( + bce=bce, + focal=brl_focal, + mse=mse, + weight=brl_weight, + file_path=file_path, + label_path=boundary_path) + self.lambda_bound_loss = lambda_bound_loss + + def forward(self, x, output_cls, label, outputs_boundary, boundary): + loss = 0.0 + if isinstance(output_cls, list): + n = len(output_cls) + for out in output_cls: + loss += self.criterion_cls(out, label, x) / n + else: + loss += self.criterion_cls(output_cls, label, x) + + if isinstance(outputs_boundary, list): + n = len(outputs_boundary) + for out in outputs_boundary: + loss += self.lambda_bound_loss * self.criterion_boundary( + out, boundary) / n + else: + loss += self.lambda_bound_loss * self.criterion_boundary( + outputs_boundary, boundary) + + return loss diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/losses/base.py b/Bank_second_part/detect_process/paddlevideo/modeling/losses/base.py new file mode 100644 index 0000000..7284252 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/losses/base.py @@ -0,0 +1,49 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import abstractmethod +import paddle +import paddle.nn as nn + +#XXX use _forward?? or forward?? +class BaseWeightedLoss(nn.Layer): + """Base class for loss. + + All subclass should overwrite the ``_forward()`` method which returns the + normal loss without loss weights. + + Args: + loss_weight (float): Factor scalar multiplied on the loss. + Default: 1.0. + """ + + def __init__(self, loss_weight=1.0): + super().__init__() + self.loss_weight = loss_weight + + @abstractmethod + def _forward(self, *args, **kwargs): + pass + + def forward(self, *args, **kwargs): + """Defines the computation performed at every call. + Args: + *args: The positional arguments for the corresponding + loss. + **kwargs: The keyword arguments for the corresponding + loss. + Returns: + paddle.Tensor: The calculated loss. 
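+                This is the value returned by ``_forward`` scaled by ``self.loss_weight``.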
+ """ + return self._forward(*args, **kwargs) * self.loss_weight diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/losses/bmn_loss.py b/Bank_second_part/detect_process/paddlevideo/modeling/losses/bmn_loss.py new file mode 100644 index 0000000..e434850 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/losses/bmn_loss.py @@ -0,0 +1,155 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import paddle +import paddle.nn.functional as F + +from ..registry import LOSSES +from .base import BaseWeightedLoss + + +@LOSSES.register() +class BMNLoss(BaseWeightedLoss): + """Loss for BMN model + Args: + tscale (int): sequence length, default 100. + dscale (int): max duration length, default 100. + """ + def __init__(self, dscale, tscale): + super().__init__() + self.dscale = dscale + self.tscale = tscale + + def _get_mask(self, dscale, tscale): + bm_mask = [] + for idx in range(dscale): + mask_vector = [1 for i in range(tscale - idx) + ] + [0 for i in range(idx)] + bm_mask.append(mask_vector) + bm_mask = np.array(bm_mask, dtype='float32') + bm_mask = paddle.to_tensor(bm_mask) + bm_mask.stop_gradient = True + return bm_mask + + def tem_loss_func(self, pred_start, pred_end, gt_start, gt_end): + def bi_loss(pred_score, gt_label, datatype): + pred_score = paddle.reshape(x=pred_score, shape=[-1]) + gt_label = paddle.reshape(x=gt_label, shape=[-1]) + gt_label.stop_gradient = True + pmask = paddle.cast(x=(gt_label > 0.5), dtype=datatype) + num_entries = paddle.cast(paddle.shape(pmask), dtype=datatype) + num_positive = paddle.cast(paddle.sum(pmask), dtype=datatype) + ratio = num_entries / num_positive + coef_0 = 0.5 * ratio / (ratio - 1) + coef_1 = 0.5 * ratio + epsilon = 0.000001 + loss_pos = paddle.multiply(paddle.log(pred_score + epsilon), pmask) + loss_pos = coef_1 * paddle.mean(loss_pos) + loss_neg = paddle.multiply(paddle.log(1.0 - pred_score + epsilon), + (1.0 - pmask)) + loss_neg = coef_0 * paddle.mean(loss_neg) + loss = -1 * (loss_pos + loss_neg) + return loss + + loss_start = bi_loss(pred_start, gt_start, pred_start.dtype) + loss_end = bi_loss(pred_end, gt_end, pred_start.dtype) + loss = loss_start + loss_end + return loss + + def pem_reg_loss_func(self, pred_score, gt_iou_map, mask): + gt_iou_map = paddle.multiply(gt_iou_map, mask) + + u_hmask = paddle.cast(x=gt_iou_map > 0.7, dtype=pred_score.dtype) + u_mmask = paddle.logical_and(gt_iou_map <= 0.7, gt_iou_map > 0.3) + u_mmask = paddle.cast(x=u_mmask, dtype=pred_score.dtype) + u_lmask = paddle.logical_and(gt_iou_map <= 0.3, gt_iou_map >= 0.) 
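+        # cast the low-IoU mask to float and keep only valid positions; the medium and
+        # low IoU buckets are then randomly subsampled below so that their counts
+        # roughly match the number of high-IoU (> 0.7) proposals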
+ u_lmask = paddle.cast(x=u_lmask, dtype=pred_score.dtype) + u_lmask = paddle.multiply(u_lmask, mask) + + num_h = paddle.cast(paddle.sum(u_hmask), dtype=pred_score.dtype) + num_m = paddle.cast(paddle.sum(u_mmask), dtype=pred_score.dtype) + num_l = paddle.cast(paddle.sum(u_lmask), dtype=pred_score.dtype) + + r_m = num_h / num_m + u_smmask = paddle.uniform(shape=[ + gt_iou_map.shape[1], gt_iou_map.shape[2] + ], + min=0.0, + max=1.0).astype(pred_score.dtype) + u_smmask = paddle.multiply(u_mmask, u_smmask) + u_smmask = paddle.cast(x=(u_smmask > (1. - r_m)), + dtype=pred_score.dtype) + + r_l = num_h / num_l + u_slmask = paddle.uniform(shape=[ + gt_iou_map.shape[1], gt_iou_map.shape[2] + ], + min=0.0, + max=1.0).astype(pred_score.dtype) + u_slmask = paddle.multiply(u_lmask, u_slmask) + u_slmask = paddle.cast(x=(u_slmask > (1. - r_l)), + dtype=pred_score.dtype) + + weights = u_hmask + u_smmask + u_slmask + weights.stop_gradient = True + loss = F.square_error_cost(pred_score, gt_iou_map) + loss = paddle.multiply(loss, weights) + loss = 0.5 * paddle.sum(loss) / paddle.sum(weights) + + return loss + + def pem_cls_loss_func(self, pred_score, gt_iou_map, mask): + gt_iou_map = paddle.multiply(gt_iou_map, mask) + gt_iou_map.stop_gradient = True + pmask = paddle.cast(x=(gt_iou_map > 0.9), dtype=pred_score.dtype) + nmask = paddle.cast(x=(gt_iou_map <= 0.9), dtype=pred_score.dtype) + nmask = paddle.multiply(nmask, mask) + + num_positive = paddle.sum(pmask) + num_entries = num_positive + paddle.sum(nmask) + ratio = num_entries / num_positive + coef_0 = 0.5 * ratio / (ratio - 1) + coef_1 = 0.5 * ratio + epsilon = 0.000001 + loss_pos = paddle.multiply(paddle.log(pred_score + epsilon), pmask) + loss_pos = coef_1 * paddle.sum(loss_pos) + loss_neg = paddle.multiply(paddle.log(1.0 - pred_score + epsilon), + nmask) + loss_neg = coef_0 * paddle.sum(loss_neg) + loss = -1 * (loss_pos + loss_neg) / num_entries + return loss + + def forward(self, pred_bm, pred_start, pred_end, gt_iou_map, gt_start, + gt_end): + pred_bm_reg = paddle.squeeze(paddle.slice(pred_bm, + axes=[1], + starts=[0], + ends=[1]), + axis=[1]) + pred_bm_cls = paddle.squeeze(paddle.slice(pred_bm, + axes=[1], + starts=[1], + ends=[2]), + axis=[1]) + + bm_mask = self._get_mask(self.dscale, self.tscale) + + pem_reg_loss = self.pem_reg_loss_func(pred_bm_reg, gt_iou_map, bm_mask) + pem_cls_loss = self.pem_cls_loss_func(pred_bm_cls, gt_iou_map, bm_mask) + + tem_loss = self.tem_loss_func(pred_start, pred_end, gt_start, gt_end) + + loss = tem_loss + 10 * pem_reg_loss + pem_cls_loss + return loss diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/losses/cross_entropy_loss.py b/Bank_second_part/detect_process/paddlevideo/modeling/losses/cross_entropy_loss.py new file mode 100644 index 0000000..953f77c --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/losses/cross_entropy_loss.py @@ -0,0 +1,36 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import paddle +import paddle.nn.functional as F + +from ..registry import LOSSES +from .base import BaseWeightedLoss + + +@LOSSES.register() +class CrossEntropyLoss(BaseWeightedLoss): + """Cross Entropy Loss.""" + def _forward(self, score, labels, **kwargs): + """Forward function. + Args: + score (paddle.Tensor): The class score. + labels (paddle.Tensor): The ground truth labels. + kwargs: Any keyword argument to be used to calculate + CrossEntropy loss. + Returns: + loss (paddle.Tensor): The returned CrossEntropy loss. + """ + loss = F.cross_entropy(score, labels, **kwargs) + return loss diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/losses/depth_loss.py b/Bank_second_part/detect_process/paddlevideo/modeling/losses/depth_loss.py new file mode 100644 index 0000000..ba9a2cb --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/losses/depth_loss.py @@ -0,0 +1,290 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn + +from ..registry import LOSSES +from .base import BaseWeightedLoss + + +def get_smooth_loss(disp, img): + """Computes the smoothness loss for a disparity image + The color image is used for edge-aware smoothness + """ + grad_disp_x = paddle.abs(disp[:, :, :, :-1] - disp[:, :, :, 1:]) + grad_disp_y = paddle.abs(disp[:, :, :-1, :] - disp[:, :, 1:, :]) + + grad_img_x = paddle.mean(paddle.abs(img[:, :, :, :-1] - img[:, :, :, 1:]), + 1, + keepdim=True) + grad_img_y = paddle.mean(paddle.abs(img[:, :, :-1, :] - img[:, :, 1:, :]), + 1, + keepdim=True) + + grad_disp_x *= paddle.exp(-grad_img_x) + grad_disp_y *= paddle.exp(-grad_img_y) + + return grad_disp_x.mean() + grad_disp_y.mean() + + +class DiffLoss(nn.Layer): + def __init__(self): + super(DiffLoss, self).__init__() + + def forward(self, input1, input2): + batch_size = input1.shape[0] + input1 = input1.reshape([batch_size, -1]) + input2 = input2.reshape([batch_size, -1]) + + input1_l2 = input1 + input2_l2 = input2 + + diff_loss = 0 + dim = input1.shape[1] + for i in range(input1.shape[0]): + diff_loss = diff_loss + paddle.mean( + ((input1_l2[i:i + 1, :].mm(input2_l2[i:i + 1, :].T)).pow(2)) / + dim) + + diff_loss = diff_loss / input1.shape[0] + + return diff_loss + + +class MSE(nn.Layer): + def __init__(self): + super(MSE, self).__init__() + + def forward(self, pred, real): + diffs = paddle.add(real, -pred) + n = paddle.numel(diffs) + mse = paddle.sum(diffs.pow(2)) / n + + return mse + + +class SIMSE(nn.Layer): + def __init__(self): + super(SIMSE, self).__init__() + + def forward(self, pred, real): + diffs = paddle.add(real, -pred) + n = paddle.numel(diffs) + simse = paddle.sum(diffs).pow(2) / (n**2) + + return simse + + +class SSIM(nn.Layer): + """Layer to compute the SSIM loss between a pair of images + """ + def __init__(self): + super(SSIM, self).__init__() + self.mu_x_pool = nn.AvgPool2D(3, 1, exclusive=False) + self.mu_y_pool = nn.AvgPool2D(3, 1, exclusive=False) + self.sig_x_pool = 
nn.AvgPool2D(3, 1, exclusive=False) + self.sig_y_pool = nn.AvgPool2D(3, 1, exclusive=False) + self.sig_xy_pool = nn.AvgPool2D(3, 1, exclusive=False) + + self.refl = nn.Pad2D(1, mode='reflect') + + self.C1 = 0.01**2 + self.C2 = 0.03**2 + + def forward(self, x, y): + x = self.refl(x) + y = self.refl(y) + + mu_x = self.mu_x_pool(x) + mu_y = self.mu_y_pool(y) + + sigma_x = self.sig_x_pool(x**2) - mu_x**2 + sigma_y = self.sig_y_pool(y**2) - mu_y**2 + sigma_xy = self.sig_xy_pool(x * y) - mu_x * mu_y + + SSIM_n = (2 * mu_x * mu_y + self.C1) * (2 * sigma_xy + self.C2) + SSIM_d = (mu_x**2 + mu_y**2 + self.C1) * (sigma_x + sigma_y + self.C2) + + return paddle.clip((1 - SSIM_n / SSIM_d) / 2, 0, 1) + + +@LOSSES.register() +class ADDSLoss(BaseWeightedLoss): + def __init__(self, avg_reprojection, disparity_smoothness, no_ssim): + super(ADDSLoss, self).__init__() + self.avg_reprojection = avg_reprojection + self.disparity_smoothness = disparity_smoothness + self.no_ssim = no_ssim + + self.loss_diff = DiffLoss() + self.loss_recon1 = MSE() + self.loss_recon2 = SIMSE() + self.loss_similarity = MSE() + + def compute_reprojection_loss(self, pred, target): + """Computes reprojection loss between a batch of predicted and target images + """ + abs_diff = paddle.abs(target - pred) + l1_loss = abs_diff.mean(1, True) + + if not self.no_ssim: + self.ssim = SSIM() + + if self.no_ssim: + reprojection_loss = l1_loss + else: + ssim_loss = self.ssim(pred, target).mean(1, True) + reprojection_loss = 0.85 * ssim_loss + 0.15 * l1_loss + + return reprojection_loss + + def compute_losses(self, inputs, outputs, is_night): + """Compute the reprojection and smoothness losses for a minibatch + """ + losses = {} + total_loss = 0 + + for scale in outputs['scales']: + loss = 0 + reprojection_losses = [] + + source_scale = 0 + + disp = outputs[("disp", scale)] + if is_night: + color = inputs[("color_n", 0, scale)] + target = inputs[("color_n", 0, source_scale)] + else: + color = inputs[("color", 0, scale)] + target = inputs[("color", 0, source_scale)] + + for frame_id in outputs['frame_ids'][1:]: + pred = outputs[("color", frame_id, scale)] + reprojection_losses.append( + self.compute_reprojection_loss(pred, target)) + + reprojection_losses = paddle.concat(reprojection_losses, 1) + + identity_reprojection_losses = [] + for frame_id in outputs['frame_ids'][1:]: + if is_night: + pred = inputs[("color_n", frame_id, source_scale)] + else: + pred = inputs[("color", frame_id, source_scale)] + identity_reprojection_losses.append( + self.compute_reprojection_loss(pred, target)) + + identity_reprojection_losses = paddle.concat( + identity_reprojection_losses, 1) + + if self.avg_reprojection: + identity_reprojection_loss = identity_reprojection_losses.mean( + 1, keepdim=True) + else: + # save both images, and do min all at once below + identity_reprojection_loss = identity_reprojection_losses + + if self.avg_reprojection: + reprojection_loss = reprojection_losses.mean(1, keepdim=True) + else: + reprojection_loss = reprojection_losses + + # add random numbers to break ties + identity_reprojection_loss = identity_reprojection_loss + paddle.randn( + identity_reprojection_loss.shape) * 0.00001 + + combined = paddle.concat( + (identity_reprojection_loss, reprojection_loss), axis=1) + if combined.shape[1] == 1: + to_optimise = combined + else: + to_optimise = paddle.min(combined, axis=1) + + loss = loss + to_optimise.mean() + + mean_disp = disp.mean(2, True).mean(3, True) + norm_disp = disp / (mean_disp + 1e-7) + smooth_loss = 
get_smooth_loss(norm_disp, color) + + loss = loss + self.disparity_smoothness * smooth_loss / (2**scale) + total_loss = total_loss + loss + losses["loss/{}".format(scale)] = loss + + total_loss /= len(outputs['scales']) + losses["loss"] = total_loss + return losses + + def forward(self, inputs, outputs): + + losses_day = self.compute_losses(inputs, outputs, 'day') + losses_night = self.compute_losses(inputs, outputs['outputs_night'], + 'night') + + loss = 0 + losses = [] + # diff + target_diff1 = 0.5 * self.loss_diff( + outputs['result'][0], outputs['result'][2]) # 10 when batchsize=1 + target_diff2 = 0.5 * self.loss_diff(outputs['result_night'][0], + outputs['result_night'][2]) + losses.append(target_diff1) + losses.append(target_diff2) + loss = loss + target_diff1 + loss = loss + target_diff2 + + target_diff3 = 1 * self.loss_diff( + outputs['result'][1], outputs['result'][3]) # 10 when batchsize=1 + target_diff4 = 1 * self.loss_diff(outputs['result_night'][1], + outputs['result_night'][3]) + losses.append(target_diff3) + losses.append(target_diff4) + loss = loss + target_diff3 + loss = loss + target_diff4 + + # recon + target_mse = 1 * self.loss_recon1(outputs['result'][5], + inputs["color_aug", 0, 0]) + loss = loss + target_mse + + target_simse = 1 * self.loss_recon2(outputs['result'][5], + inputs["color_aug", 0, 0]) + loss = loss + target_simse + + losses.append(target_mse) + losses.append(target_simse) + target_mse_night = 1 * self.loss_recon1(outputs['result_night'][5], + inputs["color_n_aug", 0, 0]) + loss = loss + target_mse_night + + target_simse_night = 1 * self.loss_recon2(outputs['result_night'][5], + inputs["color_n_aug", 0, 0]) + loss = loss + target_simse_night + + losses.append(target_mse_night) + losses.append(target_simse_night) + + # depth loss + pseudo_label = outputs[("disp", 0)].detach() + depth_loss = 1 * self.loss_similarity( + outputs['outputs_night'][("disp", 0)], pseudo_label) + loss = loss + depth_loss + + losses.append(depth_loss) + + outputs['loss'] = loss + losses_day['loss'] + losses_night['loss'] + outputs['losses_day'] = losses_day['loss'] + outputs['losses_night'] = losses_night['loss'] + + return outputs diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/losses/distillation_loss.py b/Bank_second_part/detect_process/paddlevideo/modeling/losses/distillation_loss.py new file mode 100644 index 0000000..d27f941 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/losses/distillation_loss.py @@ -0,0 +1,79 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn.functional as F +import paddle.nn as nn + +from ..registry import LOSSES +from .base import BaseWeightedLoss + + +@LOSSES.register() +class DistillationCELoss(BaseWeightedLoss): + """Distillation Entropy Loss.""" + def _forward(self, score, labels, **kwargs): + """Forward function. + Args: + score (paddle.Tensor): The class score. + labels (paddle.Tensor): The ground truth labels. 
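+                Either a single-element list ``[label]`` for plain training, or
+                ``[label_a, label_b, lam]`` when VideoMix is used.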
+ kwargs: Any keyword argument to be used to calculate + CrossEntropy loss. + Returns: + loss (paddle.Tensor): The returned CrossEntropy loss. + """ + if len(labels) == 1: + label = labels[0] + loss = F.cross_entropy(score, label, **kwargs) + # Deal with VideoMix + elif len(labels) == 3: + label_a, label_b, lam = labels + loss_a = F.cross_entropy(score, label_a, **kwargs) + loss_b = F.cross_entropy(score, label_b, **kwargs) + loss = lam * loss_a + (1 - lam) * loss_b + loss = paddle.mean(loss) #lam shape is bs + return loss + + +@LOSSES.register() +class DistillationDMLLoss(BaseWeightedLoss): + """ + DistillationDMLLoss + """ + def __init__(self, act="softmax", eps=1e-12, **kargs): + super().__init__(**kargs) + if act is not None: + assert act in ["softmax", "sigmoid"] + if act == "softmax": + self.act = nn.Softmax(axis=-1) + elif act == "sigmoid": + self.act = nn.Sigmoid() + else: + self.act = None + self.eps = eps + + def _kldiv(self, x, target): + class_num = x.shape[-1] + cost = target * paddle.log( + (target + self.eps) / (x + self.eps)) * class_num + return cost + + def _forward(self, x, target): + if self.act is not None: + x = self.act(x) + target = self.act(target) + loss = self._kldiv(x, target) + self._kldiv(target, x) + loss = loss / 2 + loss = paddle.mean(loss) + return loss diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/losses/transnetv2_loss.py b/Bank_second_part/detect_process/paddlevideo/modeling/losses/transnetv2_loss.py new file mode 100644 index 0000000..624c468 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/losses/transnetv2_loss.py @@ -0,0 +1,56 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn.functional as F +from ..registry import LOSSES +from .base import BaseWeightedLoss + + +@LOSSES.register() +class TransNetV2Loss(BaseWeightedLoss): + """Loss for TransNetV2 model + """ + def __init__(self, transition_weight=5.0, many_hot_loss_weight=0.1): + self.transition_weight = transition_weight + self.many_hot_loss_weight = many_hot_loss_weight + super().__init__() + + def _forward(self, one_hot_pred, one_hot_gt, + many_hot_pred=None, many_hot_gt=None, reg_losses=None): + assert transition_weight != 1 + + one_hot_pred = one_hot_pred[:, :, 0] + + one_hot_gt = one_hot_gt.astype('float32') + one_hot_loss = F.binary_cross_entropy_with_logits(logit=one_hot_pred, label=one_hot_gt, reduction='none') + + one_hot_loss *= 1 + one_hot_gt * (transition_weight - 1) + + one_hot_loss = paddle.mean(one_hot_loss) + + many_hot_loss = 0. + if many_hot_loss_weight != 0. 
and many_hot_pred is not None: + many_hot_loss = many_hot_loss_weight * paddle.mean( + F.binary_cross_entropy_with_logits(logit=many_hot_pred[:, :, 0], + label=many_hot_gt.astype('float32'), reduction='none')) + + total_loss = one_hot_loss + many_hot_loss + + if reg_losses is not None: + for name, value in reg_losses.items(): + if value is not None: + total_loss += value + + return total_loss \ No newline at end of file diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/losses/yowo_loss.py b/Bank_second_part/detect_process/paddlevideo/modeling/losses/yowo_loss.py new file mode 100644 index 0000000..5ca3290 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/losses/yowo_loss.py @@ -0,0 +1,251 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy +import paddle +import paddle.nn.functional as F +import paddle.nn as nn +from paddle.static import Variable + +from ..registry import LOSSES +from .base import BaseWeightedLoss +from ..framework.localizers.yowo_utils import build_targets + + +class FocalLoss(nn.Layer): + """ + This criterion is a implemenation of Focal Loss, which is proposed in + Focal Loss for Dense Object Detection. + + Loss(x, class) = - \alpha (1-softmax(x)[class])^gamma \log(softmax(x)[class]) + + The losses are averaged across observations for each minibatch. + + Args: + alpha(1D Tensor, Variable) : the scalar factor for this criterion + gamma(float, double) : gamma > 0; reduces the relative loss for well-classified examples (p > .5), + putting more focus on hard, misclassified examples + size_average(bool): size_average(bool): By default, the losses are averaged over observations for each minibatch. + However, if the field size_average is set to False, the losses are + instead summed for each minibatch. 
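+        Example (illustrative shapes and values only):
+            criterion = FocalLoss(class_num=num_classes, gamma=2, size_average=False)
+            # cls_scores: (N, num_classes), cls_targets: (N,)
+            loss = criterion(cls_scores, cls_targets)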
+ + """ + + def __init__(self, class_num, alpha=None, gamma=2, size_average=True): + super(FocalLoss, self).__init__() + + if alpha is None: + self.alpha = paddle.ones( + [class_num, 1]) + self.alpha.stop_gradient = False + else: + if isinstance(alpha, Variable): + self.alpha = alpha + else: + self.alpha = (alpha) + self.alpha.stop_gradient = False + self.gamma = gamma + self.class_num = class_num + self.size_average = size_average + + def forward(self, inputs, targets): + N = inputs.shape[0] + C = inputs.shape[1] + P = F.softmax(inputs, axis=1) + + tmp = numpy.zeros((N, C)) + class_mask = paddle.to_tensor(tmp, place=inputs.place) + class_mask.stop_gradient = False + ids = paddle.reshape(targets, [-1, 1]) + class_mask = F.one_hot(ids.squeeze(-1), class_mask.shape[1]) + + if "Place" not in str(inputs.place) and "Place" not in str(self.alpha.place): + self.alpha = self.alpha.cuda() + + alpha = self.alpha[paddle.reshape(ids.detach(), [-1])] + + probs = paddle.reshape((P * class_mask).sum(1), [-1, 1]) + + log_p = probs.log() + + batch_loss = -alpha * (paddle.pow((1 - probs), self.gamma)) * log_p + + if self.size_average: + loss = batch_loss.mean() + else: + loss = batch_loss.sum() + return loss + + +@LOSSES.register() +class RegionLoss(BaseWeightedLoss): + # for our model anchors has 10 values and number of anchors is 5 + # parameters: 24, 10 float values, 24, 5 + def __init__(self, num_classes, anchors, num_anchors, object_scale, noobject_scale, class_scale, coord_scale): + super().__init__() + self.num_classes = num_classes + self.anchors = [float(x) for x in anchors] + self.num_anchors = num_anchors + self.anchor_step = len(self.anchors) // self.num_anchors # each anchor has 2 parameters + self.object_scale = object_scale + self.noobject_scale = noobject_scale + self.class_scale = class_scale + self.coord_scale = coord_scale + self.focalloss = FocalLoss(class_num=self.num_classes, gamma=2, size_average=False) + self.thresh = 0.6 + + def convert2cpu(self, gpu_matrix): + # return paddle.to_tensor((gpu_matrix.shape), dtype="float32").copy_(gpu_matrix) + return gpu_matrix.cpu() + + def forward(self, output, target): + # output : B*A*(4+1+num_classes)*H*W 8*5*29*24*24 + # B: number of batches + # A: number of anchors + # 4: 4 parameters for each bounding box + # 1: confidence score + # num_classes + # H: height of the image (in grids) + # W: width of the image (in grids) + # for each grid cell, there are A*(4+1+num_classes) parameters + nB = output.detach().shape[0] # batch + nA = self.num_anchors # anchor_num + nC = self.num_classes + nH = output.detach().shape[2] + nW = output.detach().shape[3] + + # resize the output (all parameters for each anchor can be reached) + output = paddle.reshape(output, [nB, nA, (5 + nC), nH, nW]) + # anchor's parameter tx + + x = F.sigmoid( + paddle.reshape(paddle.index_select(output, paddle.to_tensor([0], dtype='int64').cuda(), axis=2), + [nB, nA, nH, nW])) + x.stop_gradient = False + # anchor's parameter ty + y = F.sigmoid( + paddle.reshape(paddle.index_select(output, paddle.to_tensor([1], dtype='int64').cuda(), axis=2), + [nB, nA, nH, nW])) + y.stop_gradient = False + # anchor's parameter tw + w = paddle.reshape(paddle.index_select(output, paddle.to_tensor([2], dtype='int64').cuda(), axis=2), + [nB, nA, nH, nW]) + w.stop_gradient = False + # anchor's parameter th + h = paddle.reshape(paddle.index_select(output, paddle.to_tensor([3], dtype='int64').cuda(), axis=2), + [nB, nA, nH, nW]) + h.stop_gradient = False + # confidence score for each anchor + conf = 
F.sigmoid( + paddle.reshape(paddle.index_select(output, paddle.to_tensor([4], dtype='int64').cuda(), axis=2), + [nB, nA, nH, nW])) + conf.stop_gradient = False + # anchor's parameter class label + cls = paddle.index_select(output, paddle.linspace(5, 5 + nC - 1, nC, 'int64').cuda(), axis=2) + cls.stop_gradient = False + # resize the data structure so that for every anchor there is a class label in the last dimension + cls = paddle.reshape(paddle.transpose(paddle.reshape(cls, [nB * nA, nC, nH * nW]), [0, 2, 1]), + [nB * nA * nH * nW, nC]) + + # for the prediction of localization of each bounding box, there exist 4 parameters (tx, ty, tw, th) + # pred_boxes = torch.cuda.FloatTensor(4, nB*nA*nH*nW) + pred_boxes = paddle.zeros([4, nB * nA * nH * nW], dtype='float32').cuda() + # tx and ty + grid_x = paddle.reshape(paddle.tile(paddle.tile(paddle.linspace(0, nW - 1, nW), [nH, 1]), [nB * nA, 1, 1]), + [nB * nA * nH * nW]).cuda() + grid_y = paddle.reshape(paddle.tile(paddle.tile(paddle.linspace(0, nH - 1, nH), [nW, 1]).t(), [nB * nA, 1, 1]), + [nB * nA * nH * nW]).cuda() + # for each anchor there are anchor_step variables (with the structure num_anchor*anchor_step) + # for each row(anchor), the first variable is anchor's width, second is anchor's height + # pw and ph + anchor_w = paddle.index_select(paddle.reshape(paddle.to_tensor(self.anchors), [nA, self.anchor_step]), + paddle.to_tensor([0], dtype='int64'), axis=1).cuda() + anchor_h = paddle.index_select(paddle.reshape(paddle.to_tensor(self.anchors), [nA, self.anchor_step]), + paddle.to_tensor([1], dtype='int64'), axis=1).cuda() + # for each pixel (grid) repeat the above process (obtain width and height of each grid) + anchor_w = paddle.reshape(paddle.tile(paddle.tile(anchor_w, [nB, 1]), [1, 1, nH * nW]), [nB * nA * nH * nW]) + anchor_h = paddle.reshape(paddle.tile(paddle.tile(anchor_h, [nB, 1]), [1, 1, nH * nW]), [nB * nA * nH * nW]) + # prediction of bounding box localization + # x.data and y.data: top left corner of the anchor + # grid_x, grid_y: tx and ty predictions made by yowo + + x_data = paddle.reshape(x.detach(), [-1]) + y_data = paddle.reshape(y.detach(), [-1]) + w_data = paddle.reshape(w.detach(), [-1]) + h_data = paddle.reshape(h.detach(), [-1]) + + pred_boxes[0] = paddle.cast(x_data, dtype='float32') + paddle.cast(grid_x, dtype='float32') # bx + pred_boxes[1] = paddle.cast(y_data, dtype='float32') + paddle.cast(grid_y, dtype='float32') # by + pred_boxes[2] = paddle.exp(paddle.cast(w_data, dtype='float32')) * paddle.cast(anchor_w, dtype='float32') # bw + pred_boxes[3] = paddle.exp(paddle.cast(h_data, dtype='float32')) * paddle.cast(anchor_h, dtype='float32') # bh + # the size -1 is inferred from other dimensions + # pred_boxes (nB*nA*nH*nW, 4) + + pred_boxes = self.convert2cpu( + paddle.cast(paddle.reshape(paddle.transpose(pred_boxes, (1, 0)), [-1, 4]), dtype='float32')) + + nGT, nCorrect, coord_mask, conf_mask, cls_mask, tx, ty, tw, th, tconf, tcls = build_targets(pred_boxes, + target.detach(), + self.anchors, nA, + nC, \ + nH, nW, + self.noobject_scale, + self.object_scale, + self.thresh) + cls_mask = (cls_mask == 1) + # keep those with high box confidence scores (greater than 0.25) as our final predictions + nProposals = int((conf > 0.25).sum().detach().item()) + + tx = (tx).cuda() + tx.stop_gradient = False + ty = ty.cuda() + ty.stop_gradient = False + tw = tw.cuda() + tw.stop_gradient = False + th = th.cuda() + th.stop_gradient = False + tconf = tconf.cuda() + tconf.stop_gradient = False + + tcls = paddle.reshape(tcls, 
[-1]).astype('int64')[paddle.reshape(cls_mask, [-1])].cuda() + tcls.stop_gradient = False + + coord_mask = coord_mask.cuda() + coord_mask.stop_gradient = False + conf_mask = conf_mask.cuda().sqrt() + coord_mask.stop_gradient = False + cls_mask = paddle.tile(paddle.reshape(cls_mask, [-1, 1]), [1, nC]).cuda() + cls_mask.stop_gradient = False + + cls = paddle.reshape(cls[cls_mask], [-1, nC]) + + # losses between predictions and targets (ground truth) + # In total 6 aspects are considered as losses: + # 4 for bounding box location, 2 for prediction confidence and classification seperately + L1_loss = nn.SmoothL1Loss(reduction='sum') + loss_x = self.coord_scale * L1_loss(paddle.cast(x, dtype="float32") * coord_mask, tx * coord_mask) / 2.0 + loss_y = self.coord_scale * L1_loss(paddle.cast(y, dtype="float32") * coord_mask, ty * coord_mask) / 2.0 + loss_w = self.coord_scale * L1_loss(paddle.cast(w * coord_mask, dtype="float32"), tw * coord_mask) / 2.0 + loss_h = self.coord_scale * L1_loss(paddle.cast(h * coord_mask, dtype="float32"), th * coord_mask) / 2.0 + loss_conf = nn.MSELoss(reduction='sum')(paddle.cast(conf, dtype="float32") * conf_mask, tconf * conf_mask) / 2.0 + + # try focal loss with gamma = 2 + loss_cls = self.class_scale * self.focalloss(cls, tcls) + + # sum of loss + loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls + + return loss, nCorrect + + diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/registry.py b/Bank_second_part/detect_process/paddlevideo/modeling/registry.py new file mode 100644 index 0000000..b8140e1 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/registry.py @@ -0,0 +1,31 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..utils import Registry + +BACKBONES = Registry('backbone') +HEADS = Registry('head') +RECOGNIZERS = Registry('recognizer') +SEGMENTERS = Registry('Segmenters') +LOCALIZERS = Registry('localizer') +PARTITIONERS = Registry('partitioner') +LOSSES = Registry('loss') +ROI_EXTRACTORS = Registry('roi_extractor') +DETECTORS = Registry('detectors') +BBOX_ASSIGNERS = Registry('bbox_assigner') +BBOX_SAMPLERS = Registry('bbox_sampler') +BBOX_CODERS = Registry('bbox_coder') +ESTIMATORS = Registry('estimator') +MULTIMODAL = Registry('multimodal') +SEGMENT = Registry('segment') diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/samplers/__init__.py b/Bank_second_part/detect_process/paddlevideo/modeling/samplers/__init__.py new file mode 100644 index 0000000..0cf7f15 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/samplers/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .random_sampler import RandomSampler + +__all__ = ['RandomSampler'] diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/samplers/__pycache__/__init__.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/samplers/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..0155487 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/samplers/__pycache__/__init__.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/samplers/__pycache__/random_sampler.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/modeling/samplers/__pycache__/random_sampler.cpython-310.pyc new file mode 100644 index 0000000..5598aaa Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/modeling/samplers/__pycache__/random_sampler.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/samplers/random_sampler.py b/Bank_second_part/detect_process/paddlevideo/modeling/samplers/random_sampler.py new file mode 100644 index 0000000..4808454 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/samplers/random_sampler.py @@ -0,0 +1,146 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import paddle +import numpy as np +from ..registry import BBOX_SAMPLERS + +class SamplingResult(): + """Bbox sampling result. 
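+    Holds the indices of the sampled positive/negative proposals together with their
+    boxes, the matched ground-truth boxes and (optionally) the ground-truth labels.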
""" + + def __init__(self, pos_inds, neg_inds, bboxes, gt_bboxes, assign_result, + gt_flags): + self.pos_inds = pos_inds + self.neg_inds = neg_inds + self.pos_bboxes = paddle.index_select(bboxes,pos_inds) + + # neg_inds may be empty + if neg_inds.shape[0]!=0: + self.neg_bboxes = paddle.index_select(bboxes,neg_inds) + else: + self.neg_bboxes=None + + self.pos_is_gt = paddle.index_select(gt_flags,pos_inds) + self.num_gts = gt_bboxes.shape[0] + self.pos_assigned_gt_inds = paddle.index_select(assign_result.gt_inds,pos_inds) - 1 + + if float(gt_bboxes.numel()) == 0: + assert self.pos_assigned_gt_inds.numel() == 0 + self.pos_gt_bboxes = paddle.empty_like(gt_bboxes).view(-1, 4) + else: + if len(gt_bboxes.shape) < 2: + gt_bboxes = gt_bboxes.view(-1, 4) + + self.pos_gt_bboxes = paddle.index_select(gt_bboxes, self.pos_assigned_gt_inds) + + if assign_result.labels is not None: + self.pos_gt_labels = paddle.index_select(assign_result.labels, pos_inds) + else: + self.pos_gt_labels = None + + @property + def bboxes(self): + if self.neg_bboxes is not None: + ret = paddle.concat([self.pos_bboxes, self.neg_bboxes]) + else: + # neg bbox may be empty + ret = self.pos_bboxes + return ret + + + +@BBOX_SAMPLERS.register() +class RandomSampler(): + def __init__(self, + num, + pos_fraction, + neg_pos_ub=-1, + add_gt_as_proposals=True, + **kwargs): + self.num = num + self.pos_fraction = pos_fraction + self.neg_pos_ub = neg_pos_ub + self.add_gt_as_proposals = add_gt_as_proposals + + def sample(self, + assign_result, + bboxes, + gt_bboxes, + gt_labels=None, + **kwargs): + """Sample positive and negative bboxes. """ + + if len(bboxes.shape) < 2: + bboxes = bboxes[None, :] + + bboxes = bboxes[:, :4] + + gt_flags = paddle.full([bboxes.shape[0], ], 0, dtype='int32') + if self.add_gt_as_proposals and len(gt_bboxes) > 0: + if gt_labels is None: + raise ValueError( + 'gt_labels must be given when add_gt_as_proposals is True') + bboxes = paddle.concat([gt_bboxes, bboxes]) + assign_result.add_gt_(gt_labels) + gt_ones = paddle.full([gt_bboxes.shape[0], ], 1, dtype='int32') + gt_flags = paddle.concat([gt_ones, gt_flags]) + + #1. 得到正样本的数量, inds + num_expected_pos = int(self.num * self.pos_fraction) + pos_inds = self._sample_pos( assign_result, num_expected_pos, bboxes=bboxes, **kwargs) + pos_inds = paddle.to_tensor(np.unique(pos_inds.numpy())) + + #2. 得到负样本的数量, inds + num_sampled_pos = pos_inds.numel() + num_expected_neg = self.num - num_sampled_pos + neg_inds = self._sample_neg( + assign_result, num_expected_neg, bboxes=bboxes, **kwargs) + neg_inds = paddle.to_tensor(np.unique(neg_inds.numpy())) + + #3. 得到sampling result + sampling_result = SamplingResult(pos_inds, neg_inds, bboxes, gt_bboxes, + assign_result, gt_flags) + return sampling_result + def random_choice(self, gallery, num): + """Random select some elements from the gallery. """ + assert len(gallery) >= num + + perm = paddle.arange(gallery.numel())[:num] + perm = paddle.randperm(gallery.numel())[:num] + rand_inds = paddle.index_select(gallery, perm) + return rand_inds + + def _sample_pos(self, assign_result, num_expected, **kwargs): + """Randomly sample some positive samples.""" + #1.首先看一下给的bboxes里面有哪些label是大于0的 得到了他们的index + pos_inds = paddle.nonzero(assign_result.gt_inds, as_tuple=False) + + #2. 
只要这个pos_inds的数目不是0个 这些就都可以是positive sample + # 当pos_inds的数目小于num_expected(想要的sample的最大数目), 就直接用这个pos_inds + # 反之就从这么多index里随机采样num_expected个出来 + if float(pos_inds.numel()) != 0: + pos_inds = pos_inds.squeeze() + if float(pos_inds.numel()) <= num_expected: + return pos_inds + else: + return self.random_choice(pos_inds, num_expected) + + def _sample_neg(self, assign_result, num_expected, **kwargs): + """Randomly sample some negative samples.""" + neg_inds = paddle.nonzero(assign_result.gt_inds == 0, as_tuple=False) + if float(neg_inds.numel()) != 0: + neg_inds = neg_inds.squeeze() + if (float(neg_inds.numel())) <= float(num_expected): + return neg_inds + else: + return self.random_choice(neg_inds, num_expected) diff --git a/Bank_second_part/detect_process/paddlevideo/modeling/weight_init.py b/Bank_second_part/detect_process/paddlevideo/modeling/weight_init.py new file mode 100644 index 0000000..4722895 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/modeling/weight_init.py @@ -0,0 +1,157 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import paddle +import paddle.nn.initializer as init +import numpy as np +from scipy import special + + +def weight_init_(layer, + func, + weight_name=None, + bias_name=None, + bias_value=0.0, + **kwargs): + """ + In-place params init function. + Usage: + .. code-block:: python + + import paddle + import numpy as np + + data = np.ones([3, 4], dtype='float32') + linear = paddle.nn.Linear(4, 4) + input = paddle.to_tensor(data) + print(linear.weight) + linear(input) + + weight_init_(linear, 'Normal', 'fc_w0', 'fc_b0', std=0.01, mean=0.1) + print(linear.weight) + """ + + if hasattr(layer, 'weight') and layer.weight is not None: + getattr(init, func)(**kwargs)(layer.weight) + if weight_name is not None: + # override weight name + layer.weight.name = weight_name + + if hasattr(layer, 'bias') and layer.bias is not None: + init.Constant(bias_value)(layer.bias) + if bias_name is not None: + # override bias name + layer.bias.name = bias_name + + +def _no_grad_trunc_normal_(tensor, mean, std, a, b): + def norm_cdf(x): + # Computes standard normal cumulative distribution function + return (1. + math.erf(x / math.sqrt(2.))) / 2. + + if (mean < a - 2 * std) or (mean > b + 2 * std): + print("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. " + "The distribution of values may be incorrect.") + + with paddle.no_grad(): + # Values are generated by using a truncated uniform distribution and + # then using the inverse CDF for the normal distribution. + # Get upper and lower cdf values + l = norm_cdf((a - mean) / std) + u = norm_cdf((b - mean) / std) + + # Uniformly fill tensor with values from [l, u], then translate to [2l-1, 2u-1]. 
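+        # erfinv below converts these uniform samples into truncated standard-normal
+        # samples, which are then rescaled by std, shifted by mean and clipped to [a, b]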
+ tmp = np.random.uniform(2 * l - 1, 2 * u - 1, + size=list(tensor.shape)).astype(np.float32) + + # Use inverse cdf transform for normal distribution to get truncated + # standard normal + tmp = special.erfinv(tmp) + + # Transform to proper mean, std + tmp *= (std * math.sqrt(2.0)) + tmp += mean + + # Clamp to ensure it's in the proper range + tmp = np.clip(tmp, a, b) + tensor.set_value(paddle.to_tensor(tmp)) + + return tensor + + +def _calculate_fan_in_and_fan_out(tensor): + dimensions = tensor.dim() + if dimensions < 2: + raise ValueError( + "Fan in and fan out can not be computed for tensor with fewer than 2 dimensions" + ) + + num_input_fmaps = tensor.shape[1] + num_output_fmaps = tensor.shape[0] + receptive_field_size = 1 + if tensor.dim() > 2: + receptive_field_size = tensor[0][0].numel() + fan_in = num_input_fmaps * receptive_field_size + fan_out = num_output_fmaps * receptive_field_size + + return fan_in, fan_out + + +def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.): + return _no_grad_trunc_normal_(tensor, mean, std, a, b) + + +def kaiming_normal_(tensor, a=0., mode='fan_in', nonlinearity='leaky_relu'): + def _calculate_correct_fan(tensor, mode): + mode = mode.lower() + valid_modes = ['fan_in', 'fan_out'] + if mode not in valid_modes: + raise ValueError( + "Mode {} not supported, please use one of {}".format( + mode, valid_modes)) + + fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor) + return fan_in if mode == 'fan_in' else fan_out + + def calculate_gain(nonlinearity, param=None): + linear_fns = [ + 'linear', 'conv1d', 'conv2d', 'conv3d', 'conv_transpose1d', + 'conv_transpose2d', 'conv_transpose3d' + ] + if nonlinearity in linear_fns or nonlinearity == 'sigmoid': + return 1 + elif nonlinearity == 'tanh': + return 5.0 / 3 + elif nonlinearity == 'relu': + return math.sqrt(2.0) + elif nonlinearity == 'leaky_relu': + if param is None: + negative_slope = 0.01 + elif not isinstance(param, bool) and isinstance( + param, int) or isinstance(param, float): + negative_slope = param + else: + raise ValueError( + "negative_slope {} not a valid number".format(param)) + return math.sqrt(2.0 / (1 + negative_slope**2)) + else: + raise ValueError("Unsupported nonlinearity {}".format(nonlinearity)) + + fan = _calculate_correct_fan(tensor, mode) + gain = calculate_gain(nonlinearity, a) + std = gain / math.sqrt(fan) + with paddle.no_grad(): + paddle.nn.initializer.Normal(0, std)(tensor) + return tensor diff --git a/Bank_second_part/detect_process/paddlevideo/solver/__init__.py b/Bank_second_part/detect_process/paddlevideo/solver/__init__.py new file mode 100644 index 0000000..01cf9cd --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/solver/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from .optimizer import build_optimizer +from .lr import build_lr diff --git a/Bank_second_part/detect_process/paddlevideo/solver/custom_lr.py b/Bank_second_part/detect_process/paddlevideo/solver/custom_lr.py new file mode 100644 index 0000000..bbf8d74 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/solver/custom_lr.py @@ -0,0 +1,338 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from paddle.optimizer.lr import * +import numpy as np +""" +PaddleVideo Learning Rate Schedule: +You can use paddle.optimizer.lr +or define your custom_lr in this file. +""" + + +class CustomWarmupCosineDecay(LRScheduler): + r""" + We combine warmup and stepwise-cosine which is used in slowfast model. + + Args: + warmup_start_lr (float): start learning rate used in warmup stage. + warmup_epochs (int): the number epochs of warmup. + cosine_base_lr (float|int, optional): base learning rate in cosine schedule. + max_epoch (int): total training epochs. + num_iters(int): number iterations of each epoch. + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. + verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . + Returns: + ``CosineAnnealingDecay`` instance to schedule learning rate. + """ + + def __init__(self, + warmup_start_lr, + warmup_epochs, + cosine_base_lr, + max_epoch, + num_iters, + last_epoch=-1, + verbose=False): + self.warmup_start_lr = warmup_start_lr + self.warmup_epochs = warmup_epochs + self.cosine_base_lr = cosine_base_lr + self.max_epoch = max_epoch + self.num_iters = num_iters + #call step() in base class, last_lr/last_epoch/base_lr will be update + super(CustomWarmupCosineDecay, self).__init__(last_epoch=last_epoch, + verbose=verbose) + + def step(self, epoch=None): + """ + ``step`` should be called after ``optimizer.step`` . It will update the learning rate in optimizer according to current ``epoch`` . + The new learning rate will take effect on next ``optimizer.step`` . + Args: + epoch (int, None): specify current epoch. Default: None. Auto-increment from last_epoch=-1. + Returns: + None + """ + if epoch is None: + if self.last_epoch == -1: + self.last_epoch += 1 + else: + self.last_epoch += 1 / self.num_iters # update step with iters + else: + self.last_epoch = epoch + self.last_lr = self.get_lr() + + if self.verbose: + print('Epoch {}: {} set learning rate to {}.'.format( + self.last_epoch, self.__class__.__name__, self.last_lr)) + + def _lr_func_cosine(self, cur_epoch, cosine_base_lr, max_epoch): + return cosine_base_lr * (math.cos(math.pi * cur_epoch / max_epoch) + + 1.0) * 0.5 + + def get_lr(self): + """Define lr policy""" + lr = self._lr_func_cosine(self.last_epoch, self.cosine_base_lr, + self.max_epoch) + lr_end = self._lr_func_cosine(self.warmup_epochs, self.cosine_base_lr, + self.max_epoch) + + # Perform warm up. 
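+        # during warmup, linearly interpolate from warmup_start_lr to the cosine lr
+        # reached at warmup_epochs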
+ if self.last_epoch < self.warmup_epochs: + lr_start = self.warmup_start_lr + alpha = (lr_end - lr_start) / self.warmup_epochs + lr = self.last_epoch * alpha + lr_start + return lr + + +class CustomWarmupPiecewiseDecay(LRScheduler): + r""" + This op combine warmup and stepwise-cosine which is used in slowfast model. + + Args: + warmup_start_lr (float): start learning rate used in warmup stage. + warmup_epochs (int): the number epochs of warmup. + step_base_lr (float|int, optional): base learning rate in step schedule. + max_epoch (int): total training epochs. + num_iters(int): number iterations of each epoch. + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. + verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . + Returns: + ``CustomWarmupPiecewiseDecay`` instance to schedule learning rate. + """ + + def __init__(self, + warmup_start_lr, + warmup_epochs, + step_base_lr, + lrs, + gamma, + steps, + max_epoch, + num_iters, + last_epoch=0, + verbose=False): + self.warmup_start_lr = warmup_start_lr + self.warmup_epochs = warmup_epochs + self.step_base_lr = step_base_lr + self.lrs = lrs + self.gamma = gamma + self.steps = steps + self.max_epoch = max_epoch + self.num_iters = num_iters + self.last_epoch = last_epoch + self.last_lr = self.warmup_start_lr # used in first iter + self.verbose = verbose + self._var_name = None + + def step(self, epoch=None, rebuild=False): + """ + ``step`` should be called after ``optimizer.step`` . It will update the learning rate in optimizer according to current ``epoch`` . + The new learning rate will take effect on next ``optimizer.step`` . + Args: + epoch (int, None): specify current epoch. Default: None. Auto-increment from last_epoch=-1. + Returns: + None + """ + if epoch is None: + if not rebuild: + self.last_epoch += 1 / self.num_iters # update step with iters + else: + self.last_epoch = epoch + self.last_lr = self.get_lr() + + if self.verbose: + print( + 'step Epoch {}: {} set learning rate to {}.self.num_iters={}, 1/self.num_iters={}' + .format(self.last_epoch, self.__class__.__name__, self.last_lr, + self.num_iters, 1 / self.num_iters)) + + def _lr_func_steps_with_relative_lrs(self, cur_epoch, lrs, base_lr, steps, + max_epoch): + # get step index + steps = steps + [max_epoch] + for ind, step in enumerate(steps): + if cur_epoch < step: + break + if self.verbose: + print( + '_lr_func_steps_with_relative_lrs, cur_epoch {}: {}, steps {}, ind {}, step{}, max_epoch{}' + .format(cur_epoch, self.__class__.__name__, steps, ind, step, + max_epoch)) + + return lrs[ind - 1] * base_lr + + def get_lr(self): + """Define lr policy""" + lr = self._lr_func_steps_with_relative_lrs( + self.last_epoch, + self.lrs, + self.step_base_lr, + self.steps, + self.max_epoch, + ) + lr_end = self._lr_func_steps_with_relative_lrs( + self.warmup_epochs, + self.lrs, + self.step_base_lr, + self.steps, + self.max_epoch, + ) + + # Perform warm up. 
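+        # warmup here is the same linear interpolation, but towards the stepwise lr
+        # evaluated at warmup_epochs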
+ if self.last_epoch < self.warmup_epochs: + lr_start = self.warmup_start_lr + alpha = (lr_end - lr_start) / self.warmup_epochs + lr = self.last_epoch * alpha + lr_start + if self.verbose: + print( + 'get_lr, Epoch {}: {}, lr {}, lr_end {}, self.lrs{}, self.step_base_lr{}, self.steps{}, self.max_epoch{}' + .format(self.last_epoch, self.__class__.__name__, lr, lr_end, + self.lrs, self.step_base_lr, self.steps, + self.max_epoch)) + + return lr + + +class CustomPiecewiseDecay(PiecewiseDecay): + + def __init__(self, **kargs): + kargs.pop('num_iters') + super().__init__(**kargs) + + +class CustomWarmupCosineStepDecay(LRScheduler): + + def __init__(self, + warmup_iters, + warmup_ratio=0.1, + min_lr=0, + base_lr=3e-5, + max_epoch=30, + last_epoch=-1, + num_iters=None, + verbose=False): + + self.warmup_ratio = warmup_ratio + self.min_lr = min_lr + self.warmup_epochs = warmup_iters + self.warmup_iters = warmup_iters * num_iters + self.cnt_iters = 0 + self.cnt_epoch = 0 + self.num_iters = num_iters + self.tot_iters = max_epoch * num_iters + self.max_epoch = max_epoch + self.cosine_base_lr = base_lr # initial lr for all param groups + self.regular_lr = self.get_regular_lr() + super().__init__(last_epoch=last_epoch, verbose=verbose) + + def annealing_cos(self, start, end, factor, weight=1): + cos_out = math.cos(math.pi * factor) + 1 + return end + 0.5 * weight * (start - end) * cos_out + + def get_regular_lr(self): + progress = self.cnt_epoch + max_progress = self.max_epoch + target_lr = self.min_lr + return self.annealing_cos(self.cosine_base_lr, target_lr, progress / + max_progress) # self.cosine_base_lr + + def get_warmup_lr(self, cur_iters): + k = (1 - cur_iters / self.warmup_iters) * (1 - self.warmup_ratio) + warmup_lr = self.regular_lr * (1 - k) # 3e-5 * (1-k) + return warmup_lr + + def step(self, epoch=None): + self.regular_lr = self.get_regular_lr() + self.last_lr = self.get_lr() + self.cnt_epoch = (self.cnt_iters + + 1) // self.num_iters # update step with iters + self.cnt_iters += 1 + + if self.verbose: + print('Epoch {}: {} set learning rate to {}.'.format( + self.last_epoch, self.__class__.__name__, self.last_lr)) + + def get_lr(self): + """Define lr policy""" + cur_iter = self.cnt_iters + if cur_iter >= self.warmup_iters: + return self.regular_lr + else: + warmup_lr = self.get_warmup_lr(cur_iter) + return warmup_lr + + +class CustomWarmupAdjustDecay(LRScheduler): + r""" + We combine warmup and stepwise-cosine which is used in slowfast model. + + Args: + step_base_lr (float): start learning rate used in warmup stage. + warmup_epochs (int): the number epochs of warmup. + lr_decay_rate (float|int, optional): base learning rate decay rate. + step (int): step in change learning rate. + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. + verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . + Returns: + ``CosineAnnealingDecay`` instance to schedule learning rate. 
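+
+        Example (illustrative values only, not taken from a shipped config;
+        ``iter_step: True`` is what makes ``build_lr`` pass ``num_iters``):
+
+            learning_rate:
+                iter_step: True
+                name: 'CustomWarmupAdjustDecay'
+                step_base_lr: 0.01
+                warmup_epochs: 5
+                lr_decay_rate: 0.1
+                boundaries: [10, 20]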
+ """ + + def __init__(self, + step_base_lr, + warmup_epochs, + lr_decay_rate, + boundaries, + num_iters=None, + last_epoch=-1, + verbose=False): + self.step_base_lr = step_base_lr + self.warmup_epochs = warmup_epochs + self.lr_decay_rate = lr_decay_rate + self.boundaries = boundaries + self.num_iters = num_iters + #call step() in base class, last_lr/last_epoch/base_lr will be update + super(CustomWarmupAdjustDecay, self).__init__(last_epoch=last_epoch, + verbose=verbose) + + def step(self, epoch=None): + """ + ``step`` should be called after ``optimizer.step`` . It will update the learning rate in optimizer according to current ``epoch`` . + The new learning rate will take effect on next ``optimizer.step`` . + Args: + epoch (int, None): specify current epoch. Default: None. Auto-increment from last_epoch=-1. + Returns: + None + """ + if epoch is None: + if self.last_epoch == -1: + self.last_epoch += 1 + else: + self.last_epoch += 1 / self.num_iters # update step with iters + else: + self.last_epoch = epoch + + self.last_lr = self.get_lr() + + if self.verbose: + print('Epoch {}: {} set learning rate to {}.'.format( + self.last_epoch, self.__class__.__name__, self.last_lr)) + + def get_lr(self): + if self.last_epoch < self.warmup_epochs: + lr = self.step_base_lr * (self.last_epoch + 1) / self.warmup_epochs + else: + lr = self.step_base_lr * (self.lr_decay_rate**np.sum( + self.last_epoch >= np.array(self.boundaries))) + return lr diff --git a/Bank_second_part/detect_process/paddlevideo/solver/lr.py b/Bank_second_part/detect_process/paddlevideo/solver/lr.py new file mode 100644 index 0000000..3a56fad --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/solver/lr.py @@ -0,0 +1,52 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict + +from paddle.optimizer.lr import LRScheduler + +from . import custom_lr + + +def build_lr(cfg: Dict, num_iters: int) -> LRScheduler: + """Build a learning rate scheduler accroding to ```OPTIMIZER``` configuration, and it always pass into the optimizer. + In configuration: + learning_rate: + name: 'PiecewiseDecay' + boundaries: [20, 60] + values: [0.00025, 0.000025, 0.0000025] + + Args: + cfg (Dict): learning rate configuration. + num_iters (int): The number of iterations that may be used when calculating the learning rate + + Returns: + LRScheduler: learning rate scheduler. 
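+
+        Typical call site (as used by the training entry points in this repo):
+
+            lr = build_lr(cfg.OPTIMIZER.learning_rate, len(train_loader))
+
+        When the config carries ``iter_step: True``, that flag is popped and
+        replaced by ``num_iters`` so per-iteration schedulers know how many
+        steps make up one epoch.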
+ """ + + cfg_copy = cfg.copy() + + #when learning_rate is LRScheduler + if cfg_copy.get('learning_rate') and isinstance(cfg_copy['learning_rate'], + dict): + cfg_copy['learning_rate'] = build_lr( + cfg_copy['learning_rate'], + num_iters) #not support only inner iter_step + + lr_name = cfg_copy.pop('name') + if cfg_copy.get('iter_step'): + cfg_copy['num_iters'] = num_iters + cfg_copy.pop('iter_step') + + return getattr(custom_lr, lr_name)(**cfg_copy) diff --git a/Bank_second_part/detect_process/paddlevideo/solver/optimizer.py b/Bank_second_part/detect_process/paddlevideo/solver/optimizer.py new file mode 100644 index 0000000..46ff916 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/solver/optimizer.py @@ -0,0 +1,132 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from typing import Dict + +import paddle +from paddle.optimizer.lr import LRScheduler +from paddle.regularizer import L1Decay, L2Decay +from paddlevideo.utils import get_logger + + +def build_optimizer(cfg: Dict, + lr_scheduler: LRScheduler, + model: paddle.nn.Layer, + use_amp: bool = False, + amp_level: str = None) -> paddle.optimizer.Optimizer: + """Build an optimizer and learning rate scheduler to optimize parameters accroding to ```OPTIMIZER``` field in configuration. + + In configuration: + OPTIMIZER: + name: Momentum + momentum: 0.9 + weight_decay: 0.001 + or + + OPTIMIZER: + name: Momentum + momentum: 0.9 + weight_decay: + name: "L1" + value: 0.001 + + Momentum optimizer will be applied to optimize network and L1Decay regularizer will be applied to avoid overfit. + + OPTIMIZER: + name: Adam + weight_decay: + name: "L2" + value: 0.001 + + Adam optimizer will be applied to optimize network and L2Decay regularizer will applied to avoid overfit. + + Refer to ```https://www.paddlepaddle.org.cn/documentation/docs/en/develop/api/paddle/regularizer/L2Decay_en.html``` for more details. + + Args: + cfg (Dict): optimizer configuration. + lr_scheduler (LRScheduler): learning rate scheduler. + model (paddle.nn.Layer, optional): model which contains parameters to be optimized. Defaults to None. + use_amp (bool, optional): Whether use amp. Defaults to False. + amp_level (str, optional): amp level when amp is enabled. Defaults to None. + + + Returns: + paddle.optimizer.Optimizer: an optimizer for the input model. + """ + logger = get_logger("paddlevideo") + cfg_copy = cfg.copy() + # NOTE: check none and illegal cfg!!! 
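+    # cfg_copy is consumed destructively below: 'name' selects the class from
+    # paddle.optimizer, 'weight_decay' / 'grad_clip' accept either a bare float
+    # or a {'name': ..., 'value': ...} dict, 'no_weight_decay_name' excludes the
+    # listed parameters from decay (useful for AdamW), and 'learning_rate' is
+    # dropped in favour of the lr_scheduler built by build_lr().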
+ opt_name = cfg_copy.pop('name') + # deal with weight decay + if cfg_copy.get('weight_decay'): + if isinstance(cfg_copy.get('weight_decay'), + float): # just an float factor + cfg_copy['weight_decay'] = cfg_copy.get('weight_decay') + elif 'L1' in cfg_copy.get('weight_decay').get( + 'name').upper(): # specify L2 wd and it's float factor + cfg_copy['weight_decay'] = L1Decay( + cfg_copy.get('weight_decay').get('value')) + elif 'L2' in cfg_copy.get('weight_decay').get( + 'name').upper(): # specify L1 wd and it's float factor + cfg_copy['weight_decay'] = L2Decay( + cfg_copy.get('weight_decay').get('value')) + else: + raise ValueError + + # deal with grad clip + if cfg_copy.get('grad_clip'): + if isinstance(cfg_copy.get('grad_clip'), float): + cfg_copy['grad_clip'] = cfg_copy.get('grad_clip').get('value') + elif 'global' in cfg_copy.get('grad_clip').get('name').lower(): + cfg_copy['grad_clip'] = paddle.nn.ClipGradByGlobalNorm( + cfg_copy.get('grad_clip').get('value')) + else: + raise ValueError + + # Set for optimizers that cannot be applied to l2decay, i.e. AdamW + if cfg_copy.get('no_weight_decay_name'): + no_weight_decay_name = cfg_copy.pop('no_weight_decay_name') + no_weight_decay_name_list = no_weight_decay_name.split(' ') + + # NOTE: use param.name not name + no_weight_decay_param_list = [ + param.name for name, param in model.named_parameters() + if any(key_word in name for key_word in no_weight_decay_name_list) + ] # get the full param name of no weight decay + + _apply_decay_param_fun = lambda name: name not in no_weight_decay_param_list + cfg_copy['apply_decay_param_fun'] = _apply_decay_param_fun + logger.info( + f"No weight Decay list :({len(no_weight_decay_param_list)})", + no_weight_decay_param_list) + + cfg_copy.pop('learning_rate') + + # set multi_precision + optimizer_setting = { + 'learning_rate': lr_scheduler, + 'parameters': model.parameters(), + **cfg_copy + } + optimizer_init_args = inspect.getargspec( + getattr(paddle.optimizer, opt_name).__init__).args + if use_amp and amp_level == "O2" and "multi_precision" in optimizer_init_args: + # support "multi_precision" arg in optimizer's __init__ function. + optimizer_setting.update({"multi_precision": True}) + logger.info( + "Set multi_precision=True for optimizer when use_amp=True and amp_level='O2'" + ) + + return getattr(paddle.optimizer, opt_name)(**optimizer_setting) diff --git a/Bank_second_part/detect_process/paddlevideo/tasks/__init__.py b/Bank_second_part/detect_process/paddlevideo/tasks/__init__.py new file mode 100644 index 0000000..4d43f09 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/tasks/__init__.py @@ -0,0 +1,20 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
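+
+# Task-level entry points: each function re-exported below is a self-contained
+# training or testing loop, typically driven by a launcher script that passes in
+# a parsed config (see paddlevideo.utils.get_config).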
+ +from .train import train_model +from .test import test_model +from .train_dali import train_dali +from .train_multigrid import train_model_multigrid + +__all__ = ['train_model', 'test_model', 'train_dali', 'train_model_multigrid'] diff --git a/Bank_second_part/detect_process/paddlevideo/tasks/test.py b/Bank_second_part/detect_process/paddlevideo/tasks/test.py new file mode 100644 index 0000000..31c8653 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/tasks/test.py @@ -0,0 +1,90 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddlevideo.utils import get_logger, load + +from ..loader.builder import build_dataloader, build_dataset +from ..metrics import build_metric +from ..modeling.builder import build_model + +logger = get_logger("paddlevideo") + + +@paddle.no_grad() +def test_model(cfg, weights, parallel=True): + """Test model entry + + Args: + cfg (dict): configuration. + weights (str): weights path to load. + parallel (bool): Whether to do multi-cards testing. Default: True. + + """ + + if cfg.get('use_npu', False): + places = paddle.set_device('npu') + elif cfg.get('use_xpu', False): + places = paddle.set_device('xpu') + else: + places = paddle.set_device('gpu') + + # 1. Construct model. + if cfg.MODEL.get('backbone') and cfg.MODEL.backbone.get('pretrained'): + cfg.MODEL.backbone.pretrained = '' # disable pretrain model init + model = build_model(cfg.MODEL) + + if parallel: + model = paddle.DataParallel(model) + + # 2. Construct dataset and dataloader. 
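+    #    test_mode=True switches the dataset/pipeline into evaluation behaviour
+    #    (deterministic sampling); CFBI is special-cased below because the VOS
+    #    task iterates the dataset per video instead of going through a dataloader.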
+ cfg.DATASET.test.test_mode = True + dataset = build_dataset((cfg.DATASET.test, cfg.PIPELINE.test)) + batch_size = cfg.DATASET.get("test_batch_size", 8) + + # default num worker: 0, which means no subprocess will be created + num_workers = cfg.DATASET.get('num_workers', 0) + num_workers = cfg.DATASET.get('test_num_workers', num_workers) + dataloader_setting = dict(batch_size=batch_size, + num_workers=num_workers, + places=places, + drop_last=False, + shuffle=False) + + data_loader = build_dataloader( + dataset, **dataloader_setting) if cfg.model_name not in ['CFBI' + ] else dataset + + model.eval() + + state_dicts = load(weights) + model.set_state_dict(state_dicts) + + # add params to metrics + cfg.METRIC.data_size = len(dataset) + cfg.METRIC.batch_size = batch_size + Metric = build_metric(cfg.METRIC) + + if cfg.MODEL.framework == "FastRCNN": + Metric.set_dataset_info(dataset.info, len(dataset)) + + for batch_id, data in enumerate(data_loader): + if cfg.model_name in [ + 'CFBI' + ]: # for VOS task, dataset for video and dataloader for frames in each video + Metric.update(batch_id, data, model) + else: + outputs = model(data, mode='test') + Metric.update(batch_id, data, outputs) + Metric.accumulate() diff --git a/Bank_second_part/detect_process/paddlevideo/tasks/train.py b/Bank_second_part/detect_process/paddlevideo/tasks/train.py new file mode 100644 index 0000000..451ec5d --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/tasks/train.py @@ -0,0 +1,426 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os.path as osp +import time + +import paddle +import paddle.amp as amp +import paddle.distributed as dist +import paddle.distributed.fleet as fleet +from paddlevideo.utils import (add_profiler_step, build_record, get_logger, + load, log_batch, log_epoch, mkdir, save) + +from ..loader.builder import build_dataloader, build_dataset +from ..metrics.ava_utils import collect_results_cpu +from ..modeling.builder import build_model +from ..solver import build_lr, build_optimizer +from ..utils import do_preciseBN + + +def train_model(cfg, + weights=None, + parallel=True, + validate=True, + use_amp=False, + amp_level=None, + max_iters=None, + use_fleet=False, + profiler_options=None): + """Train model entry + + Args: + cfg (dict): configuration. + weights (str, optional): weights path for finetuning. Defaults to None. + parallel (bool, optional): whether multi-cards training. Defaults to True. + validate (bool, optional): whether to do evaluation. Defaults to True. + use_amp (bool, optional): whether to use automatic mixed precision during training. Defaults to False. + amp_level (str, optional): amp optmization level, must be 'O1' or 'O2' when use_amp is True. Defaults to None. + max_iters (int, optional): max running iters in an epoch. Defaults to None. + use_fleet (bool, optional): whether to use fleet. Defaults to False. + profiler_options (str, optional): configuration for the profiler function. Defaults to None. 
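+
+        Minimal usage (sketch; config loading and any distributed launch wiring
+        are assumed to be handled by the caller):
+
+            cfg = get_config('path/to/config.yaml')  # hypothetical path
+            train_model(cfg, validate=True)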
+ + """ + if use_fleet: + fleet.init(is_collective=True) + + logger = get_logger("paddlevideo") + batch_size = cfg.DATASET.get('batch_size', 8) + valid_batch_size = cfg.DATASET.get('valid_batch_size', batch_size) + + # gradient accumulation settings + use_gradient_accumulation = cfg.get('GRADIENT_ACCUMULATION', None) + if use_gradient_accumulation and dist.get_world_size() >= 1: + global_batch_size = cfg.GRADIENT_ACCUMULATION.get( + 'global_batch_size', None) + num_gpus = dist.get_world_size() + + assert isinstance( + global_batch_size, int + ), f"global_batch_size must be int, but got {type(global_batch_size)}" + assert batch_size <= global_batch_size, \ + f"global_batch_size({global_batch_size}) must not be less than batch_size({batch_size})" + + cur_global_batch_size = batch_size * num_gpus # The number of batches calculated by all GPUs at one time + assert global_batch_size % cur_global_batch_size == 0, \ + f"The global batchsize({global_batch_size}) must be divisible by cur_global_batch_size({cur_global_batch_size})" + cfg.GRADIENT_ACCUMULATION[ + "num_iters"] = global_batch_size // cur_global_batch_size + # The number of iterations required to reach the global batchsize + logger.info( + f"Using gradient accumulation training strategy, " + f"global_batch_size={global_batch_size}, " + f"num_gpus={num_gpus}, " + f"num_accumulative_iters={cfg.GRADIENT_ACCUMULATION.num_iters}") + + if cfg.get('use_npu', False): + places = paddle.set_device('npu') + elif cfg.get('use_xpu', False): + places = paddle.set_device('xpu') + else: + places = paddle.set_device('gpu') + + # default num worker: 0, which means no subprocess will be created + num_workers = cfg.DATASET.get('num_workers', 0) + valid_num_workers = cfg.DATASET.get('valid_num_workers', num_workers) + model_name = cfg.model_name + output_dir = cfg.get("output_dir", f"./output/{model_name}") + mkdir(output_dir) + + # 1. Construct model + model = build_model(cfg.MODEL) + + if cfg.get('to_static', False): + specs = None + model = paddle.jit.to_static(model, input_spec=specs) + logger.info( + "Successfully to apply @to_static with specs: {}".format(specs)) + + # 2. Construct dataset and dataloader for training and evaluation + train_dataset = build_dataset((cfg.DATASET.train, cfg.PIPELINE.train)) + train_dataloader_setting = dict( + batch_size=batch_size, + num_workers=num_workers, + collate_fn_cfg=cfg.get('MIX', None), + places=places) + train_loader = build_dataloader(train_dataset, **train_dataloader_setting) + + if validate: + valid_dataset = build_dataset((cfg.DATASET.valid, cfg.PIPELINE.valid)) + validate_dataloader_setting = dict( + batch_size=valid_batch_size, + num_workers=valid_num_workers, + places=places, + drop_last=False, + shuffle=cfg.DATASET.get( + 'shuffle_valid', + False) # NOTE: attention_LSTM needs to shuffle valid data. + ) + valid_loader = build_dataloader(valid_dataset, + **validate_dataloader_setting) + + # 3. Construct learning rate scheduler(lr) and optimizer + lr = build_lr(cfg.OPTIMIZER.learning_rate, len(train_loader)) + optimizer = build_optimizer( + cfg.OPTIMIZER, lr, model=model, use_amp=use_amp, amp_level=amp_level) + + # 4. 
Construct scalar and convert parameters for amp(optional) + if use_amp: + scaler = amp.GradScaler( + init_loss_scaling=2.0**16, + incr_every_n_steps=2000, + decr_every_n_nan_or_inf=1) + # convert model parameters to fp16 when amp_level is O2(pure fp16) + model, optimizer = amp.decorate( + models=model, + optimizers=optimizer, + level=amp_level, + master_weight=True, + save_dtype=None) + # NOTE: save_dtype is set to float32 now. + logger.info(f"Training in amp mode, amp_level={amp_level}.") + else: + assert amp_level is None, f"amp_level must be None when training in fp32 mode, but got {amp_level}." + logger.info("Training in fp32 mode.") + + # 5. Resume(optional) + resume_epoch = cfg.get("resume_epoch", 0) + if resume_epoch: + filename = osp.join(output_dir, + model_name + f"_epoch_{resume_epoch:05d}") + resume_model_dict = load(filename + '.pdparams') + resume_opt_dict = load(filename + '.pdopt') + model.set_state_dict(resume_model_dict) + optimizer.set_state_dict(resume_opt_dict) + logger.info("Resume from checkpoint: {}".format(filename)) + + # 6. Finetune(optional) + if weights: + assert resume_epoch == 0, f"Conflict occurs when finetuning, please switch resume function off by setting resume_epoch to 0 or not indicating it." + model_dict = load(weights) + model.set_state_dict(model_dict) + logger.info("Finetune from checkpoint: {}".format(weights)) + + # 7. Parallelize(optional) + if parallel: + model = paddle.DataParallel(model) + + if use_fleet: + model = fleet.distributed_model(model) + optimizer = fleet.distributed_optimizer(optimizer) + + # 8. Train Model + best = 0.0 + for epoch in range(0, cfg.epochs): + if epoch < resume_epoch: + logger.info( + f"| epoch: [{epoch + 1}] <= resume_epoch: [{resume_epoch}], continue..." + ) + continue + model.train() + + record_list = build_record(cfg.MODEL) + tic = time.time() + for i, data in enumerate(train_loader): + """Next two line of code only used in test_tipc, + ignore it most of the time""" + if max_iters is not None and i >= max_iters: + break + + record_list['reader_time'].update(time.time() - tic) + + # Collect performance information when profiler_options is activate + add_profiler_step(profiler_options) + + # 8.1 forward + # AMP # + if use_amp: + with amp.auto_cast( + custom_black_list={"reduce_mean", "conv3d"}, + level=amp_level): + outputs = model(data, mode='train') + avg_loss = outputs['loss'] + if use_gradient_accumulation: + # clear grad at when epoch begins + if i == 0: + optimizer.clear_grad() + # Loss normalization + avg_loss /= cfg.GRADIENT_ACCUMULATION.num_iters + # Loss scaling + scaled = scaler.scale(avg_loss) + # 8.2 backward + scaled.backward() + # 8.3 minimize + if (i + 1) % cfg.GRADIENT_ACCUMULATION.num_iters == 0: + scaler.minimize(optimizer, scaled) + optimizer.clear_grad() + else: # general case + # Loss scaling + scaled = scaler.scale(avg_loss) + # 8.2 backward + scaled.backward() + # 8.3 minimize + scaler.minimize(optimizer, scaled) + optimizer.clear_grad() + else: + outputs = model(data, mode='train') + avg_loss = outputs['loss'] + if use_gradient_accumulation: + # clear grad at when epoch begins + if i == 0: + optimizer.clear_grad() + # Loss normalization + avg_loss /= cfg.GRADIENT_ACCUMULATION.num_iters + # 8.2 backward + avg_loss.backward() + # 8.3 minimize + if (i + 1) % cfg.GRADIENT_ACCUMULATION.num_iters == 0: + optimizer.step() + optimizer.clear_grad() + else: # general case + # 8.2 backward + avg_loss.backward() + # 8.3 minimize + optimizer.step() + optimizer.clear_grad() + + # log record + 
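+            # optimizer.get_lr() is the lr actually used for this step; the
+            # iter-level lr.step() further down advances it for the next batch.
+            # 'train'-mode outputs form a dict (loss plus metrics) and only keys
+            # already present in record_list are accumulated.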
record_list['lr'].update(optimizer.get_lr(), batch_size) + for name, value in outputs.items(): + if name in record_list: + record_list[name].update(value, batch_size) + + record_list['batch_time'].update(time.time() - tic) + tic = time.time() + + if i % cfg.get("log_interval", 10) == 0: + ips = "ips: {:.5f} instance/sec,".format( + batch_size / record_list["batch_time"].val) + cur_progress = ((i + 1) + epoch * len(train_loader)) / ( + len(train_loader) * cfg.epochs) + eta = int(record_list["batch_time"].sum * + (1 - cur_progress) / cur_progress + 0.5) + log_batch(record_list, i, epoch + 1, cfg.epochs, "train", ips, + eta) + + # learning rate iter step + if cfg.OPTIMIZER.learning_rate.get("iter_step"): + lr.step() + + # learning rate epoch step + if not cfg.OPTIMIZER.learning_rate.get("iter_step"): + lr.step() + + ips = "avg_ips: {:.5f} instance/sec.".format( + batch_size * record_list["batch_time"].count / + record_list["batch_time"].sum) + log_epoch(record_list, epoch + 1, "train", ips) + + def evaluate(best): + model.eval() + results = [] + record_list = build_record(cfg.MODEL) + record_list.pop('lr') + tic = time.time() + if parallel: + rank = dist.get_rank() + # single_gpu_test and multi_gpu_test + for i, data in enumerate(valid_loader): + """Next two line of code only used in test_tipc, + ignore it most of the time""" + if max_iters is not None and i >= max_iters: + break + + if use_amp: + with amp.auto_cast( + custom_black_list={"reduce_mean", "conv3d"}, + level=amp_level): + outputs = model(data, mode='valid') + else: + outputs = model(data, mode='valid') + + if cfg.MODEL.framework == "FastRCNN": + results.extend(outputs) + + # log_record + if cfg.MODEL.framework != "FastRCNN": + for name, value in outputs.items(): + if name in record_list: + record_list[name].update(value, batch_size) + + record_list['batch_time'].update(time.time() - tic) + tic = time.time() + + if i % cfg.get("log_interval", 10) == 0: + ips = "ips: {:.5f} instance/sec.".format( + valid_batch_size / record_list["batch_time"].val) + log_batch(record_list, i, epoch + 1, cfg.epochs, "val", ips) + + if cfg.MODEL.framework == "FastRCNN": + if parallel: + results = collect_results_cpu(results, len(valid_dataset)) + if not parallel or (parallel and rank == 0): + eval_res = valid_dataset.evaluate(results) + for name, value in eval_res.items(): + record_list[name].update(value, valid_batch_size) + + ips = "avg_ips: {:.5f} instance/sec.".format( + valid_batch_size * record_list["batch_time"].count / + record_list["batch_time"].sum) + log_epoch(record_list, epoch + 1, "val", ips) + + best_flag = False + if cfg.MODEL.framework == "FastRCNN" and (not parallel or + (parallel and rank == 0)): + if record_list["mAP@0.5IOU"].val > best: + best = record_list["mAP@0.5IOU"].val + best_flag = True + return best, best_flag + + if cfg.MODEL.framework == "YOWOLocalizer" and (not parallel or + (parallel and rank == 0)): + if record_list["fscore"].avg > best: + best = record_list["fscore"].avg + best_flag = True + return best, best_flag + + # forbest2, cfg.MODEL.framework != "FastRCNN": + for top_flag in ['hit_at_one', 'top1', 'rmse', "F1@0.50"]: + if record_list.get(top_flag): + if top_flag != 'rmse' and record_list[top_flag].avg > best: + best = record_list[top_flag].avg + best_flag = True + elif top_flag == 'rmse' and ( + best == 0.0 or record_list[top_flag].avg < best): + best = record_list[top_flag].avg + best_flag = True + + return best, best_flag + + # use precise bn to improve acc + if cfg.get("PRECISEBN") and ( + epoch % 
cfg.PRECISEBN.preciseBN_interval == 0 + or epoch == cfg.epochs - 1): + do_preciseBN(model, train_loader, parallel, + min(cfg.PRECISEBN.num_iters_preciseBN, + len(train_loader)), use_amp, amp_level) + + # 9. Validation + if validate and (epoch % cfg.get("val_interval", 1) == 0 + or epoch == cfg.epochs - 1): + with paddle.no_grad(): + best, save_best_flag = evaluate(best) + # save best + if save_best_flag: + save(optimizer.state_dict(), + osp.join(output_dir, model_name + "_best.pdopt")) + save_student_model_flag = True if "Distillation" in cfg.MODEL.framework else False + save( + model.state_dict(), + osp.join(output_dir, model_name + "_best.pdparams"), + save_student_model=save_student_model_flag) + if model_name == "AttentionLstm": + logger.info( + f"Already save the best model (hit_at_one){best}") + elif cfg.MODEL.framework == "FastRCNN": + logger.info( + f"Already save the best model (mAP@0.5IOU){int(best * 10000) / 10000}" + ) + elif cfg.MODEL.framework == "DepthEstimator": + logger.info( + f"Already save the best model (rmse){int(best * 10000) / 10000}" + ) + elif cfg.MODEL.framework in ['MSTCN', 'ASRF']: + logger.info( + f"Already save the best model (F1@0.50){int(best * 10000) / 10000}" + ) + elif cfg.MODEL.framework in ['YOWOLocalizer']: + logger.info( + f"Already save the best model (fsocre){int(best * 10000) / 10000}" + ) + else: + logger.info( + f"Already save the best model (top1 acc){int(best * 10000) / 10000}" + ) + + # 10. Save model and optimizer + if epoch % cfg.get("save_interval", 1) == 0 or epoch == cfg.epochs - 1: + save(optimizer.state_dict(), + osp.join(output_dir, + model_name + f"_epoch_{epoch + 1:05d}.pdopt")) + save(model.state_dict(), + osp.join(output_dir, + model_name + f"_epoch_{epoch + 1:05d}.pdparams")) + + logger.info(f'training {model_name} finished') diff --git a/Bank_second_part/detect_process/paddlevideo/tasks/train_dali.py b/Bank_second_part/detect_process/paddlevideo/tasks/train_dali.py new file mode 100644 index 0000000..8dd0a20 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/tasks/train_dali.py @@ -0,0 +1,143 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import time +import os.path as osp + +import paddle +from ..modeling.builder import build_model +from ..solver import build_lr, build_optimizer +from ..utils import do_preciseBN +from paddlevideo.utils import get_logger, coloring +from paddlevideo.utils import (AverageMeter, build_record, log_batch, log_epoch, + save, load, mkdir) +from paddlevideo.loader import TSN_Dali_loader, get_input_data +""" +We only supported DALI training for TSN model now. +""" + + +def train_dali(cfg, weights=None, parallel=True): + """Train model entry + + Args: + cfg (dict): configuration. + weights (str): weights path for finetuning. + parallel (bool): Whether multi-cards training. Default: True. 
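+
+        Note: as the module docstring above says, only TSN is currently wired up
+        for DALI, and ``cfg`` is expected to provide a ``DALI_LOADER`` section
+        (batch size, reader settings, etc.) rather than the usual DATASET/PIPELINE ones.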
+ + """ + + logger = get_logger("paddlevideo") + batch_size = cfg.DALI_LOADER.get('batch_size', 8) + places = paddle.set_device('gpu') + model_name = cfg.model_name + output_dir = cfg.get("output_dir", f"./output/{model_name}") + mkdir(output_dir) + + # 1. Construct model + model = build_model(cfg.MODEL) + if parallel: + model = paddle.DataParallel(model) + + # 2. Construct dali dataloader + train_loader = TSN_Dali_loader(cfg.DALI_LOADER).build_dali_reader() + + # 3. Construct solver. + lr = build_lr(cfg.OPTIMIZER.learning_rate, None) + optimizer = build_optimizer(cfg.OPTIMIZER, lr, model=model) + + # Resume + resume_epoch = cfg.get("resume_epoch", 0) + if resume_epoch: + filename = osp.join(output_dir, + model_name + f"_epoch_{resume_epoch:05d}") + resume_model_dict = load(filename + '.pdparams') + resume_opt_dict = load(filename + '.pdopt') + model.set_state_dict(resume_model_dict) + optimizer.set_state_dict(resume_opt_dict) + + # Finetune: + if weights: + assert resume_epoch == 0, f"Conflict occurs when finetuning, please switch resume function off by setting resume_epoch to 0 or not indicating it." + model_dict = load(weights) + model.set_state_dict(model_dict) + + # 4. Train Model + for epoch in range(0, cfg.epochs): + if epoch < resume_epoch: + logger.info( + f"| epoch: [{epoch+1}] <= resume_epoch: [{ resume_epoch}], continue... " + ) + continue + model.train() + record_list = build_record(cfg.MODEL) + tic = time.time() + for i, data in enumerate(train_loader): + data = get_input_data(data) + record_list['reader_time'].update(time.time() - tic) + # 4.1 forward + outputs = model(data, mode='train') + # 4.2 backward + avg_loss = outputs['loss'] + avg_loss.backward() + # 4.3 minimize + optimizer.step() + optimizer.clear_grad() + + # log record + record_list['lr'].update(optimizer._global_learning_rate(), + batch_size) + for name, value in outputs.items(): + record_list[name].update(value, batch_size) + + record_list['batch_time'].update(time.time() - tic) + tic = time.time() + + if i % cfg.get("log_interval", 10) == 0: + ips = "ips: {:.5f} instance/sec.".format( + batch_size / record_list["batch_time"].val) + log_batch(record_list, i, epoch + 1, cfg.epochs, "train", ips) + + # learning rate iter step + if cfg.OPTIMIZER.learning_rate.get("iter_step"): + lr.step() + + # learning rate epoch step + if not cfg.OPTIMIZER.learning_rate.get("iter_step"): + lr.step() + + ips = "ips: {:.5f} instance/sec.".format( + batch_size * record_list["batch_time"].count / + record_list["batch_time"].sum) + log_epoch(record_list, epoch + 1, "train", ips) + + # use precise bn to improve acc + if cfg.get("PRECISEBN") and (epoch % cfg.PRECISEBN.preciseBN_interval + == 0 or epoch == cfg.epochs - 1): + do_preciseBN( + model, train_loader, parallel, + min(cfg.PRECISEBN.num_iters_preciseBN, len(train_loader))) + + # 5. Save model and optimizer + if epoch % cfg.get("save_interval", 1) == 0 or epoch == cfg.epochs - 1: + save( + optimizer.state_dict(), + osp.join(output_dir, + model_name + f"_epoch_{epoch+1:05d}.pdopt")) + save( + model.state_dict(), + osp.join(output_dir, + model_name + f"_epoch_{epoch+1:05d}.pdparams")) + + logger.info(f'training {model_name} finished') diff --git a/Bank_second_part/detect_process/paddlevideo/tasks/train_multigrid.py b/Bank_second_part/detect_process/paddlevideo/tasks/train_multigrid.py new file mode 100644 index 0000000..19e756f --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/tasks/train_multigrid.py @@ -0,0 +1,335 @@ +# copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import time +import os.path as osp + +import paddle +import paddle.distributed as dist + +from ..loader.builder import build_dataloader, build_dataset +from ..modeling.builder import build_model +from ..solver import build_lr, build_optimizer +from ..utils import do_preciseBN +from paddlevideo.utils import get_logger, coloring +from paddlevideo.utils import (AverageMeter, build_record, log_batch, log_epoch, + save, load, mkdir) +from paddlevideo.utils.multigrid import MultigridSchedule, aggregate_sub_bn_stats, subn_load, subn_save, is_eval_epoch + + +def construct_loader(cfg, places, validate, precise_bn, num_iters_precise_bn, + world_size): + batch_size = cfg.DATASET.get('batch_size', 2) + train_dataset = build_dataset((cfg.DATASET.train, cfg.PIPELINE.train)) + precise_bn_dataloader_setting = dict( + batch_size=batch_size, + num_workers=cfg.DATASET.get('num_workers', 0), + places=places, + ) + if precise_bn: + cfg.DATASET.train.num_samples_precise_bn = num_iters_precise_bn * batch_size * world_size + precise_bn_dataset = build_dataset((cfg.DATASET.train, + cfg.PIPELINE.train)) + precise_bn_loader = build_dataloader(precise_bn_dataset, + **precise_bn_dataloader_setting) + cfg.DATASET.train.num_samples_precise_bn = None + else: + precise_bn_loader = None + + if cfg.MULTIGRID.SHORT_CYCLE: + # get batch size list in short cycle schedule + bs_factor = [ + int( + round((float(cfg.PIPELINE.train.transform[1]['MultiCrop'][ + 'target_size']) / (s * cfg.MULTIGRID.default_crop_size)) + **2)) for s in cfg.MULTIGRID.short_cycle_factors + ] + batch_sizes = [ + batch_size * bs_factor[0], + batch_size * bs_factor[1], + batch_size, + ] + train_dataloader_setting = dict( + batch_size=batch_sizes, + multigrid=True, + num_workers=cfg.DATASET.get('num_workers', 0), + places=places, + ) + else: + train_dataloader_setting = precise_bn_dataloader_setting + + train_loader = build_dataloader(train_dataset, **train_dataloader_setting) + if validate: + valid_dataset = build_dataset((cfg.DATASET.valid, cfg.PIPELINE.valid)) + validate_dataloader_setting = dict( + batch_size=batch_size, + num_workers=cfg.DATASET.get('num_workers', 0), + places=places, + drop_last=False, + shuffle=False) + valid_loader = build_dataloader(valid_dataset, + **validate_dataloader_setting) + else: + valid_loader = None + + return train_loader, valid_loader, precise_bn_loader + + +def build_trainer(cfg, places, parallel, validate, precise_bn, + num_iters_precise_bn, world_size): + """ + Build training model and its associated tools, including optimizer, + dataloaders and meters. + Args: + cfg (CfgNode): configs. + Returns: + model: training model. + optimizer: optimizer. + train_loader: training data loader. + val_loader: validatoin data loader. + precise_bn_loader: training data loader for computing + precise BN. 
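+        Note:
+            Called from train_model_multigrid whenever the long-cycle schedule
+            changes the input scale / batch size, so model, optimizer and all
+            loaders are rebuilt to match the new shapes.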
+ """ + model = build_model(cfg.MODEL) + if parallel: + model = paddle.DataParallel(model) + + train_loader, valid_loader, precise_bn_loader = \ + construct_loader(cfg, + places, + validate, + precise_bn, + num_iters_precise_bn, + world_size, + ) + + lr = build_lr(cfg.OPTIMIZER.learning_rate, len(train_loader)) + optimizer = build_optimizer(cfg.OPTIMIZER, lr, model=model) + + return ( + model, + lr, + optimizer, + train_loader, + valid_loader, + precise_bn_loader, + ) + + +def train_model_multigrid(cfg, world_size=1, validate=True): + """Train model entry + + Args: + cfg (dict): configuration. + parallel (bool): Whether multi-card training. Default: True + validate (bool): Whether to do evaluation. Default: False. + + """ + # Init multigrid. + multigrid = None + if cfg.MULTIGRID.LONG_CYCLE or cfg.MULTIGRID.SHORT_CYCLE: + multigrid = MultigridSchedule() + cfg = multigrid.init_multigrid(cfg) + if cfg.MULTIGRID.LONG_CYCLE: + cfg, _ = multigrid.update_long_cycle(cfg, cur_epoch=0) + multi_save_epoch = [i[-1] - 1 for i in multigrid.schedule] + + parallel = world_size != 1 + logger = get_logger("paddlevideo") + batch_size = cfg.DATASET.get('batch_size', 2) + + if cfg.get('use_npu', False): + places = paddle.set_device('npu') + elif cfg.get('use_xpu', False): + places = paddle.set_device('xpu') + else: + places = paddle.set_device('gpu') + + model_name = cfg.model_name + output_dir = cfg.get("output_dir", f"./output/{model_name}") + mkdir(output_dir) + local_rank = dist.ParallelEnv().local_rank + precise_bn = cfg.get("PRECISEBN") + num_iters_precise_bn = cfg.PRECISEBN.num_iters_preciseBN + + # 1. Construct model + model = build_model(cfg.MODEL) + if parallel: + model = paddle.DataParallel(model) + + # 2. Construct dataloader + train_loader, valid_loader, precise_bn_loader = \ + construct_loader(cfg, + places, + validate, + precise_bn, + num_iters_precise_bn, + world_size, + ) + + # 3. Construct optimizer + lr = build_lr(cfg.OPTIMIZER.learning_rate, len(train_loader)) + optimizer = build_optimizer( + cfg.OPTIMIZER, lr, parameter_list=model.parameters()) + + # Resume + resume_epoch = cfg.get("resume_epoch", 0) + if resume_epoch: + filename = osp.join( + output_dir, + model_name + str(local_rank) + '_' + f"{resume_epoch:05d}") + subn_load(model, filename, optimizer) + + # 4. Train Model + best = 0. + total_epochs = int(cfg.epochs * cfg.MULTIGRID.epoch_factor) + for epoch in range(total_epochs): + if epoch < resume_epoch: + logger.info( + f"| epoch: [{epoch+1}] <= resume_epoch: [{ resume_epoch}], continue... 
" + ) + continue + + if cfg.MULTIGRID.LONG_CYCLE: + cfg, changed = multigrid.update_long_cycle(cfg, epoch) + if changed: + logger.info("====== Rebuild model/optimizer/loader =====") + ( + model, + lr, + optimizer, + train_loader, + valid_loader, + precise_bn_loader, + ) = build_trainer(cfg, places, parallel, validate, precise_bn, + num_iters_precise_bn, world_size) + + #load checkpoint after re-build model + if epoch != 0: + #epoch no need to -1, haved add 1 when save + filename = osp.join( + output_dir, + model_name + str(local_rank) + '_' + f"{(epoch):05d}") + subn_load(model, filename, optimizer) + #update lr last epoch, not to use saved params + lr.last_epoch = epoch + lr.step(rebuild=True) + + model.train() + record_list = build_record(cfg.MODEL) + tic = time.time() + for i, data in enumerate(train_loader): + record_list['reader_time'].update(time.time() - tic) + # 4.1 forward + outputs = model(data, mode='train') + # 4.2 backward + avg_loss = outputs['loss'] + avg_loss.backward() + # 4.3 minimize + optimizer.step() + optimizer.clear_grad() + + # log record + record_list['lr'].update( + float(optimizer._global_learning_rate()), batch_size) + for name, value in outputs.items(): + record_list[name].update(float(value), batch_size) + record_list['batch_time'].update(time.time() - tic) + tic = time.time() + + if i % cfg.get("log_interval", 10) == 0: + ips = "ips: {:.5f} instance/sec.".format( + batch_size / record_list["batch_time"].val) + log_batch(record_list, i, epoch + 1, total_epochs, "train", ips) + + # learning rate iter step + if cfg.OPTIMIZER.learning_rate.get("iter_step"): + lr.step() + + # learning rate epoch step + if not cfg.OPTIMIZER.learning_rate.get("iter_step"): + lr.step() + + ips = "ips: {:.5f} instance/sec.".format( + batch_size * record_list["batch_time"].count / + record_list["batch_time"].sum) + log_epoch(record_list, epoch + 1, "train", ips) + + def evaluate(best): + model.eval() + record_list = build_record(cfg.MODEL) + record_list.pop('lr') + tic = time.time() + for i, data in enumerate(valid_loader): + outputs = model(data, mode='valid') + + # log_record + for name, value in outputs.items(): + record_list[name].update(float(value), batch_size) + + record_list['batch_time'].update(time.time() - tic) + tic = time.time() + + if i % cfg.get("log_interval", 10) == 0: + ips = "ips: {:.5f} instance/sec.".format( + batch_size / record_list["batch_time"].val) + log_batch(record_list, i, epoch + 1, total_epochs, "val", + ips) + + ips = "ips: {:.5f} instance/sec.".format( + batch_size * record_list["batch_time"].count / + record_list["batch_time"].sum) + log_epoch(record_list, epoch + 1, "val", ips) + + best_flag = False + if record_list.get('top1') and record_list['top1'].avg > best: + best = record_list['top1'].avg + best_flag = True + return best, best_flag + + # use precise bn to improve acc + if is_eval_epoch(cfg, epoch, total_epochs, multigrid.schedule): + logger.info(f"do precise BN in {epoch+1} ...") + do_preciseBN(model, precise_bn_loader, parallel, + min(num_iters_precise_bn, len(precise_bn_loader))) + + # aggregate sub_BN stats + logger.info("Aggregate sub_BatchNorm stats...") + aggregate_sub_bn_stats(model) + + # 5. 
Validation + if is_eval_epoch(cfg, epoch, total_epochs, multigrid.schedule): + logger.info(f"eval in {epoch+1} ...") + with paddle.no_grad(): + best, save_best_flag = evaluate(best) + # save best + if save_best_flag: + save(optimizer.state_dict(), + osp.join(output_dir, model_name + "_best.pdopt")) + save(model.state_dict(), + osp.join(output_dir, model_name + "_best.pdparams")) + logger.info( + f"Already save the best model (top1 acc){int(best * 10000) / 10000}" + ) + + # 6. Save model and optimizer + if is_eval_epoch( + cfg, epoch, + total_epochs, multigrid.schedule) or epoch % cfg.get( + "save_interval", 10) == 0 or epoch in multi_save_epoch: + logger.info("[Save parameters] ======") + subn_save(output_dir, model_name + str(local_rank) + '_', epoch + 1, + model, optimizer) + + logger.info(f'training {model_name} finished') diff --git a/Bank_second_part/detect_process/paddlevideo/utils/__init__.py b/Bank_second_part/detect_process/paddlevideo/utils/__init__.py new file mode 100644 index 0000000..d18561d --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/utils/__init__.py @@ -0,0 +1,24 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .registry import Registry +from .build_utils import build +from .config import * +from .logger import setup_logger, coloring, get_logger +from .record import AverageMeter, build_record, log_batch, log_epoch +from .dist_utils import get_dist_info, main_only +from .save_load import save, load, load_ckpt, mkdir +from .precise_bn import do_preciseBN +from .profiler import add_profiler_step +__all__ = ['Registry', 'build'] diff --git a/Bank_second_part/detect_process/paddlevideo/utils/__pycache__/__init__.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/utils/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..64ae223 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/utils/__pycache__/__init__.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/utils/__pycache__/build_utils.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/utils/__pycache__/build_utils.cpython-310.pyc new file mode 100644 index 0000000..273693d Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/utils/__pycache__/build_utils.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/utils/__pycache__/config.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/utils/__pycache__/config.cpython-310.pyc new file mode 100644 index 0000000..7d3a10e Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/utils/__pycache__/config.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/utils/__pycache__/dist_utils.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/utils/__pycache__/dist_utils.cpython-310.pyc new file mode 100644 index 0000000..c43503f Binary files /dev/null and 
b/Bank_second_part/detect_process/paddlevideo/utils/__pycache__/dist_utils.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/utils/__pycache__/logger.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/utils/__pycache__/logger.cpython-310.pyc new file mode 100644 index 0000000..30719bd Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/utils/__pycache__/logger.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/utils/__pycache__/precise_bn.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/utils/__pycache__/precise_bn.cpython-310.pyc new file mode 100644 index 0000000..79ae684 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/utils/__pycache__/precise_bn.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/utils/__pycache__/profiler.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/utils/__pycache__/profiler.cpython-310.pyc new file mode 100644 index 0000000..1287287 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/utils/__pycache__/profiler.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/utils/__pycache__/record.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/utils/__pycache__/record.cpython-310.pyc new file mode 100644 index 0000000..cc8857c Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/utils/__pycache__/record.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/utils/__pycache__/registry.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/utils/__pycache__/registry.cpython-310.pyc new file mode 100644 index 0000000..56450c3 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/utils/__pycache__/registry.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/utils/__pycache__/save_load.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/utils/__pycache__/save_load.cpython-310.pyc new file mode 100644 index 0000000..62b1269 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/utils/__pycache__/save_load.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/utils/build_utils.py b/Bank_second_part/detect_process/paddlevideo/utils/build_utils.py new file mode 100644 index 0000000..73c0ca4 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/utils/build_utils.py @@ -0,0 +1,35 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def build(cfg, registry, key='name'): + """Build a module from config dict. + Args: + cfg (dict): Config dict. It should at least contain the key. + registry (XXX): The registry to search the type from. + key (str): the key. + Returns: + obj: The constructed object. 
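+    Example (illustrative; 'ResNet' and the BACKBONES registry are placeholders
+    for whatever has actually been registered):
+
+        cfg = {'name': 'ResNet', 'depth': 50}
+        backbone = build(cfg, BACKBONES)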
+ """ + + assert isinstance(cfg, dict) and key in cfg + + cfg_copy = cfg.copy() + obj_type = cfg_copy.pop(key) + + obj_cls = registry.get(obj_type) + if obj_cls is None: + raise KeyError('{} is not in the {} registry'.format( + obj_type, registry.name)) + return obj_cls(**cfg_copy) diff --git a/Bank_second_part/detect_process/paddlevideo/utils/config.py b/Bank_second_part/detect_process/paddlevideo/utils/config.py new file mode 100644 index 0000000..f4d7941 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/utils/config.py @@ -0,0 +1,174 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import yaml +from paddlevideo.utils.logger import coloring, get_logger, setup_logger + +__all__ = ['get_config'] + +logger = setup_logger("./", name="paddlevideo", level="INFO") + + +class AttrDict(dict): + def __getattr__(self, key): + return self[key] + + def __setattr__(self, key, value): + if key in self.__dict__: + self.__dict__[key] = value + else: + self[key] = value + + +def create_attr_dict(yaml_config): + from ast import literal_eval + for key, value in yaml_config.items(): + if type(value) is dict: + yaml_config[key] = value = AttrDict(value) + if isinstance(value, str): + try: + value = literal_eval(value) + except BaseException: + pass + if isinstance(value, AttrDict): + create_attr_dict(yaml_config[key]) + else: + yaml_config[key] = value + + +def parse_config(cfg_file): + """Load a config file into AttrDict""" + with open(cfg_file, 'r') as fopen: + yaml_config = AttrDict(yaml.load(fopen, Loader=yaml.SafeLoader)) + create_attr_dict(yaml_config) + return yaml_config + + +def print_dict(d, delimiter=0): + """ + Recursively visualize a dict and + indenting acrrording by the relationship of keys. 
+ """ + placeholder = "-" * 60 + for k, v in sorted(d.items()): + if isinstance(v, dict): + logger.info("{}{} : ".format(delimiter * " ", coloring(k, + "HEADER"))) + print_dict(v, delimiter + 4) + elif isinstance(v, list) and len(v) >= 1 and isinstance(v[0], dict): + logger.info("{}{} : ".format(delimiter * " ", + coloring(str(k), "HEADER"))) + for value in v: + print_dict(value, delimiter + 4) + else: + logger.info("{}{} : {}".format(delimiter * " ", + coloring(k, "HEADER"), + coloring(v, "OKGREEN"))) + + if k.isupper(): + logger.info(placeholder) + + +def print_config(config): + """ + visualize configs + Arguments: + config: configs + """ + print_dict(config) + + +def check_config(config): + """ + Check config + """ + pass + + +def override(dl, ks, v): + """ + Recursively replace dict of list + Args: + dl(dict or list): dict or list to be replaced + ks(list): list of keys + v(str): value to be replaced + """ + def str2num(v): + try: + return eval(v) + except Exception: + return v + + assert isinstance(dl, (list, dict)), ("{} should be a list or a dict") + assert len(ks) > 0, ('lenght of keys should larger than 0') + if isinstance(dl, list): + k = str2num(ks[0]) + if len(ks) == 1: + assert k < len(dl), ('index({}) out of range({})'.format(k, dl)) + dl[k] = str2num(v) + else: + override(dl[k], ks[1:], v) + else: + if len(ks) == 1: + #assert ks[0] in dl, ('{} is not exist in {}'.format(ks[0], dl)) + if not ks[0] in dl: + logger.warning('A new filed ({}) detected!'.format(ks[0], dl)) + dl[ks[0]] = str2num(v) + else: + assert ks[0] in dl, ( + '({}) doesn\'t exist in {}, a new dict field is invalid'.format( + ks[0], dl)) + override(dl[ks[0]], ks[1:], v) + + +def override_config(config, options=None): + """ + Recursively override the config + Args: + config(dict): dict to be replaced + options(list): list of pairs(key0.key1.idx.key2=value) + such as: [ + epochs=20', + 'PIPELINE.train.transform.1.ResizeImage.resize_short=300' + ] + Returns: + config(dict): replaced config + """ + if options is not None: + for opt in options: + assert isinstance(opt, + str), ("option({}) should be a str".format(opt)) + assert "=" in opt, ( + "option({}) should contain a =" + "to distinguish between key and value".format(opt)) + pair = opt.split('=') + assert len(pair) == 2, ("there can be only a = in the option") + key, value = pair + keys = key.split('.') + override(config, keys, value) + + return config + + +def get_config(fname, overrides=None, show=True): + """ + Read config from file + """ + assert os.path.exists(fname), ('config file({}) is not exist'.format(fname)) + config = parse_config(fname) + override_config(config, overrides) + if show: + print_config(config) + check_config(config) + return config diff --git a/Bank_second_part/detect_process/paddlevideo/utils/dist_utils.py b/Bank_second_part/detect_process/paddlevideo/utils/dist_utils.py new file mode 100644 index 0000000..7659e88 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/utils/dist_utils.py @@ -0,0 +1,30 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +import functools + +import paddle +import paddle.distributed as dist + +def get_dist_info(): + world_size = dist.get_world_size() + rank = dist.get_rank() + return rank, world_size + +def main_only(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + rank, _ = get_dist_info() + if rank == 0: + return func(*args, **kwargs) + return wrapper diff --git a/Bank_second_part/detect_process/paddlevideo/utils/logger.py b/Bank_second_part/detect_process/paddlevideo/utils/logger.py new file mode 100644 index 0000000..e9791b8 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/utils/logger.py @@ -0,0 +1,113 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os +import sys +import datetime + +from paddle.distributed import ParallelEnv + + + +Color = { + 'RED': '\033[31m', + 'HEADER': '\033[35m', # deep purple + 'PURPLE': '\033[95m', # purple + 'OKBLUE': '\033[94m', + 'OKGREEN': '\033[92m', + 'WARNING': '\033[93m', + 'FAIL': '\033[91m', + 'ENDC': '\033[0m' +} + + +def coloring(message, color="OKGREEN"): + assert color in Color.keys() + if os.environ.get('COLORING', True): + return Color[color] + str(message) + Color["ENDC"] + else: + return message + + +logger_initialized = [] + + +def setup_logger(output=None, name="paddlevideo", level="INFO"): + """ + Initialize the paddlevideo logger and set its verbosity level to "INFO". + Args: + output (str): a file name or a directory to save log. If None, will not save log file. + If ends with ".txt" or ".log", assumed to be a file name. + Otherwise, logs will be saved to `output/log.txt`. 
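# Illustrative usage sketch (assumed, not part of the original files): the
# main_only decorator from dist_utils.py restricts side effects to rank 0 in
# distributed runs; every other rank skips the body and gets None back.
# save_checkpoint and model are made-up placeholders.
import paddle
from paddlevideo.utils.dist_utils import main_only

@main_only
def save_checkpoint(state_dict, path):
    paddle.save(state_dict, path)

save_checkpoint(model.state_dict(), "output/last.pdparams")   # writes only on rank 0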
+ name (str): the root module name of this logger + Returns: + logging.Logger: a logger + """ + def time_zone(sec, fmt): + real_time = datetime.datetime.now() + return real_time.timetuple() + logging.Formatter.converter = time_zone + + logger = logging.getLogger(name) + if level == "INFO": + logger.setLevel(logging.INFO) + elif level=="DEBUG": + logger.setLevel(logging.DEBUG) + logger.propagate = False + + if level == "DEBUG": + plain_formatter = logging.Formatter( + "[%(asctime)s] %(name)s %(levelname)s: %(message)s", + datefmt="%m/%d %H:%M:%S") + else: + plain_formatter = logging.Formatter( + "[%(asctime)s] %(message)s", + datefmt="%m/%d %H:%M:%S") + # stdout logging: master only + local_rank = ParallelEnv().local_rank + if local_rank == 0: + ch = logging.StreamHandler(stream=sys.stdout) + ch.setLevel(logging.DEBUG) + formatter = plain_formatter + ch.setFormatter(formatter) + logger.addHandler(ch) + + # file logging: all workers + if output is not None: + if output.endswith(".txt") or output.endswith(".log"): + filename = output + else: + filename = os.path.join(output, ".log.txt") + if local_rank > 0: + filename = filename + ".rank{}".format(local_rank) + + # PathManager.mkdirs(os.path.dirname(filename)) + os.makedirs(os.path.dirname(filename), exist_ok=True) + + # fh = logging.StreamHandler(_cached_log_stream(filename) + fh = logging.FileHandler(filename, mode='a') + fh.setLevel(logging.DEBUG) + fh.setFormatter(plain_formatter) + logger.addHandler(fh) + logger_initialized.append(name) + return logger + + +def get_logger(name, output=None): + logger = logging.getLogger(name) + if name in logger_initialized: + return logger + + return setup_logger(name=name, output=name) diff --git a/Bank_second_part/detect_process/paddlevideo/utils/multigrid/__init__.py b/Bank_second_part/detect_process/paddlevideo/utils/multigrid/__init__.py new file mode 100644 index 0000000..10295b5 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/utils/multigrid/__init__.py @@ -0,0 +1,10 @@ +from .multigrid import MultigridSchedule +from .batchnorm_helper import get_norm, aggregate_sub_bn_stats +from .short_sampler import DistributedShortSampler +from .save_load_helper import subn_save, subn_load +from .interval_helper import is_eval_epoch + +__all__ = [ + 'MultigridSchedule', 'get_norm', 'aggregate_sub_bn_stats', + 'DistributedShortSampler', 'subn_save', 'subn_load', 'is_eval_epoch' +] diff --git a/Bank_second_part/detect_process/paddlevideo/utils/multigrid/__pycache__/__init__.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/utils/multigrid/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..dc3ea59 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/utils/multigrid/__pycache__/__init__.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/utils/multigrid/__pycache__/batchnorm_helper.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/utils/multigrid/__pycache__/batchnorm_helper.cpython-310.pyc new file mode 100644 index 0000000..05cf1e8 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/utils/multigrid/__pycache__/batchnorm_helper.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/utils/multigrid/__pycache__/interval_helper.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/utils/multigrid/__pycache__/interval_helper.cpython-310.pyc new file mode 100644 index 0000000..ef20957 Binary files /dev/null and 
b/Bank_second_part/detect_process/paddlevideo/utils/multigrid/__pycache__/interval_helper.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/utils/multigrid/__pycache__/multigrid.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/utils/multigrid/__pycache__/multigrid.cpython-310.pyc new file mode 100644 index 0000000..64dba74 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/utils/multigrid/__pycache__/multigrid.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/utils/multigrid/__pycache__/save_load_helper.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/utils/multigrid/__pycache__/save_load_helper.cpython-310.pyc new file mode 100644 index 0000000..68e467e Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/utils/multigrid/__pycache__/save_load_helper.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/utils/multigrid/__pycache__/short_sampler.cpython-310.pyc b/Bank_second_part/detect_process/paddlevideo/utils/multigrid/__pycache__/short_sampler.cpython-310.pyc new file mode 100644 index 0000000..de4dd56 Binary files /dev/null and b/Bank_second_part/detect_process/paddlevideo/utils/multigrid/__pycache__/short_sampler.cpython-310.pyc differ diff --git a/Bank_second_part/detect_process/paddlevideo/utils/multigrid/batchnorm_helper.py b/Bank_second_part/detect_process/paddlevideo/utils/multigrid/batchnorm_helper.py new file mode 100644 index 0000000..e39b067 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/utils/multigrid/batchnorm_helper.py @@ -0,0 +1,142 @@ +from functools import partial +import paddle + + +def get_norm(bn_norm_type, bn_num_splits): + """ + Args: + cfg (CfgNode): model building configs, details are in the comments of + the config file. + Returns: + nn.Layer: the normalization layer. + """ + if bn_norm_type == "batchnorm": + return paddle.nn.BatchNorm3D + elif bn_norm_type == "sub_batchnorm": + return partial(SubBatchNorm3D, num_splits=bn_num_splits) + else: + raise NotImplementedError( + "Norm type {} is not supported".format(bn_norm_type)) + + +def aggregate_sub_bn_stats(model): + """ + Recursively find all SubBN modules and aggregate sub-BN stats. + Args: + model (nn.Layer): model to be aggregate sub-BN stats + Returns: + count (int): number of SubBN module found. + """ + count = 0 + for child in model.children(): + if isinstance(child, SubBatchNorm3D): + child.aggregate_stats() + count += 1 + else: + count += aggregate_sub_bn_stats(child) + return count + + +class SubBatchNorm3D(paddle.nn.Layer): + """ + Implement based on paddle2.0. + The standard BN layer computes stats across all examples in a GPU. In some + cases it is desirable to compute stats across only a subset of examples + SubBatchNorm3D splits the batch dimension into N splits, and run BN on + each of them separately (so that the stats are computed on each subset of + examples (1/N of batch) independently. During evaluation, it aggregates + the stats from all splits into one BN. + """ + def __init__(self, num_splits, **args): + """ + Args: + num_splits (int): number of splits. + args (list): list of args + """ + super(SubBatchNorm3D, self).__init__() + self.num_splits = num_splits + self.num_features = args["num_features"] + self.weight_attr = args["weight_attr"] + self.bias_attr = args["bias_attr"] + + # Keep only one set of weight and bias (outside). 
+ if self.weight_attr == False: + self.weight = self.create_parameter( + attr=None, + shape=[self.num_features], + default_initializer=paddle.nn.initializer.Constant(1.0)) + self.weight.stop_gradient = True + else: + self.weight = self.create_parameter( + attr=self.weight_attr, + shape=[self.num_features], + default_initializer=paddle.nn.initializer.Constant(1.0)) + self.weight.stop_gradient = self.weight_attr is not None \ + and self.weight_attr.learning_rate == 0. + + if self.bias_attr == False: + self.bias = self.create_parameter(attr=None, + shape=[self.num_features], + is_bias=True) + self.bias.stop_gradient = True + else: + self.bias = self.create_parameter(attr=self.bias_attr, + shape=[self.num_features], + is_bias=True) + self.bias.stop_gradient = self.bias_attr is not None \ + and self.bias_attr.learning_rate == 0. + + # set weights and bias fixed (inner). + args["weight_attr"] = False + args["bias_attr"] = False + self.bn = paddle.nn.BatchNorm3D(**args) + # update number of features used in split_bn + args["num_features"] = self.num_features * self.num_splits + self.split_bn = paddle.nn.BatchNorm3D(**args) + + def _get_aggregated_mean_std(self, means, stds, n): + """ + Calculate the aggregated mean and stds. + Use the method of update mean and std when merge multi-part data. + Args: + means (tensor): mean values. + stds (tensor): standard deviations. + n (int): number of sets of means and stds. + """ + mean = paddle.sum(paddle.reshape(means, (n, -1)), axis=0) / n + std = (paddle.sum(paddle.reshape(stds, (n, -1)), axis=0) / n + + paddle.sum(paddle.reshape( + paddle.pow((paddle.reshape(means, (n, -1)) - mean), 2), + (n, -1)), + axis=0) / n) + return mean, std + + def aggregate_stats(self): + """ + Synchronize running_mean, and running_var to self.bn. + Call this before eval, then call model.eval(); + When eval, forward function will call self.bn instead of self.split_bn, + During this time the running_mean, and running_var of self.bn has been obtained from + self.split_bn. + """ + if self.split_bn.training: + bn_mean_tensor, bn_variance_tensor = self._get_aggregated_mean_std( + self.split_bn._mean, + self.split_bn._variance, + self.num_splits, + ) + self.bn._mean.set_value(bn_mean_tensor) + self.bn._variance.set_value(bn_variance_tensor) + + def forward(self, x): + if self.training: + n, c, t, h, w = x.shape + x = paddle.reshape( + x, (n // self.num_splits, c * self.num_splits, t, h, w)) + x = self.split_bn(x) + x = paddle.reshape(x, (n, c, t, h, w)) + else: + x = self.bn(x) + x = paddle.multiply(x, paddle.reshape(self.weight, (-1, 1, 1, 1))) + x = paddle.add(x, paddle.reshape(self.bias, (-1, 1, 1, 1))) + return x diff --git a/Bank_second_part/detect_process/paddlevideo/utils/multigrid/interval_helper.py b/Bank_second_part/detect_process/paddlevideo/utils/multigrid/interval_helper.py new file mode 100644 index 0000000..2df4bc7 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/utils/multigrid/interval_helper.py @@ -0,0 +1,19 @@ +def is_eval_epoch(cfg, cur_epoch, total_epochs, multigrid_schedule): + """ + Determine if the model should be evaluated at the current epoch. + Args: + cfg (CfgNode): configs. Details can be found in + slowfast/config/defaults.py + cur_epoch (int): current epoch. + multigrid_schedule (List): schedule for multigrid training. 
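# Illustrative usage sketch (assumed, not part of the original files): the
# intended Sub-BN workflow with the helpers above. get_norm returns a
# constructor, training runs through split_bn, and aggregate_sub_bn_stats
# folds the split statistics back into the plain BN buffers before
# evaluation. video_model is a placeholder paddle.nn.Layer that contains
# SubBatchNorm3D sublayers.
from paddlevideo.utils.multigrid import get_norm, aggregate_sub_bn_stats

norm_cls = get_norm("sub_batchnorm", 4)                    # partial(SubBatchNorm3D, num_splits=4)
bn = norm_cls(num_features=64, weight_attr=None, bias_attr=None)

# ... training iterations on video_model ...
count = aggregate_sub_bn_stats(video_model)                # copy split_bn stats into bn
print("aggregated {} SubBatchNorm3D layers".format(count))
video_model.eval()                                         # eval path now uses self.bn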
+ """ + if cur_epoch + 1 == total_epochs: + return True + if multigrid_schedule is not None: + prev_epoch = 0 + for s in multigrid_schedule: + if cur_epoch < s[-1]: + period = max( + (s[-1] - prev_epoch) // cfg.MULTIGRID.EVAL_FREQ + 1, 1) + return (s[-1] - 1 - cur_epoch) % period == 0 + prev_epoch = s[-1] diff --git a/Bank_second_part/detect_process/paddlevideo/utils/multigrid/multigrid.py b/Bank_second_part/detect_process/paddlevideo/utils/multigrid/multigrid.py new file mode 100644 index 0000000..a296a06 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/utils/multigrid/multigrid.py @@ -0,0 +1,233 @@ +"""Functions for multigrid training.""" + +import numpy as np + + +class MultigridSchedule(object): + """ + This class defines multigrid training schedule and update cfg accordingly. + """ + def init_multigrid(self, cfg): + """ + Update cfg based on multigrid settings. + Args: + cfg (configs): configs that contains training and multigrid specific + hyperparameters. + Returns: + cfg (configs): the updated cfg. + """ + self.schedule = None + # We may modify cfg.DATASET.batch_size, cfg.PIPELINE.train.decode_sampler.num_frames, and + # cfg.PIPELINE.train.transform[1]['MultiCrop']['target_size'] during training, so we store their original + # value in cfg and use them as global variables. + cfg.MULTIGRID.default_batch_size = cfg.DATASET.batch_size # total bs,64 + cfg.MULTIGRID.default_temporal_size = cfg.PIPELINE.train.decode_sampler.num_frames # 32 + cfg.MULTIGRID.default_crop_size = cfg.PIPELINE.train.transform[1][ + 'MultiCrop']['target_size'] # 224 + + if cfg.MULTIGRID.LONG_CYCLE: + self.schedule = self.get_long_cycle_schedule(cfg) + cfg.OPTIMIZER.learning_rate.steps = [0] + [ + s[-1] for s in self.schedule + ] + # Fine-tuning phase. + cfg.OPTIMIZER.learning_rate.steps[-1] = ( + cfg.OPTIMIZER.learning_rate.steps[-2] + + cfg.OPTIMIZER.learning_rate.steps[-1]) // 2 + cfg.OPTIMIZER.learning_rate.lrs = [ + cfg.OPTIMIZER.learning_rate.gamma**s[0] * s[1][0] + for s in self.schedule + ] + # Fine-tuning phase. + cfg.OPTIMIZER.learning_rate.lrs = cfg.OPTIMIZER.learning_rate.lrs[:-1] + [ + cfg.OPTIMIZER.learning_rate.lrs[-2], + cfg.OPTIMIZER.learning_rate.lrs[-1], + ] + + cfg.OPTIMIZER.learning_rate.max_epoch = self.schedule[-1][-1] + + elif cfg.MULTIGRID.SHORT_CYCLE: + cfg.OPTIMIZER.learning_rate.steps = [ + int(s * cfg.MULTIGRID.epoch_factor) + for s in cfg.OPTIMIZER.learning_rate.steps + ] + cfg.OPTIMIZER.learning_rate.max_epoch = int( + cfg.OPTIMIZER.learning_rate.max_epoch * + cfg.OPTIMIZER.learning_rate.max_epoch) + return cfg + + def update_long_cycle(self, cfg, cur_epoch): + """ + Before every epoch, check if long cycle shape should change. If it + should, update cfg accordingly. + Args: + cfg (configs): configs that contains training and multigrid specific + hyperparameters. + cur_epoch (int): current epoch index. + Returns: + cfg (configs): the updated cfg. 
+ changed (bool): whether to change long cycle shape at this epoch + """ + base_b, base_t, base_s = get_current_long_cycle_shape( + self.schedule, cur_epoch) + if base_s != cfg.PIPELINE.train.transform[1]['MultiCrop'][ + 'target_size'] or base_t != cfg.PIPELINE.train.decode_sampler.num_frames: + #NOTE Modify + # no need to modify, used by pool_size in head, None when multigrid + # cfg.MODEL.head.num_frames = base_t + # cfg.MODEL.head.crop_size = base_s + cfg.PIPELINE.train.decode_sampler.num_frames = base_t + cfg.PIPELINE.train.transform[1]['MultiCrop']['target_size'] = base_s + cfg.DATASET.batch_size = base_b * cfg.MULTIGRID.default_batch_size #change bs + + bs_factor = (float(cfg.DATASET.batch_size) / + cfg.MULTIGRID.bn_base_size) + + if bs_factor == 1: #single bs == bn_base_size (== 8) + cfg.MODEL.backbone.bn_norm_type = "batchnorm" + else: + cfg.MODEL.backbone.bn_norm_type = "sub_batchnorm" + cfg.MODEL.backbone.bn_num_splits = int(bs_factor) + + cfg.MULTIGRID.long_cycle_sampling_rate = cfg.PIPELINE.train.decode_sampler.sampling_rate * ( + cfg.MULTIGRID.default_temporal_size // base_t) + print("Long cycle updates:") + print("\tbn_norm_type: {}".format(cfg.MODEL.backbone.bn_norm_type)) + if cfg.MODEL.backbone.bn_norm_type == "sub_batchnorm": + print("\tbn_num_splits: {}".format( + cfg.MODEL.backbone.bn_num_splits)) + print("\tTRAIN.batch_size[single card]: {}".format( + cfg.DATASET.batch_size)) + print("\tDATA.NUM_FRAMES x LONG_CYCLE_SAMPLING_RATE: {}x{}".format( + cfg.PIPELINE.train.decode_sampler.num_frames, + cfg.MULTIGRID.long_cycle_sampling_rate)) + print("\tDATA.train_crop_size: {}".format( + cfg.PIPELINE.train.transform[1]['MultiCrop']['target_size'])) + return cfg, True + else: + return cfg, False + + def get_long_cycle_schedule(self, cfg): + """ + Based on multigrid hyperparameters, define the schedule of a long cycle. + Args: + cfg (configs): configs that contains training and multigrid specific + hyperparameters. + Returns: + schedule (list): Specifies a list long cycle base shapes and their + corresponding training epochs. + """ + + steps = cfg.OPTIMIZER.learning_rate.steps + + default_size = float( + cfg.PIPELINE.train.decode_sampler.num_frames * + cfg.PIPELINE.train.transform[1]['MultiCrop']['target_size']** + 2) # 32 * 224 * 224 C*H*W + default_iters = steps[-1] # 196 + + # Get shapes and average batch size for each long cycle shape. + avg_bs = [] + all_shapes = [] + # for t_factor, s_factor in cfg.MULTIGRID.long_cycle_factors: + for item in cfg.MULTIGRID.long_cycle_factors: + t_factor, s_factor = item["value"] + base_t = int( + round(cfg.PIPELINE.train.decode_sampler.num_frames * t_factor)) + base_s = int( + round( + cfg.PIPELINE.train.transform[1]['MultiCrop']['target_size'] + * s_factor)) + if cfg.MULTIGRID.SHORT_CYCLE: + shapes = [ + [ + base_t, + cfg.MULTIGRID.default_crop_size * + cfg.MULTIGRID.short_cycle_factors[0], + ], + [ + base_t, + cfg.MULTIGRID.default_crop_size * + cfg.MULTIGRID.short_cycle_factors[1], + ], + [base_t, base_s], + ] #first two is short_cycle, last is the base long_cycle + else: + shapes = [[base_t, base_s]] + + # (T, S) -> (B, T, S) + shapes = [[ + int(round(default_size / (s[0] * s[1] * s[1]))), s[0], s[1] + ] for s in shapes] + avg_bs.append(np.mean([s[0] for s in shapes])) + all_shapes.append(shapes) + + # Get schedule regardless of cfg.MULTIGRID.epoch_factor. 
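# Illustrative usage sketch (assumed, not part of the original files): how the
# long-cycle schedule is typically consumed from a training loop. The trainer
# itself is not part of this diff, so build_dataloader, train_one_epoch and
# validate are placeholders.
from paddlevideo.utils.multigrid import MultigridSchedule, is_eval_epoch

multigrid = MultigridSchedule()
cfg = multigrid.init_multigrid(cfg)
total_epochs = cfg.OPTIMIZER.learning_rate.max_epoch

for epoch in range(total_epochs):
    if cfg.MULTIGRID.LONG_CYCLE:
        cfg, changed = multigrid.update_long_cycle(cfg, epoch)
        if changed:
            # batch size / num_frames / crop size changed for this cycle:
            # rebuild the dataloader (and BN type) for the new shape
            train_loader = build_dataloader(cfg)       # placeholder
    train_one_epoch(model, train_loader)               # placeholder
    if is_eval_epoch(cfg, epoch, total_epochs, multigrid.schedule):
        validate(model)                                # placeholder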
+ total_iters = 0 + schedule = [] + for step_index in range(len(steps) - 1): + step_epochs = steps[step_index + 1] - steps[step_index] + + for long_cycle_index, shapes in enumerate(all_shapes): + #ensure each of 4 sequences run the same num of iters + cur_epochs = (step_epochs * avg_bs[long_cycle_index] / + sum(avg_bs)) + + # get cur_iters from cur_epochs + cur_iters = cur_epochs / avg_bs[long_cycle_index] + total_iters += cur_iters + schedule.append((step_index, shapes[-1], cur_epochs)) + + iter_saving = default_iters / total_iters # ratio between default iters and real iters + + final_step_epochs = cfg.OPTIMIZER.learning_rate.max_epoch - steps[-1] + + # We define the fine-tuning phase to have the same amount of iteration + # saving as the rest of the training. + #final_step_epochs / iter_saving make fine-tune having the same iters as training + ft_epochs = final_step_epochs / iter_saving * avg_bs[-1] + + # schedule.append((step_index + 1, all_shapes[-1][2], ft_epochs)) + schedule.append((step_index + 1, all_shapes[-1][-1], ft_epochs)) + + # Obtrain final schedule given desired cfg.MULTIGRID.epoch_factor. + x = (cfg.OPTIMIZER.learning_rate.max_epoch * + cfg.MULTIGRID.epoch_factor / sum(s[-1] for s in schedule)) + + final_schedule = [] + total_epochs = 0 + for s in schedule: + epochs = s[2] * x + total_epochs += epochs + final_schedule.append((s[0], s[1], int(round(total_epochs)))) + print_schedule(final_schedule) + return final_schedule + + +def print_schedule(schedule): + """ + Log schedule. + """ + print( + "Long_cycle_index\tBase_shape(bs_factor,temporal_size,crop_size)\tEpochs" + ) + for s in schedule: + print("{}\t\t\t{}\t\t\t\t\t{}".format(s[0], s[1], s[2])) + + +def get_current_long_cycle_shape(schedule, epoch): + """ + Given a schedule and epoch index, return the long cycle base shape. + Args: + schedule (configs): configs that contains training and multigrid specific + hyperparameters. + cur_epoch (int): current epoch index. + Returns: + shapes (list): A list describing the base shape in a long cycle: + [batch size relative to default, + number of frames, spatial dimension]. + """ + for s in schedule: + if epoch < s[-1]: + return s[1] + return schedule[-1][1] diff --git a/Bank_second_part/detect_process/paddlevideo/utils/multigrid/save_load_helper.py b/Bank_second_part/detect_process/paddlevideo/utils/multigrid/save_load_helper.py new file mode 100644 index 0000000..94a52d5 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/utils/multigrid/save_load_helper.py @@ -0,0 +1,237 @@ +import os +import numpy as np +import paddle +import copy + + +def sub_to_normal_bn(sd): + """ + When save, Convert the Sub-BN paprameters to normal BN parameters in a state dict. + There are two copies of BN layers in a Sub-BN implementation: `bn.bn` and + `bn.split_bn`. `bn.split_bn` is used during training and + "compute_precise_bn". Before saving or evaluation, its stats are copied to + `bn.bn`. We rename `bn.bn` to `bn` and store it to be consistent with normal + BN layers. + Args: + sd (OrderedDict): a dict of parameters which might contain Sub-BN + parameters. + Returns: + new_sd (OrderedDict): a dict with Sub-BN parameters reshaped to + normal parameters. 
+ """ + modifications = [ + ("bn.bn._mean", "bn._mean"), + ("bn.bn._variance", "bn._variance"), + ] + to_remove = ["bn.bn.", ".split_bn."] + key_list = list(sd.keys()) #odict_keys to list + for key in key_list: + for before, after in modifications: + if key.endswith(before): + new_key = key.split(before)[0] + after + sd[new_key] = sd.pop(key) + + for rm in to_remove: + if rm in key and key in sd: + del sd[key] + + +def normal_to_sub_bn(checkpoint_sd, model_sd): + """ + When load, Convert BN parameters to Sub-BN parameters if model contains Sub-BNs. + Args: + checkpoint_sd (OrderedDict): source dict of parameters. + model_sd (OrderedDict): target dict of parameters. + Returns: + new_sd (OrderedDict): converted dict of parameters. + """ + for key in model_sd: + if key not in checkpoint_sd: + # not to replace bn.weight and bn.bias + if "bn.split_bn." in key and "bn.weight" not in key and "bn.bias" not in key: + load_key = key.replace("bn.split_bn.", "bn.") + bn_key = key.replace("bn.split_bn.", "bn.bn.") + checkpoint_sd[key] = checkpoint_sd.pop(load_key) + checkpoint_sd[bn_key] = checkpoint_sd[key] + + # match the shape of bn.split_bn._xx + # model_sd: split_bn.rm.shape = num_feature*num_split + # checkpoint_sd: split_bn.rm.shape = bn.rm.shape = num_feature + for key in model_sd: + if key in checkpoint_sd: + model_blob_shape = model_sd[key].shape #bn.split_bn + c2_blob_shape = checkpoint_sd[key].shape #bn.bn + + if (len(model_blob_shape) == 1 and len(c2_blob_shape) == 1 + and model_blob_shape[0] > c2_blob_shape[0] + and model_blob_shape[0] % c2_blob_shape[0] == 0): + before_shape = checkpoint_sd[key].shape + checkpoint_sd[key] = np.concatenate( + [checkpoint_sd[key]] * + (model_blob_shape[0] // c2_blob_shape[0])) + if 'split_bn' not in key: #split_bn is excepted + print("{} {} -> {}".format(key, before_shape, + checkpoint_sd[key].shape)) + return checkpoint_sd + + +def mapping_opt_dict(opt_dict, model_key_list): + """ + Paddle Name schedule: conv_1.w -> conv_2.w + Sometimes: sub_bn -> bn + when re-build model, we desire the parameter name to be coincident, + but the parameters name index will be added, as conv_1 to conv_2, not conv_1. + It will raise error if we set old saved parameters to new created optimizer. + as conv_2 cannot find in state_dict(only conv_1). + Args: + opt_dict: optimizer state dict, including the name and value of parameters gradient. + model_key_list: the parameters name list of re-build model. 
+ Return: optimizer state dict with modified keys + """ + def get_name_info(PNAME, PN_key_list, key_list): + min_index = float('inf') + max_index = 0 + for name in PN_key_list[1:]: + for key in key_list: + if name in key: + index = int(key.split('.')[0].split(name)[-1]) + if index < min_index: + min_index = index + if index > max_index: + max_index = index + num_name = max_index - min_index + 1 + PNAME[name].append((min_index, max_index, num_name)) + min_index = float('inf') + max_index = 0 + + PNAME = { + "LR_Scheduler": [], + "conv3d_": [], + "linear_": [], + "sub_batch_norm3d_": [], + "batch_norm3d_": [], + } + + pd_key_list = list(opt_dict.keys()) + print("The number of parameters in saved optimizer state dict = {}".format( + len(pd_key_list))) + print("The number of parameters in re-build model list = {}".format( + len(model_key_list))) + # 1 may be LR_Scheduler + PN_key_list = list(PNAME.keys()) + + # get the number of each PNAME + get_name_info(PNAME, PN_key_list, pd_key_list) + get_name_info(PNAME, PN_key_list, model_key_list) + print("[Parameters info] prefix: min_index, max_index, number_params: \n", + PNAME) + + # whether to change name of bn layer + change_name = False + if PNAME["sub_batch_norm3d_"][0][-1] == -float('inf'): + PN_key_list.remove("sub_batch_norm3d_") + if PNAME["sub_batch_norm3d_"][1][-1] != -float('inf'): + print( + "Optimizer state dict saved bn, but Re-build model use sub_bn, changed name!" + ) + change_name = True + else: + print("Optimizer state dict saved bn, and Re-build model use bn") + else: + PN_key_list.remove("batch_norm3d_") + if PNAME["sub_batch_norm3d_"][1][-1] == -float('inf'): + print( + "Optimizer state dict saved sub_bn, but Re-build model use bn, changed name!" + ) + change_name = True + else: + print( + "Optimizer state dict saved sub_bn, Re-build model use sub_bn") + + #update key name + # sub_bn -> bn name mapping, pre-define dict + change_dict = { + "sub_batch_norm3d_": "batch_norm3d_", + "batch_norm3d_": "sub_batch_norm3d_" + } + for key in pd_key_list: + for name in PN_key_list[1:]: + if key.startswith(name): + start = change_dict[name] if ( + change_name and "batch_norm" in name) else name + str_index = key.split('.')[0].split(name)[-1] + index = int(str_index) + new_index = str(index + + (PNAME[start][1][0] - PNAME[name][0][0])) + end = key.split('.')[-1] + update_key = start + new_index + '.' + end + opt_dict[update_key] = opt_dict.pop(key) + + return opt_dict + + +def subn_save(save_dir, name_prefix, epoch, video_model, optimizer): + if not os.path.isdir(save_dir): + os.makedirs(save_dir) + model_path = os.path.join(save_dir, name_prefix + "{:05d}".format(epoch)) + model_dict = video_model.state_dict() + sub_to_normal_bn(model_dict) + opti_dict = optimizer.state_dict() + paddle.save(model_dict, model_path + '.pdparams') + paddle.save(opti_dict, model_path + '.pdopt') + print('[Saved Epoch {} parameters and optimizer state ]'.format(epoch)) + + +def subn_load(model, ck_path, optimizer=None): + """ + Load the checkpoint from the given file. + Args: + model (model): model to load the weights from the checkpoint. + optimizer (optim, optional): optimizer to load the historical state. + ck_path (str): checkpoint path + Returns: + (int): the number of training epoch of the checkpoint. 
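# Illustrative usage sketch (assumed, not part of the original files): the
# checkpoint round trip with the helpers in this file. subn_save folds Sub-BN
# statistics into plain BN names before writing; subn_load expands them again
# and remaps optimizer parameter names if the rebuilt model switched between
# bn and sub_bn. Paths, prefix and epoch number are placeholders.
from paddlevideo.utils.multigrid import subn_save, subn_load

subn_save("output/", "ppTSM_mg_epoch_", 10, video_model, optimizer)
# ... later, possibly after rebuilding the model with a different BN type ...
subn_load(video_model, "output/ppTSM_mg_epoch_00010", optimizer)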
+ """ + + assert os.path.exists(ck_path + ".pdparams"), \ + "Given dir {}.pdparams not exist.".format(ck_path) + print("load checkpint from {}.pdparams".format(ck_path)) + + model_dict = model.state_dict() + checkpoint_dict = paddle.load(ck_path + ".pdparams") + # checkpoint_dict = copy.deepcopy(checkpoint_dict_orig) #not modify when multi card + pre_train_dict = normal_to_sub_bn(checkpoint_dict, model_dict) + + # Match pre-trained weights that have same shape as current model. + pre_train_dict_match = { + k: v + for k, v in pre_train_dict.items() + if k in model_dict and tuple(v.shape) == tuple(model_dict[k].shape) + } + + # Weights that do not have match from the pre-trained model. + not_load_layers = [ + k for k in model_dict.keys() if k not in pre_train_dict_match.keys() + ] + # Log weights that are not loaded with the pre-trained weights. + if not_load_layers: + for k in not_load_layers: + if 'bn.weight' not in k and 'bn.bias' not in k: + print("Network weights {} not loaded.".format(k)) + + # Load pre-trained weights. + model.set_state_dict(pre_train_dict_match) + + if optimizer: + assert os.path.exists(ck_path + ".pdopt"), \ + "Given dir {}.pdopt not exist.".format(ck_path) + print("load checkpint from {}.pdopt".format(ck_path)) + opt_dict = paddle.load(ck_path + ".pdopt") + # get parameters that required gradient from re-build model + model_key_list = [] + for param in model.parameters(): + if param.stop_gradient == False: + model_key_list.append(param.name) + + new_opt_dict = mapping_opt_dict(opt_dict, model_key_list) + optimizer.set_state_dict(new_opt_dict) diff --git a/Bank_second_part/detect_process/paddlevideo/utils/multigrid/short_sampler.py b/Bank_second_part/detect_process/paddlevideo/utils/multigrid/short_sampler.py new file mode 100644 index 0000000..0004dac --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/utils/multigrid/short_sampler.py @@ -0,0 +1,147 @@ +from __future__ import print_function +from __future__ import division + +import numpy as np +import math + +from paddle.io import BatchSampler + +__all__ = ["DistributedShortSampler"] + + +class DistributedShortSampler(BatchSampler): + """Sampler that restricts data loading to a subset of the dataset. + In such case, each process can pass a DistributedBatchSampler instance + as a DataLoader sampler, and load a subset of the original dataset that + is exclusive to it. + .. note:: + Batch size is dynamic changed following short cycle schedule. + + Args: + dataset(paddle.io.Dataset): this could be a `paddle.io.Dataset` implement + or other python object which implemented + `__len__` for BatchSampler to get sample + number of data source. + batch_sizes(list): batch size list of one cycle. + num_replicas(int, optional): porcess number in distributed training. + If :attr:`num_replicas` is None, :attr:`num_replicas` will be + retrieved from :code:`paddle.fluid.dygraph.parallel.ParallenEnv`. + Default None. + rank(int, optional): the rank of the current process among :attr:`num_replicas` + processes. If :attr:`rank` is None, :attr:`rank` is retrieved from + :code:`paddle.fluid.dygraph.parallel.ParallenEnv`. Default None. + shuffle(bool): whther to shuffle indices order before genrating + batch indices. Default False. + drop_last(bool): whether drop the last incomplete batch dataset size + is not divisible by the batch size. 
Default False + """ + def __init__(self, + dataset, + batch_sizes, + num_replicas=None, + rank=None, + shuffle=False, + drop_last=False): + self.dataset = dataset + + assert any(isinstance(batch_size, int) and batch_size > 0 for batch_size in batch_sizes), \ + "batch_size should be a positive integer" + self.batch_sizes = batch_sizes + self.len_batch_sizes = len(self.batch_sizes) + assert isinstance(shuffle, bool), \ + "shuffle should be a boolean value" + self.shuffle = shuffle + assert isinstance(drop_last, bool), \ + "drop_last should be a boolean number" + + from paddle.distributed import ParallelEnv + + if num_replicas is not None: + assert isinstance(num_replicas, int) and num_replicas > 0, \ + "num_replicas should be a positive integer" + self.nranks = num_replicas + else: + self.nranks = ParallelEnv().nranks + + if rank is not None: + assert isinstance(rank, int) and rank >= 0, \ + "rank should be a non-negative integer" + self.local_rank = rank + else: + self.local_rank = ParallelEnv().local_rank + + self.drop_last = drop_last + self.epoch = 0 + self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.nranks)) + self.total_size = self.num_samples * self.nranks + + def __iter__(self): + num_samples = len(self.dataset) + indices = np.arange(num_samples).tolist() + indices += indices[:(self.total_size - + len(indices))] #completion last iter + assert len(indices) == self.total_size + if self.shuffle: + np.random.RandomState(self.epoch).shuffle(indices) + self.epoch += 1 + + # subsample + def _get_indices_by_batch_size(indices): + total_batch_size = sum(self.batch_sizes) + subsampled_indices = [] + last_batch_size = self.total_size % ( + total_batch_size * self.nranks) #number samples of last batch + assert last_batch_size % self.nranks == 0 + last_local_batch_size = last_batch_size // self.nranks + + for i in range(self.local_rank * total_batch_size, + len(indices) - last_batch_size, + total_batch_size * self.nranks): + subsampled_indices.extend(indices[i:i + total_batch_size]) + + indices = indices[len(indices) - last_batch_size:] + subsampled_indices.extend( + indices[self.local_rank * + last_local_batch_size:(self.local_rank + 1) * + last_local_batch_size]) + return subsampled_indices + + if self.nranks > 1: + indices = _get_indices_by_batch_size(indices) + + assert len(indices) == self.num_samples #index length in each card + _sample_iter = iter(indices) + + batch_indices = [] + counter = 0 + batch_size = self.batch_sizes[0] + for idx in _sample_iter: + batch_indices.append( + (idx, counter % + self.len_batch_sizes)) #to be used in dataloader get_item + if len(batch_indices) == batch_size: + yield batch_indices + counter += 1 + batch_size = self.batch_sizes[counter % self.len_batch_sizes] + batch_indices = [] + if not self.drop_last and len(batch_indices) > 0: + yield batch_indices + + def __len__(self): + avg_batch_size = sum(self.batch_sizes) / float(self.len_batch_sizes) + if self.drop_last: + return int(np.floor(self.num_samples / avg_batch_size)) + else: + return int(np.ceil(self.num_samples / avg_batch_size)) + + def set_epoch(self, epoch): + """ + Sets the epoch number. When :attr:`shuffle=True`, this number is used + as seeds of random numbers. By default, users may not set this, all + replicas (workers) use a different random ordering for each epoch. + If set same number at each epoch, this sampler will yield the same + ordering at all epoches. + Arguments: + epoch (int): Epoch number. 
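# Illustrative usage sketch (assumed, not part of the original files): the
# sampler yields batches of (index, cycle_position) tuples with a batch size
# that cycles through batch_sizes, so the wrapped dataset must accept that
# tuple in __getitem__. video_dataset and num_epochs are placeholders.
import paddle
from paddlevideo.utils.multigrid import DistributedShortSampler

sampler = DistributedShortSampler(video_dataset,
                                  batch_sizes=[32, 16, 8],   # one short cycle
                                  shuffle=True,
                                  drop_last=True)
loader = paddle.io.DataLoader(video_dataset, batch_sampler=sampler, num_workers=4)

for epoch in range(num_epochs):
    sampler.set_epoch(epoch)        # reproducible shuffling per epoch
    for batch in loader:
        ...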
+ """ + self.epoch = epoch diff --git a/Bank_second_part/detect_process/paddlevideo/utils/precise_bn.py b/Bank_second_part/detect_process/paddlevideo/utils/precise_bn.py new file mode 100644 index 0000000..c9fdd40 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/utils/precise_bn.py @@ -0,0 +1,94 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import itertools + +from paddlevideo.utils import get_logger + +logger = get_logger("paddlevideo") +""" +Implement precise bn, which is useful for improving accuracy. +""" + + +@paddle.no_grad() # speed up and save CUDA memory +def do_preciseBN(model, + data_loader, + parallel, + num_iters=200, + use_amp=False, + amp_level=None): + """ + Recompute and update the batch norm stats to make them more precise. During + training both BN stats and the weight are changing after every iteration, so + the running average can not precisely reflect the actual stats of the + current model. + In this function, the BN stats are recomputed with fixed weights, to make + the running average more precise. Specifically, it computes the true average + of per-batch mean/variance instead of the running average. + This is useful to improve validation accuracy. + Args: + model: the model whose bn stats will be recomputed + data_loader: an iterator. Produce data as input to the model + num_iters: number of iterations to compute the stats. + Return: + the model with precise mean and variance in bn layers. + """ + bn_layers_list = [ + m for m in model.sublayers() + if any((isinstance(m, bn_type) + for bn_type in (paddle.nn.BatchNorm1D, paddle.nn.BatchNorm2D, + paddle.nn.BatchNorm3D))) and m.training + ] + if len(bn_layers_list) == 0: + return + + # moving_mean=moving_mean*momentum+batch_mean*(1.−momentum) + # we set momentum=0. to get the true mean and variance during forward + momentum_actual = [bn._momentum for bn in bn_layers_list] + for bn in bn_layers_list: + bn._momentum = 0. + + running_mean = [paddle.zeros_like(bn._mean) + for bn in bn_layers_list] # pre-ignore + running_var = [paddle.zeros_like(bn._variance) for bn in bn_layers_list] + + ind = -1 + for ind, data in enumerate(itertools.islice(data_loader, num_iters)): + logger.info("Computing precise BN {} / {}...".format( + ind + 1, num_iters)) + + if use_amp: + with paddle.amp.auto_cast( + custom_black_list={"reduce_mean", + "conv3d"}, level=amp_level): + model(data, mode='train') + else: + model(data, mode='train') + + for i, bn in enumerate(bn_layers_list): + # Accumulates the bn stats. + running_mean[i] += (bn._mean - running_mean[i]) / (ind + 1) + running_var[i] += (bn._variance - running_var[i]) / (ind + 1) + + assert ind == num_iters - 1, ( + "update_bn_stats is meant to run for {} iterations, but the dataloader stops at {} iterations." + .format(num_iters, ind)) + + # Sets the precise bn stats. 
+ for i, bn in enumerate(bn_layers_list): + bn._mean.set_value(running_mean[i]) + bn._variance.set_value(running_var[i]) + bn._momentum = momentum_actual[i] diff --git a/Bank_second_part/detect_process/paddlevideo/utils/profiler.py b/Bank_second_part/detect_process/paddlevideo/utils/profiler.py new file mode 100644 index 0000000..629ef4e --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/utils/profiler.py @@ -0,0 +1,128 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import paddle.profiler as profiler + +# A global variable to record the number of calling times for profiler +# functions. It is used to specify the tracing range of training steps. +_profiler_step_id = 0 + +# A global variable to avoid parsing from string every time. +_profiler_options = None +_prof = None + +class ProfilerOptions(object): + ''' + Use a string to initialize a ProfilerOptions. + The string should be in the format: "key1=value1;key2=value;key3=value3". + For example: + "profile_path=model.profile" + "batch_range=[50, 60]; profile_path=model.profile" + "batch_range=[50, 60]; tracer_option=OpDetail; profile_path=model.profile" + + ProfilerOptions supports following key-value pair: + batch_range - a integer list, e.g. [100, 110]. + state - a string, the optional values are 'CPU', 'GPU' or 'All'. + sorted_key - a string, the optional values are 'calls', 'total', + 'max', 'min' or 'ave. + tracer_option - a string, the optional values are 'Default', 'OpDetail', + 'AllOpDetail'. + profile_path - a string, the path to save the serialized profile data, + which can be used to generate a timeline. + exit_on_finished - a boolean. + ''' + + def __init__(self, options_str): + assert isinstance(options_str, str) + + self._options = { + 'batch_range': [10, 20], + 'state': 'All', + 'sorted_key': 'total', + 'tracer_option': 'Default', + 'profile_path': '/tmp/profile', + 'exit_on_finished': True, + 'timer_only': True + } + self._parse_from_string(options_str) + + def _parse_from_string(self, options_str): + for kv in options_str.replace(' ', '').split(';'): + key, value = kv.split('=') + if key == 'batch_range': + value_list = value.replace('[', '').replace(']', '').split(',') + value_list = list(map(int, value_list)) + if len(value_list) >= 2 and value_list[0] >= 0 and value_list[ + 1] > value_list[0]: + self._options[key] = value_list + elif key == 'exit_on_finished': + self._options[key] = value.lower() in ("yes", "true", "t", "1") + elif key in [ + 'state', 'sorted_key', 'tracer_option', 'profile_path' + ]: + self._options[key] = value + elif key == 'timer_only': + self._options[key] = value + + def __getitem__(self, name): + if self._options.get(name, None) is None: + raise ValueError( + "ProfilerOptions does not have an option named %s." % name) + return self._options[name] + + +def add_profiler_step(options_str=None): + ''' + Enable the operator-level timing using PaddlePaddle's profiler. 
+ The profiler uses a independent variable to count the profiler steps. + One call of this function is treated as a profiler step. + Args: + profiler_options - a string to initialize the ProfilerOptions. + Default is None, and the profiler is disabled. + ''' + if options_str is None: + return + + global _prof + global _profiler_step_id + global _profiler_options + + if _profiler_options is None: + _profiler_options = ProfilerOptions(options_str) + # profile : https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/performance_improving/profiling_model.html#chakanxingnengshujudetongjibiaodan + # timer_only = True only the model's throughput and time overhead are displayed + # timer_only = False calling summary can print a statistical form that presents performance data from different perspectives. + # timer_only = False the output Timeline information can be found in the profiler_log directory + if _prof is None: + _timer_only = str(_profiler_options['timer_only']) == str(True) + _prof = profiler.Profiler( + scheduler = (_profiler_options['batch_range'][0], _profiler_options['batch_range'][1]), + on_trace_ready = profiler.export_chrome_tracing('./profiler_log'), + timer_only = _timer_only) + _prof.start() + else: + _prof.step() + + if _profiler_step_id == _profiler_options['batch_range'][1]: + _prof.stop() + _prof.summary( + op_detail=True, + thread_sep=False, + time_unit='ms') + _prof = None + if _profiler_options['exit_on_finished']: + sys.exit(0) + + _profiler_step_id += 1 diff --git a/Bank_second_part/detect_process/paddlevideo/utils/record.py b/Bank_second_part/detect_process/paddlevideo/utils/record.py new file mode 100644 index 0000000..4aad434 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/utils/record.py @@ -0,0 +1,163 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
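# Illustrative usage sketch (assumed, not part of the original files): wiring
# the profiler hook above into a training loop. The options string would
# normally come from a command-line flag; it is hard-coded here as a
# placeholder, and train_loader / train_step are placeholders too.
from paddlevideo.utils.profiler import add_profiler_step

profiler_options = "batch_range=[50, 60]; profile_path=model.profile; exit_on_finished=true"

for batch_id, data in enumerate(train_loader):
    train_step(data)
    add_profiler_step(profiler_options)   # no-op when called with None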
+ +import datetime +from collections import OrderedDict + +import paddle + +from .logger import coloring, get_logger + +logger = get_logger("paddlevideo") + +__all__ = ['AverageMeter', 'build_record', 'log_batch', 'log_epoch'] + + +def build_record(cfg): + record_list = [ + ("loss", AverageMeter('loss', '7.5f')), + ("lr", AverageMeter('lr', 'f', need_avg=False)), + ] + if 'Recognizer1D' in cfg.framework: #TODO: required specify str in framework + record_list.append(("hit_at_one", AverageMeter("hit_at_one", '.5f'))) + record_list.append(("perr", AverageMeter("perr", '.5f'))) + record_list.append(("gap", AverageMeter("gap", '.5f'))) + elif 'Recognizer' in cfg.framework: + record_list.append(("top1", AverageMeter("top1", '.5f'))) + record_list.append(("top5", AverageMeter("top5", '.5f'))) + elif 'FastRCNN' in cfg.framework: + record_list.append( + ("recall@thr=0.5", AverageMeter("recall@thr=0.5", '.5f'))) + record_list.append(("prec@thr=0.5", AverageMeter("prec@thr=0.5", + '.5f'))) + record_list.append(("recall@top3", AverageMeter("recall@top3", '.5f'))) + record_list.append(("prec@top3", AverageMeter("prec@top3", '.5f'))) + record_list.append(("recall@top5", AverageMeter("recall@top5", '.5f'))) + record_list.append(("prec@top5", AverageMeter("prec@top5", '.5f'))) + record_list.append(("mAP@0.5IOU", AverageMeter("mAP@0.5IOU", '.5f'))) + elif 'DepthEstimator' in cfg.framework: + record_list.append(("abs_rel", AverageMeter("abs_rel", '.5f'))) + record_list.append(("sq_rel", AverageMeter("sq_rel", '.5f'))) + record_list.append(("rmse", AverageMeter("rmse", '.5f'))) + record_list.append(("rmse_log", AverageMeter("rmse_log", '.5f'))) + record_list.append(("a1", AverageMeter("a1", '.5f'))) + record_list.append(("a2", AverageMeter("a2", '.5f'))) + record_list.append(("a3", AverageMeter("a3", '.5f'))) + record_list.append(("losses_day", AverageMeter("losses_day", '.5f'))) + record_list.append(("losses_night", AverageMeter("losses_night", + '.5f'))) + elif 'MSTCN' in cfg.framework or 'ASRF' in cfg.framework: + record_list.append(("F1@0.50", AverageMeter("F1@0.50", '.5f'))) + + elif 'YOWOLocalizer' in cfg.framework: + record_list.append(("nCorrect", AverageMeter('nCorrect', '.1f'))) + record_list.append(("fscore", AverageMeter("fscore", '.5f'))) + + record_list.append(("batch_time", AverageMeter('batch_cost', '.5f'))) + record_list.append(("reader_time", AverageMeter('reader_cost', '.5f'))) + record_list = OrderedDict(record_list) + return record_list + + +class AverageMeter(object): + """ + Computes and stores the average and current value + """ + def __init__(self, name='', fmt='f', need_avg=True): + self.name = name + self.fmt = fmt + self.need_avg = need_avg + self.reset() + + def reset(self): + """ reset """ + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + """ update """ + if isinstance(val, paddle.Tensor): + val = float(val) + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + @property + def total(self): + return '{self.name}_sum: {self.sum:{self.fmt}}'.format(self=self) + + @property + def total_minute(self): + return '{self.name}_sum: {s:{self.fmt}} min'.format(s=self.sum / 60, + self=self) + + @property + def mean(self): + return '{self.name}_avg: {self.avg:{self.fmt}}'.format( + self=self) if self.need_avg else '' + + @property + def value(self): + return '{self.name}: {self.val:{self.fmt}}'.format(self=self) + + +def log_batch(metric_list, + batch_id, + epoch_id, + total_epoch, + mode, + ips, + 
eta_sec: int = None): + batch_cost = str(metric_list['batch_time'].value) + ' sec,' + reader_cost = str(metric_list['reader_time'].value) + ' sec,' + + metric_values = [] + for m in metric_list: + if not (m == 'batch_time' or m == 'reader_time'): + metric_values.append(metric_list[m].value) + metric_str = ' '.join([str(v) for v in metric_values]) + epoch_str = "epoch:[{:>3d}/{:<3d}]".format(epoch_id, total_epoch) + step_str = "{:s} step:{:<4d}".format(mode, batch_id) + if eta_sec is not None: + eta_str = "eta: {:s}".format( + str(datetime.timedelta(seconds=int(eta_sec)))) + else: + eta_str = '' + logger.info("{:s} {:s} {:s} {:s} {:s} {} {:s}".format( + coloring(epoch_str, "HEADER") if batch_id == 0 else epoch_str, + coloring(step_str, "PURPLE"), coloring(metric_str, 'OKGREEN'), + coloring(batch_cost, "OKGREEN"), coloring(reader_cost, 'OKGREEN'), ips, + eta_str)) + + +def log_epoch(metric_list, epoch, mode, ips): + batch_cost = 'avg_' + str(metric_list['batch_time'].value) + ' sec,' + reader_cost = 'avg_' + str(metric_list['reader_time'].value) + ' sec,' + batch_sum = str(metric_list['batch_time'].total) + ' sec,' + + metric_values = [] + for m in metric_list: + if not (m == 'batch_time' or m == 'reader_time'): + metric_values.append(metric_list[m].mean) + metric_str = ' '.join([str(v) for v in metric_values]) + + end_epoch_str = "END epoch:{:<3d}".format(epoch) + + logger.info("{:s} {:s} {:s} {:s} {:s} {:s} {}".format( + coloring(end_epoch_str, "RED"), coloring(mode, "PURPLE"), + coloring(metric_str, "OKGREEN"), coloring(batch_cost, "OKGREEN"), + coloring(reader_cost, "OKGREEN"), coloring(batch_sum, "OKGREEN"), ips)) diff --git a/Bank_second_part/detect_process/paddlevideo/utils/registry.py b/Bank_second_part/detect_process/paddlevideo/utils/registry.py new file mode 100644 index 0000000..81b76bd --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/utils/registry.py @@ -0,0 +1,96 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +class Registry(object): + """ + The registry that provides name -> object mapping, to support third-party users' custom modules. + + To register an object: + + .. code-block:: python + + BACKBONES = Registry('backbone') + @BACKBONES.register() + class ResNet: + pass + Or: + .. code-block:: python + + BACKBONES = Registry('backbone') + class ResNet: + pass + BACKBONES.register(ResNet) + + Usage: To build a module. + + .. 
code-block:: python + backbone_name = "ResNet" + b = BACKBONES.get(backbone_name)() + + """ + def __init__(self, name): + """ + Args: + name (str): the name of this registry + """ + self._name = name + self._obj_map = {} + + def __contains__(self, key): + return self._obj_map.get(key) is not None + + def _do_register(self, name, obj): + assert ( + name not in self._obj_map + ), "An object named '{}' was already registered in '{}' registry!".format( + name, self._name) + self._obj_map[name] = obj + + def register(self, obj=None, name=None): + """ + Register the given object under the the name `obj.__name__`. + Can be used as either a decorator or not. See docstring of this class for usage. + """ + if obj is None: + # used as a decorator + def deco(func_or_class, name=name): + if name is None: + name = func_or_class.__name__ + self._do_register(name, func_or_class) + return func_or_class + + return deco + + # used as a function call + if name is None: + name = obj.__name__ + self._do_register(name, obj) + + def get(self, name): + """Get the registry record. + + Args: + name (str): The class name. + + Returns: + ret: The class. + """ + ret = self._obj_map.get(name) + if ret is None: + raise KeyError( + "No object named '{}' found in '{}' registry!".format( + name, self._name)) + + return ret diff --git a/Bank_second_part/detect_process/paddlevideo/utils/save_load.py b/Bank_second_part/detect_process/paddlevideo/utils/save_load.py new file mode 100644 index 0000000..10bb5f0 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/utils/save_load.py @@ -0,0 +1,289 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import os +import os.path as osp +import time + +import paddle +import paddle.nn.functional as F +from paddlevideo.utils import get_logger, main_only +from tqdm import tqdm +import numpy as np +from scipy import ndimage + + +def pretrain_swin_param_trans(model, state_dicts): + # delete classifier's params + if 'head.fc' + '.weight' in state_dicts: + del state_dicts['head.fc' + '.weight'] + if 'head.fc' + '.bias' in state_dicts: + del state_dicts['head.fc' + '.bias'] + + state_dicts = { + k.replace('backbone.', ''): v + for k, v in state_dicts.items() + } + + if len(state_dicts) == len(model.state_dict()): + print("Load 3D weights") + return state_dicts + + print("Load 2D weights") + relative_position_index_keys = [ + k for k in state_dicts.keys() if "relative_position_index" in k + ] + for k in relative_position_index_keys: + del state_dicts[k] + + # delete attn_mask since we always re-init it + attn_mask_keys = [k for k in state_dicts.keys() if "attn_mask" in k] + for k in attn_mask_keys: + del state_dicts[k] + + state_dicts['patch_embed.proj.weight'] = state_dicts[ + 'patch_embed.proj.weight'].unsqueeze(2).tile( + [1, 1, model.patch_size[0], 1, 1]) / model.patch_size[0] + + # bicubic interpolate relative_position_bias_table if not match + relative_position_bias_table_keys = [ + k for k in state_dicts.keys() if "relative_position_bias_table" in k + ] + total_len = len(relative_position_bias_table_keys) + with tqdm(total=total_len, + position=1, + bar_format='{desc}', + desc="Loading weights") as desc: + for key in tqdm(relative_position_bias_table_keys, + total=total_len, + position=0): + relative_position_bias_table_pretrained = state_dicts[key] + relative_position_bias_table_current = model.state_dict()[key] + L1, nH1 = relative_position_bias_table_pretrained.shape + L2, nH2 = relative_position_bias_table_current.shape + L2 = (2 * model.window_size[1] - 1) * (2 * model.window_size[2] - 1) + wd = model.window_size[0] + if nH1 != nH2: + desc.set_description(f"Error in loading {key}, skip") + else: + if L1 != L2: + S1 = int(L1**0.5) + relative_position_bias_table_pretrained_resized = paddle.nn.functional.interpolate( + relative_position_bias_table_pretrained.transpose( + [1, 0]).reshape([1, nH1, S1, S1]), + size=(2 * model.window_size[1] - 1, + 2 * model.window_size[2] - 1), + mode='bicubic') + relative_position_bias_table_pretrained = relative_position_bias_table_pretrained_resized.reshape( + [nH2, L2]).transpose([1, 0]) + desc.set_description(f"Loading {key}") + state_dicts[key] = relative_position_bias_table_pretrained.tile( + [2 * wd - 1, 1]) + time.sleep(0.01) + ret_str = "loading {:<20d} weights completed.".format( + len(model.state_dict())) + desc.set_description(ret_str) + return state_dicts + + +def pretrain_vit_param_trans(model, state_dicts, num_patches, num_seg, + attention_type): + """ + Convert ViT's pre-trained model parameters to a parameter dictionary that matches the existing model + """ + if 'head' + '.weight' in state_dicts: + del state_dicts['head' + '.weight'] + if 'head' + '.bias' in state_dicts: + del state_dicts['head' + '.bias'] + + total_len = len(model.state_dict()) + if num_patches + 1 != state_dicts['pos_embed'].shape[1]: # when + pos_embed = state_dicts['pos_embed'] + cls_pos_embed = paddle.to_tensor( + pos_embed[0, 0, :]).unsqueeze(0).unsqueeze(1) + other_pos_embed = paddle.to_tensor(pos_embed[0, 1:, :]) + gs_new = int(np.sqrt(num_patches)) + gs_old = int(np.sqrt(other_pos_embed.shape[0])) + zoom = (gs_new / gs_old, gs_new / gs_old, 1) + other_pos_embed = 
paddle.reshape(other_pos_embed, [gs_old, gs_old, -1]) + other_pos_embed = ndimage.zoom(other_pos_embed, zoom, order=1) + other_pos_embed = paddle.to_tensor(other_pos_embed) + new_pos_embed = paddle.reshape(other_pos_embed, [1, num_patches, -1]) + new_pos_embed = paddle.concat((cls_pos_embed, new_pos_embed), axis=1) + state_dicts['pos_embed'] = new_pos_embed + time.sleep(0.01) + + if 'time_embed' in state_dicts and num_seg != state_dicts[ + 'time_embed'].shape[1]: + time_embed = state_dicts['time_embed'].transpose((0, 2, 1)).unsqueeze(0) + new_time_embed = F.interpolate(time_embed, + size=(time_embed.shape[-2], num_seg), + mode='nearest') + state_dicts['time_embed'] = new_time_embed.squeeze(0).transpose( + (0, 2, 1)) + time.sleep(0.01) + with tqdm(total=total_len, + position=1, + bar_format='{desc}', + desc="Loading weights") as desc: + if attention_type == 'divided_space_time': + new_state_dicts = state_dicts.copy() + for key in tqdm(state_dicts): + if 'blocks' in key and 'attn' in key: + desc.set_description("Loading %s" % key) + new_key = key.replace('attn', 'temporal_attn') + if not new_key in state_dicts: + new_state_dicts[new_key] = state_dicts[key] + else: + new_state_dicts[new_key] = state_dicts[new_key] + if 'blocks' in key and 'norm1' in key: + desc.set_description("Loading %s" % key) + new_key = key.replace('norm1', 'temporal_norm1') + if not new_key in state_dicts: + new_state_dicts[new_key] = state_dicts[key] + else: + new_state_dicts[new_key] = state_dicts[new_key] + time.sleep(0.01) + elif attention_type == 'space_only': # tokenshift raw vit + new_state_dicts = state_dicts.copy() + + ret_str = "loading {:<20d} weights completed.".format( + len(model.state_dict())) + desc.set_description(ret_str) + return new_state_dicts + + +def pretrain_resnet18_param_trans(model, loaded_dict): + encoder_dict = model.encoder.state_dict() + pose_encoder_dict = model.pose_encoder.state_dict() + + names = ['encoder.', 'encoder_day.', 'encoder_night.'] + for name in names: + total_len = len(loaded_dict.items()) + with tqdm(total=total_len, + position=1, + bar_format='{desc}', + desc="Loading weights") as desc: + for key, value in tqdm(loaded_dict.items(), + total=total_len, + position=0): + key = str(name + key) + if key in encoder_dict: + encoder_dict[key] = value + desc.set_description('Loading %s' % key) + time.sleep(0.01) + + num_input_images = 2 + loaded_dict['conv1.weight'] = paddle.concat( + [loaded_dict['conv1.weight']] * num_input_images, 1) / num_input_images + total_len = len(loaded_dict.items()) + with tqdm(total=total_len, + position=1, + bar_format='{desc}', + desc="Loading weights") as desc: + for name, value in tqdm(loaded_dict.items(), + total=total_len, + position=0): + name = str('encoder.' + name) + if name in pose_encoder_dict: + pose_encoder_dict[name] = value + desc.set_description('Loading %s' % key) + time.sleep(0.01) + ret_str = "loading {:<20d} weights completed.".format( + len(model.state_dict())) + desc.set_description(ret_str) + return encoder_dict, pose_encoder_dict + + +#XXX(shipping): maybe need load N times because of different cards have different params. +@main_only +def load_ckpt(model, weight_path, **kargs): + """ + 1. Load pre-trained model parameters + 2. Extract and convert from the pre-trained model to the parameters + required by the existing model + 3. 
Load the converted parameters of the existing model + """ + #model.set_state_dict(state_dict) + + if not osp.isfile(weight_path): + raise IOError(f'{weight_path} is not a checkpoint file') + #state_dicts = load(weight_path) + + logger = get_logger("paddlevideo") + state_dicts = paddle.load(weight_path) + if 'ResnetEncoder' in str(model): + encoder_dict, pose_encoder_dict = pretrain_resnet18_param_trans( + model, state_dicts) + model.encoder.load_dict(encoder_dict) + model.pose_encoder.load_dict(pose_encoder_dict) + tmp = model.state_dict() + elif "VisionTransformer" in str(model): # For TimeSformer case + tmp = pretrain_vit_param_trans(model, state_dicts, kargs['num_patches'], + kargs['num_seg'], + kargs['attention_type']) + elif 'SwinTransformer3D' in str(model): + tmp = pretrain_swin_param_trans(model, state_dicts) + else: + tmp = {} + total_len = len(model.state_dict()) + with tqdm(total=total_len, + position=1, + bar_format='{desc}', + desc="Loading weights") as desc: + for item in tqdm(model.state_dict(), total=total_len, position=0): + name = item + desc.set_description('Loading %s' % name) + if name not in state_dicts: # Convert from non-parallel model + if str('backbone.' + name) in state_dicts: + tmp[name] = state_dicts['backbone.' + name] + else: # Convert from parallel model + tmp[name] = state_dicts[name] + time.sleep(0.01) + ret_str = "loading {:<20d} weights completed.".format( + len(model.state_dict())) + desc.set_description(ret_str) + model.set_state_dict(tmp) + + +def mkdir(dir): + if not os.path.exists(dir): + # avoid error when train with multiple gpus + try: + os.makedirs(dir) + except: + pass + + +def _extract_student_weights(all_params, student_prefix="Student."): + s_params = { + key[len(student_prefix):]: all_params[key] + for key in all_params if student_prefix in key + } + return s_params + + +@main_only +def save(obj, path, save_student_model=False): + if save_student_model: + s_params = _extract_student_weights(obj) + student_path = path.replace(".pdparams", "_student.pdparams") + if len(s_params) > 0: + paddle.save(s_params, student_path) + paddle.save(obj, path) + + +def load(file_name): + if not osp.isfile(file_name): + raise IOError(f'{file_name} not exist') + return paddle.load(file_name) diff --git a/Bank_second_part/detect_process/paddlevideo/version.py b/Bank_second_part/detect_process/paddlevideo/version.py new file mode 100644 index 0000000..b5b7f48 --- /dev/null +++ b/Bank_second_part/detect_process/paddlevideo/version.py @@ -0,0 +1,16 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +__all__ = ["paddlevideo_version"] +paddlevideo_version = "0.0.1" diff --git a/Bank_second_part/detect_process/personDet.py b/Bank_second_part/detect_process/personDet.py new file mode 100644 index 0000000..0cef736 --- /dev/null +++ b/Bank_second_part/detect_process/personDet.py @@ -0,0 +1,42 @@ + +def analysis_yolov8(frame, model_coco,confidence_set): + + # 第一步:用COCO数据集推理 + results_coco = model_coco(frame) + + re_list = [] + + if results_coco: + + for r in results_coco: + + boxes = r.boxes + + idx = 0 + + for box in boxes: + + idx += 1 + b = box.xyxy[0] # get box coordinates in (top, left, bottom, right) format + c = box.cls + + # 保存标签和坐标值作为返回结果 + blist = b.tolist() + labels_name = model_coco.names[int(c)] + + confidence = float(box.conf) + + confidence = round(confidence, 2) + + # 过滤置信度以下目标 + if confidence < confidence_set: + + continue + + if labels_name == 'person': + # 一个结果字典 + re_dict = {labels_name:blist} + + re_list.append(re_dict) + + return re_list diff --git a/Bank_second_part/detect_process/tools.py b/Bank_second_part/detect_process/tools.py deleted file mode 100644 index 8d9af60..0000000 --- a/Bank_second_part/detect_process/tools.py +++ /dev/null @@ -1,212 +0,0 @@ -import cv2 -import os - -class Process_tools(): - - # 图像文件夹 - def get_video_list(path): - video_ext = [".mp4", ".avi",".MP4"] - video_names = [] - for maindir, subdir, file_name_list in os.walk(path): - for filename in file_name_list: - apath = os.path.join(maindir, filename) - ext = os.path.splitext(apath)[1] - if ext in video_ext: - video_names.append(apath) - return video_names - - - # 截取裁剪需要的视频帧 - def save_seg_video(video_name,frameToStart,frametoStop,videoWriter,bbox): - - cap = cv2.VideoCapture(video_name) - count = 0 - while True: - - success, frame = cap.read() - - if success: - - count += 1 - if count <= frametoStop and count > frameToStart: # 选取起始帧 - print('correct= ', count) - - #裁剪视频画面 - frame_target = frame[int(bbox[1]):int(bbox[3]), int(bbox[0]):int(bbox[2])] # (split_height, split_width) - - videoWriter.write(frame_target) - - if not success or count >= frametoStop: - break - - print('end') - - - # 获得字典中所有values值(这个值是列表) - def get_dict_values(lst): - """ - 获取列表中所有字典的 values 值(如果值是列表) - - 参数: - lst: 包含字典的列表 - - 返回值: - values: 包含所有字典的 values 值的列表(如果值是列表) - """ - return [value for dictionary in lst for value in dictionary.values() if isinstance(value, list)] - - - - # 解析检测后的结果,为检测后的结果排序 - def analysis_sort_list(result_dict): - - # print('result_dict:',result_dict) - - # 获得检测列表 - re_list = result_dict['start_bbox'] - # print('re_list:',re_list) - - # 获得列表中所有字典的values值 - re_bbox_list = Process_tools.get_dict_values(re_list) - - # 为检测出来的标注框排序 - sorted_lst = sorted(re_bbox_list, key=lambda x: x[0]) - - return sorted_lst - - - #对比重叠率高的两个部分,并结合标注框,保存最大的标注框 - def contrast_bbox(e_bbox,r_bbox): - - e_bbox_min = e_bbox[:2] - r_bbox_min = r_bbox[:2] - - bbox_min = [min(x, y) for x, y in zip(e_bbox_min, r_bbox_min)] - - e_bbox_max = e_bbox[-2:] - r_bbox_max = r_bbox[-2:] - - bbox_max = [max(x, y) for x, y in zip(e_bbox_max, r_bbox_max)] - - bbox = bbox_min + bbox_max - - return bbox - - - - # 解析result_list列表 - def analysis_re01_list(example_dict,result_dict): - - # 第一次检测到目标的帧率和信息 - example_dict_fps = list(example_dict.keys())[0] - example_sorted_lst = Process_tools.analysis_sort_list(example_dict) - - # 当前帧检测结果中所有的检测结果数值 - re_dict_fps = list(result_dict.keys())[0] - re_dict_sorted_lst = Process_tools.analysis_sort_list(result_dict) - - # 保存前后帧率连续的范围、筛选出相同的部分 - cut_list = [] - example_temp = [] - re_temp = [] - - 
for i,ex_bbox in enumerate(example_sorted_lst): - - for j,re_bbox in enumerate(re_dict_sorted_lst): - - iou = Process_tools.calculate_iou(box1=ex_bbox, box2=re_bbox) - - # print(iou) - - if iou > 0: - - bbox = Process_tools.contrast_bbox(e_bbox=ex_bbox,r_bbox=re_bbox) - - cut_list.append({i:bbox}) - example_temp.append(ex_bbox) - re_temp.append(re_bbox) - - break - - else: - continue - - example_sorted_lst = [item for item in example_sorted_lst if item not in example_temp] - re_dict_sorted_lst = [item for item in re_dict_sorted_lst if item not in re_temp] - - return cut_list,example_sorted_lst,re_dict_sorted_lst - - - # 计算前后帧率重叠范围 - def calculate_iou(box1, box2): - """ - 计算两个边界框之间的IoU值 - - 参数: - box1: 边界框1的坐标(x1, y1, x2, y2) - box2: 边界框2的坐标(x1, y1, x2, y2) - - 返回值: - iou: 两个边界框之间的IoU值 - """ - x1 = max(box1[0], box2[0]) - y1 = max(box1[1], box2[1]) - x2 = min(box1[2], box2[2]) - y2 = min(box1[3], box2[3]) - - # 计算交集区域面积 - intersection_area = max(0, x2 - x1 + 1) * max(0, y2 - y1 + 1) - - # 计算边界框1和边界框2的面积 - box1_area = (box1[2] - box1[0] + 1) * (box1[3] - box1[1] + 1) - box2_area = (box2[2] - box2[0] + 1) * (box2[3] - box2[1] + 1) - - # 计算并集区域面积 - union_area = box1_area + box2_area - intersection_area - - # 计算IoU值 - iou = intersection_area / union_area - - return iou - - def para_correction(images_size,bbox,dertpara): - - ''' - 修正检测后标注框过小的情况,如果有修正参数则使用修正参数,如果没有就按照坐标值扩大两倍 - - ''' - - if dertpara: - pass - else: - w = (bbox[2] - bbox[0]) /2 - h = (bbox[3] - bbox[1]) /2 - - bbox_extand_list_x = [bbox[0] - w,bbox[2] + w] - bbox_extand_list_y = [bbox[1] - h,bbox[3] + h] - - bbox_list_x = Process_tools.contrast(size=images_size[0],bbox_extand_list=bbox_extand_list_x) - bbox_list_y = Process_tools.contrast(size=images_size[1],bbox_extand_list=bbox_extand_list_y) - - bbox_list = bbox_list_x + bbox_list_y - - return bbox_list - - - def contrast(size,bbox_extand_list): - - ''' - 对比数值是否在这个范围内 - ''' - - bbox_list = [] - - for x in bbox_extand_list: - - if x in range(size): - bbox_list.append(x) - if x > size: - bbox_list.append(size) - if x < 0: - bbox_list.append(0) - return bbox_list \ No newline at end of file diff --git a/Bank_second_part/detect_process/tools/__init__.py b/Bank_second_part/detect_process/tools/__init__.py new file mode 100644 index 0000000..e8d173d --- /dev/null +++ b/Bank_second_part/detect_process/tools/__init__.py @@ -0,0 +1,19 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__all__ = ['utils', 'PaddleVideo', 'ava_predict'] + +from . import utils +from .wheel import PaddleVideo +from . 
import ava_predict
diff --git a/Bank_second_part/detect_process/tools/__pycache__/__init__.cpython-310.pyc b/Bank_second_part/detect_process/tools/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000..3988193
Binary files /dev/null and b/Bank_second_part/detect_process/tools/__pycache__/__init__.cpython-310.pyc differ
diff --git a/Bank_second_part/detect_process/tools/__pycache__/ava_predict.cpython-310.pyc b/Bank_second_part/detect_process/tools/__pycache__/ava_predict.cpython-310.pyc
new file mode 100644
index 0000000..c133bea
Binary files /dev/null and b/Bank_second_part/detect_process/tools/__pycache__/ava_predict.cpython-310.pyc differ
diff --git a/Bank_second_part/detect_process/tools/__pycache__/utils.cpython-310.pyc b/Bank_second_part/detect_process/tools/__pycache__/utils.cpython-310.pyc
new file mode 100644
index 0000000..b17058f
Binary files /dev/null and b/Bank_second_part/detect_process/tools/__pycache__/utils.cpython-310.pyc differ
diff --git a/Bank_second_part/detect_process/tools/__pycache__/wheel.cpython-310.pyc b/Bank_second_part/detect_process/tools/__pycache__/wheel.cpython-310.pyc
new file mode 100644
index 0000000..55c3bf9
Binary files /dev/null and b/Bank_second_part/detect_process/tools/__pycache__/wheel.cpython-310.pyc differ
diff --git a/Bank_second_part/detect_process/tools/ava_predict.py b/Bank_second_part/detect_process/tools/ava_predict.py
new file mode 100644
index 0000000..5d333a2
--- /dev/null
+++ b/Bank_second_part/detect_process/tools/ava_predict.py
@@ -0,0 +1,509 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import paddle
+import os, sys
+import copy as cp
+import cv2
+import math
+try:
+    import ppdet
+except ImportError as e:
+    print(
+        f"Warning! {e}, [paddledet] package and its dependencies are required for AVA."
+ ) + +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(os.path.abspath(os.path.join(__dir__, '../'))) + +from paddlevideo.modeling.builder import build_model +from paddlevideo.utils import get_config +from paddlevideo.loader.builder import build_dataloader, build_dataset, build_pipeline +from paddlevideo.metrics.ava_utils import read_labelmap + +import time +from os import path as osp +import numpy as np +from paddlevideo.utils import get_config +import pickle + +from paddlevideo.utils import (get_logger, load, mkdir, save) +import shutil + +FONTFACE = cv2.FONT_HERSHEY_DUPLEX +FONTSCALE = 0.5 +FONTCOLOR = (255, 255, 255) # BGR, white +MSGCOLOR = (128, 128, 128) # BGR, gray +THICKNESS = 1 +LINETYPE = 1 + + +def hex2color(h): + """Convert the 6-digit hex string to tuple of 3 int value (RGB)""" + return (int(h[:2], 16), int(h[2:4], 16), int(h[4:], 16)) + + +plate_blue = '03045e-023e8a-0077b6-0096c7-00b4d8-48cae4' +plate_blue = plate_blue.split('-') +plate_blue = [hex2color(h) for h in plate_blue] +plate_green = '004b23-006400-007200-008000-38b000-70e000' +plate_green = plate_green.split('-') +plate_green = [hex2color(h) for h in plate_green] + + +def abbrev(name): + """Get the abbreviation of label name: + 'take (an object) from (a person)' -> 'take ... from ...' + """ + while name.find('(') != -1: + st, ed = name.find('('), name.find(')') + name = name[:st] + '...' + name[ed + 1:] + return name + + +# annotations is pred results +def visualize(frames, annotations, plate=plate_blue, max_num=5): + """Visualize frames with predicted annotations. + Args: + frames (list[np.ndarray]): Frames for visualization, note that + len(frames) % len(annotations) should be 0. + annotations (list[list[tuple]]): The predicted results. + plate (str): The plate used for visualization. Default: plate_blue. + max_num (int): Max number of labels to visualize for a person box. + Default: 5,目前不能大于5. + Returns: + list[np.ndarray]: Visualized frames. + """ + + assert max_num + 1 <= len(plate) + plate = [x[::-1] for x in plate] + frames_ = cp.deepcopy(frames) + nf, na = len(frames), len(annotations) + assert nf % na == 0 + nfpa = len(frames) // len(annotations) + anno = None + h, w, _ = frames[0].shape + # proposals被归一化需要还原真实坐标值 + scale_ratio = np.array([w, h, w, h]) + + for i in range(na): + anno = annotations[i] + if anno is None: + continue + for j in range(nfpa): + ind = i * nfpa + j + frame = frames_[ind] + for ann in anno: + box = ann[0] + label = ann[1] + if not len(label): + continue + score = ann[2] + box = (box * scale_ratio).astype(np.int64) + st, ed = tuple(box[:2]), tuple(box[2:]) + cv2.rectangle(frame, st, ed, plate[0], 2) + for k, lb in enumerate(label): + if k >= max_num: + break + text = abbrev(lb) + text = ': '.join([text, str(score[k])]) + location = (0 + st[0], 18 + k * 18 + st[1]) + textsize = cv2.getTextSize(text, FONTFACE, FONTSCALE, + THICKNESS)[0] + textwidth = textsize[0] + diag0 = (location[0] + textwidth, location[1] - 14) + diag1 = (location[0], location[1] + 2) + cv2.rectangle(frame, diag0, diag1, plate[k + 1], -1) + cv2.putText(frame, text, location, FONTFACE, FONTSCALE, + FONTCOLOR, THICKNESS, LINETYPE) + + return frames_ + + +def frame_extraction(video_path, target_dir): + """Extract frames given video_path. + Args: + video_path (str): The video_path. 
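+        target_dir (str): Directory where the extracted frames are written.
+    Returns:
+        tuple: (frame_paths, frames, FPS) of the extracted video frames.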
+ """ + + if not os.path.exists(target_dir): + os.makedirs(target_dir, exist_ok=True) + + # Should be able to handle videos up to several hours + frame_tmpl = osp.join(target_dir, '{:05d}.jpg') + vid = cv2.VideoCapture(video_path) + + FPS = int(vid.get(5)) + + frames = [] + frame_paths = [] + + flag, frame = vid.read() + index = 1 + while flag: + frames.append(frame) + frame_path = frame_tmpl.format(index) + frame_paths.append(frame_path) + cv2.imwrite(frame_path, frame) + index += 1 + flag, frame = vid.read() + return frame_paths, frames, FPS + + +def parse_args(): + def str2bool(v): + return v.lower() in ("true", "t", "1") + + # general params + parser = argparse.ArgumentParser("PaddleVideo Inference model script") + parser.add_argument('-c', + '--config', + type=str, + default='configs/example.yaml', + help='config file path') + + parser.add_argument('--video_path', help='video file/url') + + parser.add_argument('-o', + '--override', + action='append', + default=[], + help='config options to be overridden') + parser.add_argument('-w', + '--weights', + type=str, + help='weights for finetuning or testing') + + #detection_model_name + parser.add_argument('--detection_model_name', + help='the name of detection model ') + # detection_model_weights + parser.add_argument('--detection_model_weights', + help='the weights path of detection model ') + + # params for predict + parser.add_argument('--out-filename', + default='ava_det_demo.mp4', + help='output filename') + parser.add_argument('--predict-stepsize', + default=8, + type=int, + help='give out a prediction per n frames') + parser.add_argument( + '--output-stepsize', + default=4, + type=int, + help=('show one frame per n frames in the demo, we should have: ' + 'predict_stepsize % output_stepsize == 0')) + parser.add_argument('--output-fps', + default=6, + type=int, + help='the fps of demo video output') + + return parser.parse_args() + + +# 一帧的结果。根据概率大小进行排序 +def pack_result(human_detection, result): + """Short summary. + Args: + human_detection (np.ndarray): Human detection result. + result (type): The predicted label of each human proposal. + Returns: + tuple: Tuple of human proposal, label name and label score. + """ + results = [] + if result is None: + return None + + for prop, res in zip(human_detection, result): + res.sort(key=lambda x: -x[1]) + + results.append((prop, [x[0] for x in res], [x[1] for x in res])) + + return results + + +# 构造数据处理需要的results +def get_timestep_result(frame_dir, timestamp, clip_len, frame_interval, FPS): + result = {} + + result["frame_dir"] = frame_dir + + frame_num = len(os.listdir(frame_dir)) + + dir_name = frame_dir.split("/")[-1] + result["video_id"] = dir_name + + result['timestamp'] = timestamp + + timestamp_str = '{:04d}'.format(timestamp) + img_key = dir_name + "," + timestamp_str + result['img_key'] = img_key + + result['shot_info'] = (1, frame_num) + result['fps'] = FPS + + result['suffix'] = '{:05}.jpg' + + result['timestamp_start'] = 1 + result['timestamp_end'] = int(frame_num / result['fps']) + + return result + + +def detection_inference(frame_paths, output_dir, model_name, weights_path): + """Detect human boxes given frame paths. + Args: + frame_paths (list[str]): The paths of frames to do detection inference. + Returns: + list[np.ndarray]: The human detection results. 
+ """ + + detection_cfg = ppdet.model_zoo.get_config_file(model_name) + detection_cfg = ppdet.core.workspace.load_config(detection_cfg) + detection_trainer = ppdet.engine.Trainer(detection_cfg, mode='test') + detection_trainer.load_weights(weights_path) + + print('Performing Human Detection for each frame') + + detection_trainer.predict(frame_paths, output_dir=output_dir, save_txt=True) + + print("finish object detection") + + results = [] + + for frame_path in frame_paths: + (file_dir, file_name) = os.path.split(frame_path) + (file_path, ext) = os.path.splitext(frame_path) + + txt_file_name = file_name.replace(ext, ".txt") + txt_path = os.path.join(output_dir, txt_file_name) + results.append(txt_path) + + return results + + +def get_detection_result(txt_file_path, img_h, img_w, person_det_score_thr): + """ + 根据检测结果文件得到图像中人的检测框(proposals)和置信度(scores) + txt_file_path:检测结果存放路径 + img_h:图像高度 + img_w:图像宽度 + """ + + proposals = [] + scores = [] + + with open(txt_file_path, 'r') as detection_file: + lines = detection_file.readlines() + for line in lines: # person 0.9842637181282043 0.0 469.1407470703125 944.7770385742188 831.806396484375 + items = line.split(" ") + if items[0] != 'person': #只要人 + continue + + score = items[1] + + if (float)(score) < person_det_score_thr: + continue + + x1 = (float(items[2])) / img_w + y1 = ((float)(items[3])) / img_h + box_w = ((float)(items[4])) + box_h = ((float)(items[5])) + + x2 = (float(items[2]) + box_w) / img_w + y2 = (float(items[3]) + box_h) / img_h + + scores.append(score) + + proposals.append([x1, y1, x2, y2]) + + return np.array(proposals), np.array(scores) + + +@paddle.no_grad() +def main(args): + config = get_config(args.config, show=False) #parse config file + + # extract frames from video + video_path = args.video_path + frame_dir = 'tmp_frames' + frame_paths, frames, FPS = frame_extraction(video_path, frame_dir) + + num_frame = len(frame_paths) #视频秒数*FPS + assert num_frame != 0 + print("Frame Number:", num_frame) + + # 帧图像高度和宽度 + h, w, _ = frames[0].shape + + # Get clip_len, frame_interval and calculate center index of each clip + data_process_pipeline = build_pipeline(config.PIPELINE.test) #测试时输出处理流水配置 + + clip_len = config.PIPELINE.test.sample['clip_len'] + assert clip_len % 2 == 0, 'We would like to have an even clip_len' + frame_interval = config.PIPELINE.test.sample['frame_interval'] + + # 此处关键帧每秒取一个 + clip_len = config.PIPELINE.test.sample['clip_len'] + assert clip_len % 2 == 0, 'We would like to have an even clip_len' + frame_interval = config.PIPELINE.test.sample['frame_interval'] + window_size = clip_len * frame_interval + timestamps = np.arange(window_size // 2, (num_frame + 1 - window_size // 2), + args.predict_stepsize) + print("timetamps number:", len(timestamps)) + + # get selected frame list according to timestamps + selected_frame_list = [] + for timestamp in timestamps: + selected_frame_list.append(frame_paths[timestamp - 1]) + + # Load label_map + label_map_path = config.DATASET.test['label_file'] + categories, class_whitelist = read_labelmap(open(label_map_path)) + label_map = {} + for item in categories: + id = item['id'] + name = item['name'] + label_map[id] = name + + # Construct model. 
+ if config.MODEL.backbone.get('pretrained'): + config.MODEL.backbone.pretrained = '' # disable pretrain model init + model = build_model(config.MODEL) + + model.eval() + state_dicts = load(args.weights) + model.set_state_dict(state_dicts) + + detection_result_dir = 'tmp_detection' + detection_model_name = args.detection_model_name + detection_model_weights = args.detection_model_weights + detection_txt_list = detection_inference(selected_frame_list, + detection_result_dir, + detection_model_name, + detection_model_weights) + assert len(detection_txt_list) == len(timestamps) + + print('Performing SpatioTemporal Action Detection for each clip') + human_detections = [] + predictions = [] + + index = 0 + for timestamp, detection_txt_path in zip(timestamps, detection_txt_list): + proposals, scores = get_detection_result( + detection_txt_path, h, w, + (float)(config.DATASET.test['person_det_score_thr'])) + if proposals.shape[0] == 0: + predictions.append(None) + human_detections.append(None) + continue + + human_detections.append(proposals) + + result = get_timestep_result(frame_dir, + timestamp, + clip_len, + frame_interval, + FPS=FPS) + result["proposals"] = proposals + result["scores"] = scores + + new_result = data_process_pipeline(result) + proposals = new_result['proposals'] + + img_slow = new_result['imgs'][0] + img_slow = img_slow[np.newaxis, :] + img_fast = new_result['imgs'][1] + img_fast = img_fast[np.newaxis, :] + + proposals = proposals[np.newaxis, :] + + scores = scores[np.newaxis, :] + + img_shape = np.asarray(new_result['img_shape']) + img_shape = img_shape[np.newaxis, :] + + data = [ + paddle.to_tensor(img_slow, dtype='float32'), + paddle.to_tensor(img_fast, dtype='float32'), + paddle.to_tensor(proposals, dtype='float32'), scores, + paddle.to_tensor(img_shape, dtype='int32') + ] + + with paddle.no_grad(): + result = model(data, mode='infer') + + result = result[0] + prediction = [] + + person_num = proposals.shape[1] + # N proposals + for i in range(person_num): + prediction.append([]) + + # Perform action score thr + for i in range(len(result)): + if i + 1 not in class_whitelist: + continue + for j in range(person_num): + if result[i][j, 4] > config.MODEL.head['action_thr']: + prediction[j].append((label_map[i + 1], result[i][j, + 4])) + predictions.append(prediction) + + index = index + 1 + if index % 10 == 0: + print(index, "/", len(timestamps)) + + results = [] + for human_detection, prediction in zip(human_detections, predictions): + results.append(pack_result(human_detection, prediction)) + + def dense_timestamps(timestamps, n): + """Make it nx frames.""" + old_frame_interval = (timestamps[1] - timestamps[0]) + start = timestamps[0] - old_frame_interval / n * (n - 1) / 2 + new_frame_inds = np.arange( + len(timestamps) * n) * old_frame_interval / n + start + return new_frame_inds.astype(np.int) + + dense_n = int(args.predict_stepsize / args.output_stepsize) #30 + frames = [ + cv2.imread(frame_paths[i - 1]) + for i in dense_timestamps(timestamps, dense_n) + ] + + vis_frames = visualize(frames, results) + + try: + import moviepy.editor as mpy + except ImportError: + raise ImportError('Please install moviepy to enable output file') + + vid = mpy.ImageSequenceClip([x[:, :, ::-1] for x in vis_frames], + fps=args.output_fps) + vid.write_videofile(args.out_filename) + print("finish write !") + + # delete tmp files and dirs + shutil.rmtree(frame_dir) + shutil.rmtree(detection_result_dir) + + +if __name__ == '__main__': + args = parse_args() #解析参数 + main(args) diff --git 
a/Bank_second_part/detect_process/tools/export_model.py b/Bank_second_part/detect_process/tools/export_model.py new file mode 100644 index 0000000..401091a --- /dev/null +++ b/Bank_second_part/detect_process/tools/export_model.py @@ -0,0 +1,267 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os +import os.path as osp +import sys + +import paddle +from paddle.jit import to_static +from paddle.static import InputSpec + +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(os.path.abspath(os.path.join(__dir__, '../'))) + +from paddlevideo.modeling.builder import build_model +from paddlevideo.utils import get_config + + +def parse_args(): + parser = argparse.ArgumentParser("PaddleVideo export model script") + parser.add_argument('-c', + '--config', + type=str, + default='configs/example.yaml', + help='config file path') + parser.add_argument('--override', + action='append', + default=[], + help='config options to be overridden') + parser.add_argument("-p", + "--pretrained_params", + default='./best.pdparams', + type=str, + help='params path') + parser.add_argument("-o", + "--output_path", + type=str, + default="./inference", + help='output path') + + parser.add_argument('--save_name', + type=str, + default=None, + help='specify the exported inference \ + files(pdiparams and pdmodel) name,\ + only used in TIPC') + + return parser.parse_args() + + +def trim_config(cfg): + """ + Reuse the trainging config will bring useless attributes, such as: backbone.pretrained model. + and some build phase attributes should be overrided, such as: backbone.num_seg. + Trim it here. 
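+    Returns the trimmed cfg together with the model_name read from it.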
+ """ + model_name = cfg.model_name + if cfg.MODEL.get('backbone') and cfg.MODEL.backbone.get('pretrained'): + cfg.MODEL.backbone.pretrained = "" # not ued when inference + + # for distillation + if cfg.MODEL.get('models'): + if cfg.MODEL.models[0]['Teacher']['backbone'].get('pretrained'): + cfg.MODEL.models[0]['Teacher']['backbone']['pretrained'] = "" + if cfg.MODEL.models[1]['Student']['backbone'].get('pretrained'): + cfg.MODEL.models[1]['Student']['backbone']['pretrained'] = "" + + return cfg, model_name + + +def get_input_spec(cfg, model_name): + if model_name in ['ppTSM', 'TSM', 'MoViNet', 'ppTSMv2']: + input_spec = [[ + InputSpec( + shape=[None, cfg.num_seg, 3, cfg.target_size, cfg.target_size], + dtype='float32'), + ]] + elif model_name in ['TokenShiftVisionTransformer']: + input_spec = [[ + InputSpec(shape=[ + None, 3, cfg.num_seg * 3, cfg.target_size, cfg.target_size + ], + dtype='float32'), + ]] + elif model_name in ['TSN', 'ppTSN']: + input_spec = [[ + InputSpec(shape=[ + None, cfg.num_seg * 10, 3, cfg.target_size, cfg.target_size + ], + dtype='float32'), + ]] + elif model_name in ['BMN']: + input_spec = [[ + InputSpec(shape=[None, cfg.feat_dim, cfg.tscale], + dtype='float32', + name='feat_input'), + ]] + elif model_name in ['TimeSformer', 'ppTimeSformer']: + input_spec = [[ + InputSpec(shape=[ + None, 3, cfg.num_seg * 3, cfg.target_size, cfg.target_size + ], + dtype='float32'), + ]] + elif model_name in ['VideoSwin']: + input_spec = [[ + InputSpec(shape=[ + None, 3, cfg.num_seg * cfg.seg_len * 1, cfg.target_size, + cfg.target_size + ], + dtype='float32'), + ]] + elif model_name in ['VideoSwin_TableTennis']: + input_spec = [[ + InputSpec(shape=[ + None, 3, cfg.num_seg * cfg.seg_len * 3, cfg.target_size, + cfg.target_size + ], + dtype='float32'), + ]] + elif model_name in ['AttentionLSTM']: + input_spec = [[ + InputSpec(shape=[None, cfg.embedding_size, cfg.feature_dims[0]], + dtype='float32'), # for rgb_data + InputSpec(shape=[ + None, + ], dtype='int64'), # for rgb_len + InputSpec(shape=[None, cfg.embedding_size, cfg.feature_dims[0]], + dtype='float32'), # for rgb_mask + InputSpec(shape=[None, cfg.embedding_size, cfg.feature_dims[1]], + dtype='float32'), # for audio_data + InputSpec(shape=[ + None, + ], dtype='int64'), # for audio_len + InputSpec(shape=[None, cfg.embedding_size, cfg.feature_dims[1]], + dtype='float32'), # for audio_mask + ]] + elif model_name in ['SlowFast']: + input_spec = [[ + InputSpec(shape=[ + None, 3, cfg.num_frames // cfg.alpha, cfg.target_size, + cfg.target_size + ], + dtype='float32', + name='slow_input'), + InputSpec(shape=[ + None, 3, cfg.num_frames, cfg.target_size, cfg.target_size + ], + dtype='float32', + name='fast_input'), + ]] + elif model_name in ['STGCN', 'AGCN', 'CTRGCN']: + input_spec = [[ + InputSpec(shape=[ + None, cfg.num_channels, cfg.window_size, cfg.vertex_nums, + cfg.person_nums + ], + dtype='float32'), + ]] + # 由于在模型运行过程中涉及到第一维乘human个数(N*M), 所以这里用1作为shape + elif model_name in ['AGCN2s']: + input_spec = [[ + InputSpec(shape=[ + 1, cfg.num_channels, cfg.window_size, cfg.vertex_nums, + cfg.person_nums + ], + dtype='float32'), + ]] + elif model_name in ['TransNetV2']: + input_spec = [[ + InputSpec(shape=[ + None, + cfg.num_frames, + cfg.height, + cfg.width, + cfg.num_channels, + ], + dtype='float32'), + ]] + elif model_name in ['MSTCN', 'ASRF']: + input_spec = [[ + InputSpec(shape=[None, cfg.num_channels, None], dtype='float32'), + ]] + elif model_name in ['ADDS']: + input_spec = [[ + InputSpec(shape=[None, cfg.num_channels, cfg.height, 
cfg.width], + dtype='float32'), + ]] + elif model_name in ['AVA_SlowFast_FastRcnn']: + input_spec = [[ + InputSpec(shape=[ + None, 3, cfg.num_frames // cfg.alpha, cfg.target_size, + cfg.target_size + ], + dtype='float32', + name='slow_input'), + InputSpec(shape=[ + None, 3, cfg.num_frames, cfg.target_size, cfg.target_size + ], + dtype='float32', + name='fast_input'), + InputSpec(shape=[None, None, 4], dtype='float32', name='proposals'), + InputSpec(shape=[None, 2], dtype='float32', name='img_shape') + ]] + elif model_name in ['PoseC3D']: + input_spec = [[ + InputSpec(shape=[None, 1, 17, 48, 56, 56], dtype='float32'), + ]] + elif model_name in ['YOWO']: + input_spec = [[ + InputSpec(shape=[ + 1, 3, cfg.num_seg, cfg.target_size, cfg.target_size + ], + dtype='float32'), + ]] + return input_spec + + +def main(): + args = parse_args() + cfg, model_name = trim_config( + get_config(args.config, overrides=args.override, show=False)) + + print(f"Building model({model_name})...") + model = build_model(cfg.MODEL) + assert osp.isfile( + args.pretrained_params + ), f"pretrained params ({args.pretrained_params} is not a file path.)" + + if not os.path.isdir(args.output_path): + os.makedirs(args.output_path) + + print(f"Loading params from ({args.pretrained_params})...") + params = paddle.load(args.pretrained_params) + model.set_dict(params) + + model.eval() + + # for rep nets + for layer in model.sublayers(): + if hasattr(layer, "rep") and not getattr(layer, "is_repped"): + layer.rep() + + input_spec = get_input_spec(cfg.INFERENCE, model_name) + model = to_static(model, input_spec=input_spec) + paddle.jit.save( + model, + osp.join(args.output_path, + model_name if args.save_name is None else args.save_name)) + print( + f"model ({model_name}) has been already saved in ({args.output_path}).") + + +if __name__ == "__main__": + main() diff --git a/Bank_second_part/detect_process/tools/predict.py b/Bank_second_part/detect_process/tools/predict.py new file mode 100644 index 0000000..bc9bd8c --- /dev/null +++ b/Bank_second_part/detect_process/tools/predict.py @@ -0,0 +1,327 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
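+
+# Example usage (paths are illustrative, not shipped with this repo):
+#   python tools/predict.py -c configs/example.yaml -i data/example.avi \
+#       --model_file inference/ppTSM.pdmodel --params_file inference/ppTSM.pdiparams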
+ +import argparse +import os +from os import path as osp +import paddle +from paddle import inference +from paddle.inference import Config, create_predictor + +from utils import build_inference_helper +from paddlevideo.utils import get_config + + +def parse_args(): + def str2bool(v): + return v.lower() in ("true", "t", "1") + + # general params + parser = argparse.ArgumentParser("PaddleVideo Inference model script") + parser.add_argument( + '-c', + '--config', + type=str, + default='configs/example.yaml', + help='config file path') + parser.add_argument( + '-o', + '--override', + action='append', + default=[], + help='config options to be overridden') + parser.add_argument("-i", "--input_file", type=str, help="input file path") + parser.add_argument( + "--time_test_file", + type=str2bool, + default=False, + help="whether input time test file") + parser.add_argument("--model_file", type=str) + parser.add_argument("--params_file", type=str) + + # params for paddle predict + parser.add_argument("-b", "--batch_size", type=int, default=1) + parser.add_argument("--use_gpu", type=str2bool, default=True) + parser.add_argument("--use_xpu", type=str2bool, default=False) + parser.add_argument("--use_npu", type=str2bool, default=False) + parser.add_argument("--precision", type=str, default="fp32") + parser.add_argument("--ir_optim", type=str2bool, default=True) + parser.add_argument("--use_tensorrt", type=str2bool, default=False) + parser.add_argument("--gpu_mem", type=int, default=8000) + parser.add_argument("--enable_benchmark", type=str2bool, default=False) + parser.add_argument("--enable_mkldnn", type=str2bool, default=False) + parser.add_argument("--cpu_threads", type=int, default=None) + parser.add_argument("--disable_glog", type=str2bool, default=False) + # parser.add_argument("--hubserving", type=str2bool, default=False) #TODO + + return parser.parse_args() + + +def create_paddle_predictor(args, cfg): + config = Config(args.model_file, args.params_file) + if args.use_gpu: + config.enable_use_gpu(args.gpu_mem, 0) + elif args.use_npu: + config.enable_npu() + elif args.use_xpu: + config.enable_xpu() + else: + config.disable_gpu() + if args.cpu_threads: + config.set_cpu_math_library_num_threads(args.cpu_threads) + if args.enable_mkldnn: + # cache 10 different shapes for mkldnn to avoid memory leak + config.set_mkldnn_cache_capacity(10) + config.enable_mkldnn() + if args.precision == "fp16": + config.enable_mkldnn_bfloat16() + + # config.disable_glog_info() + config.switch_ir_optim(args.ir_optim) # default true + if args.use_tensorrt: + # choose precision + if args.precision == "fp16": + precision = inference.PrecisionType.Half + elif args.precision == "int8": + precision = inference.PrecisionType.Int8 + else: + precision = inference.PrecisionType.Float32 + + # calculate real max batch size during inference when tenrotRT enabled + max_batch_size = args.batch_size + if 'num_seg' in cfg.INFERENCE: + # num_seg: number of segments when extracting frames. + # seg_len: number of frames extracted within a segment, default to 1. + # num_views: the number of video frame groups obtained by cropping and flipping, + # uniformcrop=3, tencrop=10, centercrop=1. 
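+            # The effective max_batch_size passed to TensorRT below is
+            # batch_size * num_views * num_seg * seg_len, so every cropped view of
+            # every sampled segment fits into a single engine batch.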
+ num_seg = cfg.INFERENCE.num_seg + seg_len = cfg.INFERENCE.get('seg_len', 1) + num_views = 1 + if 'tsm' in cfg.model_name.lower(): + num_views = 1 # CenterCrop + elif 'tsn' in cfg.model_name.lower(): + num_views = 10 # TenCrop + elif 'timesformer' in cfg.model_name.lower(): + num_views = 3 # UniformCrop + elif 'videoswin' in cfg.model_name.lower(): + num_views = 3 # UniformCrop + elif 'tokenshift' in cfg.model_name.lower(): + num_views = 3 # UniformCrop + max_batch_size = args.batch_size * num_views * num_seg * seg_len + config.enable_tensorrt_engine( + precision_mode=precision, max_batch_size=max_batch_size) + + config.enable_memory_optim() + # use zero copy + config.switch_use_feed_fetch_ops(False) + + # disable glog + if args.disable_glog: + config.disable_glog_info() + + # for ST-GCN tensorRT case usage + # config.delete_pass("shuffle_channel_detect_pass") + + predictor = create_predictor(config) + + return config, predictor + + +def parse_file_paths(input_path: str) -> list: + if osp.isfile(input_path): + files = [ + input_path, + ] + else: + files = os.listdir(input_path) + files = [ + file for file in files + if (file.endswith(".avi") or file.endswith(".mp4")) + ] + files = [osp.join(input_path, file) for file in files] + return files + + +def main(): + """predict using paddle inference model + """ + args = parse_args() + cfg = get_config(args.config, overrides=args.override, show=False) + + model_name = cfg.model_name + print(f"Inference model({model_name})...") + InferenceHelper = build_inference_helper(cfg.INFERENCE) + + inference_config, predictor = create_paddle_predictor(args, cfg) + + # get input_tensor and output_tensor + input_names = predictor.get_input_names() + output_names = predictor.get_output_names() + input_tensor_list = [] + output_tensor_list = [] + for item in input_names: + input_tensor_list.append(predictor.get_input_handle(item)) + for item in output_names: + output_tensor_list.append(predictor.get_output_handle(item)) + + # get the absolute file path(s) to be processed + if model_name in ["MSTCN", "ASRF"]: + files = InferenceHelper.get_process_file(args.input_file) + else: + files = parse_file_paths(args.input_file) + + if model_name == 'TransNetV2': + for file in files: + inputs = InferenceHelper.preprocess(file) + outputs = [] + for input in inputs: + # Run inference + for i in range(len(input_tensor_list)): + input_tensor_list[i].copy_from_cpu(input) + predictor.run() + output = [] + for j in range(len(output_tensor_list)): + output.append(output_tensor_list[j].copy_to_cpu()) + outputs.append(output) + + # Post process output + InferenceHelper.postprocess(outputs) + + elif model_name == 'AVA_SlowFast_FastRcnn': + for file in files: # for videos + inputs = InferenceHelper.preprocess(file) + outputs = [] + for input in inputs: + # Run inference + input_len = len(input_tensor_list) + + for i in range(input_len): + if type(input[i]) == paddle.Tensor: + input_tmp = input[i].numpy() + else: + input_tmp = input[i] + input_tensor_list[i].copy_from_cpu(input_tmp) + predictor.run() + output = [] + for j in range(len(output_tensor_list)): + output.append(output_tensor_list[j].copy_to_cpu()) + outputs.append(output) + + # Post process output + InferenceHelper.postprocess(outputs) + elif model_name == 'YOWO': + for file in files: # for videos + (_, filename) = os.path.split(file) + (filename, _) = os.path.splitext(filename) + save_dir = osp.join('inference', 'YOWO_infer') + if not osp.exists('inference'): + os.mkdir('inference') + if not osp.exists(save_dir): + 
os.mkdir(save_dir) + save_path = osp.join(save_dir, filename) + if not osp.exists(save_path): + os.mkdir(save_path) + inputs, frames = InferenceHelper.preprocess(file) + for idx, input in enumerate(inputs): + # Run inference + outputs = [] + input_len = len(input_tensor_list) + for i in range(input_len): + input_tensor_list[i].copy_from_cpu(input[i]) + predictor.run() + for j in range(len(output_tensor_list)): + outputs.append(output_tensor_list[j].copy_to_cpu()) + # Post process output + InferenceHelper.postprocess(outputs, frames[idx], osp.join(save_path, str(idx).zfill(3))) + else: + if args.enable_benchmark: + num_warmup = 3 + + # instantiate auto log + try: + import auto_log + except ImportError as e: + print(f"{e}, [git+https://github.com/LDOUBLEV/AutoLog] " + f"package and it's dependencies is required for " + f"python-inference when enable_benchmark=True.") + pid = os.getpid() + autolog = auto_log.AutoLogger( + model_name=cfg.model_name, + model_precision=args.precision, + batch_size=args.batch_size, + data_shape="dynamic", + save_path="./output/auto_log.lpg", + inference_config=inference_config, + pids=pid, + process_name=None, + gpu_ids=0 if args.use_gpu else None, + time_keys=[ + 'preprocess_time', 'inference_time', 'postprocess_time' + ], + warmup=num_warmup) + if not args.time_test_file: + test_video_num = 15 + files = [args.input_file for _ in range(test_video_num)] + else: + f_input = open(args.input_file, 'r') + files = [i.strip() for i in f_input.readlines()] + test_video_num = len(files) + f_input.close() + + # Inferencing process + batch_num = args.batch_size + for st_idx in range(0, len(files), batch_num): + ed_idx = min(st_idx + batch_num, len(files)) + + # auto log start + if args.enable_benchmark: + autolog.times.start() + + # Pre process batched input + batched_inputs = InferenceHelper.preprocess_batch( + files[st_idx:ed_idx]) + + # get pre process time cost + if args.enable_benchmark: + autolog.times.stamp() + + # run inference + for i in range(len(input_tensor_list)): + input_tensor_list[i].copy_from_cpu(batched_inputs[i]) + predictor.run() + + batched_outputs = [] + for j in range(len(output_tensor_list)): + batched_outputs.append(output_tensor_list[j].copy_to_cpu()) + + # get inference process time cost + if args.enable_benchmark: + autolog.times.stamp() + + InferenceHelper.postprocess(batched_outputs, + not args.enable_benchmark) + + # get post process time cost + if args.enable_benchmark: + autolog.times.end(stamp=True) + + # time.sleep(0.01) # sleep for T4 GPU + + # report benchmark log if enabled + if args.enable_benchmark: + autolog.report() + + +if __name__ == "__main__": + main() diff --git a/Bank_second_part/detect_process/tools/summary.py b/Bank_second_part/detect_process/tools/summary.py new file mode 100644 index 0000000..28bd6f7 --- /dev/null +++ b/Bank_second_part/detect_process/tools/summary.py @@ -0,0 +1,82 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
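+
+# Example usage (the config path is illustrative):
+#   python tools/summary.py -c configs/recognition/pptsm/pptsm.yaml --num_seg 8 --img_size 224 --FLOPs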
+ +import argparse +import os +import sys +import os.path as osp + +import paddle +import paddle.nn.functional as F +from paddle.jit import to_static +import paddleslim + +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(os.path.abspath(os.path.join(__dir__, '../'))) + +from paddlevideo.modeling.builder import build_model +from paddlevideo.utils import get_config + + +def parse_args(): + + parser = argparse.ArgumentParser("PaddleVideo Summary") + parser.add_argument('-c', + '--config', + type=str, + default='configs/example.yaml', + help='config file path') + + parser.add_argument("--img_size", type=int, default=224) + parser.add_argument("--num_seg", type=int, default=8) + parser.add_argument("--FLOPs", + action="store_true", + help="whether to print FLOPs") + + return parser.parse_args() + + +def _trim(cfg, args): + """ + Reuse the trainging config will bring useless attribute, such as: backbone.pretrained model. Trim it here. + """ + model_name = cfg.model_name + cfg = cfg.MODEL + cfg.backbone.pretrained = "" + + if 'num_seg' in cfg.backbone: + cfg.backbone.num_seg = args.num_seg + return cfg, model_name + + +def main(): + args = parse_args() + cfg, model_name = _trim(get_config(args.config, show=False), args) + print(f"Building model({model_name})...") + model = build_model(cfg) + + img_size = args.img_size + num_seg = args.num_seg + #NOTE: only support tsm now, will refine soon + params_info = paddle.summary(model, (1, 1, num_seg, 3, img_size, img_size)) + print(params_info) + + if args.FLOPs: + flops_info = paddleslim.analysis.flops( + model, [1, 1, num_seg, 3, img_size, img_size]) + print(flops_info) + + +if __name__ == "__main__": + main() diff --git a/Bank_second_part/detect_process/tools/utils.py b/Bank_second_part/detect_process/tools/utils.py new file mode 100644 index 0000000..bbdd2d1 --- /dev/null +++ b/Bank_second_part/detect_process/tools/utils.py @@ -0,0 +1,1670 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os +import shutil +import sys +from typing import List +import pickle + +import cv2 +try: + import imageio +except ImportError as e: + print( + f"Warning! {e}, [imageio] package and it's dependencies is required for VideoSwin." + ) +try: + import matplotlib as mpl + import matplotlib.cm as cm +except ImportError as e: + print( + f"Warning! {e}, [matplotlib] package and it's dependencies is required for ADDS." 
+ ) +import numpy as np +import paddle +import paddle.nn.functional as F +import pandas +from PIL import Image + +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(os.path.abspath(os.path.join(__dir__, '../'))) +from abc import abstractmethod + +from paddlevideo.loader.builder import build_pipeline +from paddlevideo.loader.pipelines import ( + AutoPadding, CenterCrop, DecodeSampler, FeatureDecoder, FrameDecoder, + GroupResize, Image2Array, ImageDecoder, JitterScale, MultiCrop, + Normalization, PackOutput, Sampler, SamplerPkl, Scale, SkeletonNorm, + TenCrop, ToArray, UniformCrop, VideoDecoder, SegmentationSampler, + SketeonCropSample, MultiCenterCrop, SketeonCropSample, UniformSampleFrames, + PoseDecode, PoseCompact, Resize, CenterCrop_V2, GeneratePoseTarget, + FormatShape, Collect) +from paddlevideo.metrics.ava_utils import read_labelmap +from paddlevideo.metrics.bmn_metric import boundary_choose, soft_nms +from paddlevideo.utils import Registry, build, get_config +from paddlevideo.modeling.framework.segmenters.utils import ASRFPostProcessing + +from tools.ava_predict import (detection_inference, frame_extraction, + get_detection_result, get_timestep_result, + pack_result, visualize) +from paddlevideo.modeling.framework.localizers.yowo_utils import nms, get_region_boxes + +INFERENCE = Registry('inference') + + +def build_inference_helper(cfg): + return build(cfg, INFERENCE) + + +class Base_Inference_helper(): + def __init__(self, + num_seg=8, + seg_len=1, + short_size=256, + target_size=224, + top_k=1): + """Base_Inference_helper + + Args: + num_seg (int, optional): number of segmentations of an sliced input video. Defaults to 8. + seg_len (int, optional): length of each segmentation. Defaults to 1. + short_size (int, optional): short size of input video. Defaults to 256. + target_size (int, optional): size of cropped video. Defaults to 224. + top_k (int, optional): select topk result in outputs. Defaults to 1. + """ + self.num_seg = num_seg + self.seg_len = seg_len + self.short_size = short_size + self.target_size = target_size + self.top_k = top_k + + @abstractmethod + def preprocess(self, input_file: str): + """preprocess abstractmethod + + Args: + input_file (str): input file path. + """ + pass + + def preprocess_batch(self, file_list: List[str]) -> List[np.ndarray]: + """preprocess for file list + + Args: + file_list (List[str]): file pathes in an list, [path1, path2, ...]. + + Returns: + List[np.ndarray]: batched inputs data, [data_batch[0], data_batch[1], ...]. + """ + batched_inputs = [] + for file in file_list: + inputs = self.preprocess(file) + batched_inputs.append(inputs) + batched_inputs = [ + np.concatenate([item[i] for item in batched_inputs]) + for i in range(len(batched_inputs[0])) + ] + self.input_file = file_list + return batched_inputs + + def postprocess(self, + output: np.ndarray, + print_output: bool = True, + return_result: bool = False): + """postprocess + + Args: + output (np.ndarray): batched output scores, shape of (batch_size, class_num). + print_output (bool, optional): whether to print result. Defaults to True. 
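+            return_result (bool, optional): whether to also return the top-k results
+                as a list of dicts (video_id, topk_class, topk_scores). Defaults to False.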
+ """ + if not isinstance(self.input_file, list): + self.input_file = [ + self.input_file, + ] + output = output[0] # [B, num_cls] + N = len(self.input_file) + if output.shape[0] != N: + output = output.reshape([N] + [output.shape[0] // N] + + list(output.shape[1:])) # [N, T, C] + output = output.mean(axis=1) # [N, C] + output = F.softmax(paddle.to_tensor(output), axis=-1).numpy() + results_list = [] + for i in range(N): + classes = np.argpartition(output[i], -self.top_k)[-self.top_k:] + classes = classes[np.argsort(-output[i, classes])] + scores = output[i, classes] + topk_class = classes[:self.top_k] + topk_scores = scores[:self.top_k] + result = { + "video_id": self.input_file[i], + "topk_class": topk_class, + "topk_scores": topk_scores + } + results_list.append(result) + if print_output: + print("Current video file: {0}".format(self.input_file[i])) + print("\ttop-{0} class: {1}".format(self.top_k, topk_class)) + print("\ttop-{0} score: {1}".format(self.top_k, topk_scores)) + if return_result: + return results_list + + +@INFERENCE.register() +class ppTSM_Inference_helper(Base_Inference_helper): + def __init__(self, + num_seg=8, + seg_len=1, + short_size=256, + target_size=224, + top_k=1): + self.num_seg = num_seg + self.seg_len = seg_len + self.short_size = short_size + self.target_size = target_size + self.top_k = top_k + + def preprocess(self, input_file): + """ + input_file: str, file path + return: list + """ + assert os.path.isfile(input_file) is not None, "{0} not exists".format( + input_file) + results = {'filename': input_file} + img_mean = [0.485, 0.456, 0.406] + img_std = [0.229, 0.224, 0.225] + ops = [ + VideoDecoder(backend="decord"), + Sampler(self.num_seg, self.seg_len, valid_mode=True), + Scale(self.short_size), + CenterCrop(self.target_size), + Image2Array(), + Normalization(img_mean, img_std) + ] + for op in ops: + results = op(results) + + res = np.expand_dims(results['imgs'], axis=0).copy() + return [res] + + +@INFERENCE.register() +class ppTSN_Inference_helper(Base_Inference_helper): + def __init__(self, + num_seg=25, + seg_len=1, + short_size=256, + target_size=224, + top_k=1): + self.num_seg = num_seg + self.seg_len = seg_len + self.short_size = short_size + self.target_size = target_size + self.top_k = top_k + + def preprocess(self, input_file): + """ + input_file: str, file path + return: list + """ + assert os.path.isfile(input_file) is not None, "{0} not exists".format( + input_file) + results = {'filename': input_file} + img_mean = [0.485, 0.456, 0.406] + img_std = [0.229, 0.224, 0.225] + ops = [ + VideoDecoder(backend="decord"), + Sampler(self.num_seg, + self.seg_len, + valid_mode=True, + select_left=True), + Scale(self.short_size, + fixed_ratio=True, + do_round=True, + backend='cv2'), + TenCrop(self.target_size), + Image2Array(), + Normalization(img_mean, img_std) + ] + for op in ops: + results = op(results) + + res = np.expand_dims(results['imgs'], axis=0).copy() + return [res] + + +@INFERENCE.register() +class BMN_Inference_helper(Base_Inference_helper): + def __init__(self, feat_dim, dscale, tscale, result_path): + self.feat_dim = feat_dim + self.dscale = dscale + self.tscale = tscale + self.result_path = result_path + if not os.path.isdir(self.result_path): + os.makedirs(self.result_path) + + def preprocess(self, input_file): + """ + input_file: str, file path + return: list + """ + assert os.path.isfile(input_file) is not None, "{0} not exists".format( + input_file) + file_info = json.load(open(input_file)) + self.feat_path = file_info['feat_path'] + 
self.video_duration = file_info['duration_second'] + feat = np.load(self.feat_path).astype('float32').T + res = np.expand_dims(feat, axis=0).copy() + + return [res] + + def postprocess(self, outputs, print_output=True): + """ + output: list + """ + pred_bm, pred_start, pred_end = outputs + self._gen_props(pred_bm, pred_start[0], pred_end[0], print_output) + + def _gen_props(self, pred_bm, pred_start, pred_end, print_output): + snippet_xmins = [1.0 / self.tscale * i for i in range(self.tscale)] + snippet_xmaxs = [ + 1.0 / self.tscale * i for i in range(1, self.tscale + 1) + ] + + pred_bm = pred_bm[0, 0, :, :] * pred_bm[0, 1, :, :] + start_mask = boundary_choose(pred_start) + start_mask[0] = 1. + end_mask = boundary_choose(pred_end) + end_mask[-1] = 1. + score_vector_list = [] + for idx in range(self.dscale): + for jdx in range(self.tscale): + start_index = jdx + end_index = start_index + idx + if end_index < self.tscale and start_mask[ + start_index] == 1 and end_mask[end_index] == 1: + xmin = snippet_xmins[start_index] + xmax = snippet_xmaxs[end_index] + xmin_score = pred_start[start_index] + xmax_score = pred_end[end_index] + bm_score = pred_bm[idx, jdx] + conf_score = xmin_score * xmax_score * bm_score + score_vector_list.append([xmin, xmax, conf_score]) + + cols = ["xmin", "xmax", "score"] + score_vector_list = np.stack(score_vector_list) + df = pandas.DataFrame(score_vector_list, columns=cols) + + result_dict = {} + proposal_list = [] + df = soft_nms(df, alpha=0.4, t1=0.55, t2=0.9) + for idx in range(min(100, len(df))): + tmp_prop={"score":df.score.values[idx], \ + "segment":[max(0,df.xmin.values[idx])*self.video_duration, \ + min(1,df.xmax.values[idx])*self.video_duration]} + proposal_list.append(tmp_prop) + + result_dict[self.feat_path] = proposal_list + + # print top-5 predictions + if print_output: + print("Current video file: {0} :".format(self.feat_path)) + for pred in proposal_list[:5]: + print(pred) + + # save result + outfile = open( + os.path.join(self.result_path, "bmn_results_inference.json"), "w") + + json.dump(result_dict, outfile) + + +@INFERENCE.register() +class TokenShift_Inference_helper(Base_Inference_helper): + def __init__(self, + num_seg=8, + seg_len=1, + short_size=256, + target_size=256, + top_k=1, + mean=[0.5, 0.5, 0.5], + std=[0.5, 0.5, 0.5]): + self.num_seg = num_seg + self.seg_len = seg_len + self.short_size = short_size + self.target_size = target_size + self.top_k = top_k + self.mean = mean + self.std = std + + def preprocess(self, input_file): + """ + input_file: str, file path + return: list + """ + assert os.path.isfile(input_file) is not None, "{0} not exists".format( + input_file) + results = {'filename': input_file} + ops = [ + VideoDecoder(backend='pyav', mode='test', num_seg=self.num_seg), + Sampler(self.num_seg, self.seg_len, valid_mode=True), + Normalization(self.mean, self.std, tensor_shape=[1, 1, 1, 3]), + Image2Array(data_format='cthw'), + JitterScale(self.short_size, self.short_size), + MultiCenterCrop(self.target_size) + ] + for op in ops: + results = op(results) + + # [N,C,Tx3,H,W] + res = np.expand_dims(results['imgs'], axis=0).copy() + return [res] + + +@INFERENCE.register() +class TimeSformer_Inference_helper(Base_Inference_helper): + def __init__(self, + num_seg=8, + seg_len=1, + short_size=224, + target_size=224, + top_k=1, + mean=[0.45, 0.45, 0.45], + std=[0.225, 0.225, 0.225]): + self.num_seg = num_seg + self.seg_len = seg_len + self.short_size = short_size + self.target_size = target_size + self.top_k = top_k + self.mean = mean + 
self.std = std + + def preprocess(self, input_file): + """ + input_file: str, file path + return: list + """ + assert os.path.isfile(input_file) is not None, "{0} not exists".format( + input_file) + results = {'filename': input_file} + ops = [ + VideoDecoder(backend='pyav', mode='test', num_seg=self.num_seg), + Sampler(self.num_seg, + self.seg_len, + valid_mode=True, + linspace_sample=True), + Normalization(self.mean, self.std, tensor_shape=[1, 1, 1, 3]), + Image2Array(data_format='cthw'), + JitterScale(self.short_size, self.short_size), + UniformCrop(self.target_size) + ] + for op in ops: + results = op(results) + + # [N,C,Tx3,H,W] + res = np.expand_dims(results['imgs'], axis=0).copy() + return [res] + + +@INFERENCE.register() +class VideoSwin_Inference_helper(Base_Inference_helper): + def __init__(self, + num_seg=4, + seg_len=32, + frame_interval=2, + short_size=224, + target_size=224, + top_k=1, + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375]): + + self.num_seg = num_seg + self.seg_len = seg_len + self.frame_interval = frame_interval + self.short_size = short_size + self.target_size = target_size + self.top_k = top_k + self.mean = mean + self.std = std + + def preprocess(self, input_file): + """ + input_file: str, file path + return: list + """ + self.input_file = input_file + assert os.path.isfile(input_file) is not None, "{0} not exists".format( + input_file) + results = {'filename': input_file} + ops = [ + VideoDecoder(backend='decord', mode='valid'), + Sampler(num_seg=self.num_seg, + frame_interval=self.frame_interval, + seg_len=self.seg_len, + valid_mode=True, + use_pil=False), + Scale(short_size=self.short_size, + fixed_ratio=False, + keep_ratio=True, + backend='cv2', + do_round=True), + CenterCrop(target_size=224, backend='cv2'), + Normalization(mean=self.mean, + std=self.std, + tensor_shape=[3, 1, 1, 1], + inplace=True), + Image2Array(data_format='cthw') + ] + for op in ops: + results = op(results) + + res = np.expand_dims(results['imgs'], axis=0).copy() + return [res] + + def postprocess(self, output, print_output=True): + """ + output: list + """ + if not isinstance(self.input_file, list): + self.input_file = [ + self.input_file, + ] + output = output[0] # [B, num_cls] + N = len(self.input_file) + if output.shape[0] != N: + output = output.reshape([N] + [output.shape[0] // N] + + list(output.shape[1:])) # [N, T, C] + output = output.mean(axis=1) # [N, C] + for i in range(N): + classes = np.argpartition(output[i], -self.top_k)[-self.top_k:] + classes = classes[np.argsort(-output[i, classes])] + scores = output[i, classes] + if print_output: + print("Current video file: {0}".format(self.input_file[i])) + for j in range(self.top_k): + print("\ttop-{0} class: {1}".format(j + 1, classes[j])) + print("\ttop-{0} score: {1}".format(j + 1, scores[j])) + + +@INFERENCE.register() +class VideoSwin_TableTennis_Inference_helper(Base_Inference_helper): + def __init__(self, + num_seg=1, + seg_len=32, + short_size=256, + target_size=224, + top_k=1): + self.num_seg = num_seg + self.seg_len = seg_len + self.short_size = short_size + self.target_size = target_size + self.top_k = top_k + + def preprocess(self, input_file): + """ + input_file: str, file path + return: list + """ + assert os.path.isfile(input_file) is not None, "{0} not exists".format( + input_file) + results = {'frame_dir': input_file, 'suffix': 'img_{:05}.jpg'} + img_mean = [123.675, 116.28, 103.53] + img_std = [58.395, 57.12, 57.375] + ops = [ + FrameDecoder(), + SamplerPkl(num_seg=self.num_seg, + 
seg_len=self.seg_len, + backend='cv2', + valid_mode=True), + Scale(short_size=self.short_size, + fixed_ratio=False, + keep_ratio=True, + backend='cv2', + do_round=True), + UniformCrop(target_size=self.target_size, backend='cv2'), + Normalization(mean=img_mean, + std=img_std, + tensor_shape=[3, 1, 1, 1], + inplace=True), + Image2Array(data_format='cthw') + ] + for op in ops: + results = op(results) + + res = np.expand_dims(results['imgs'], axis=0).copy() + return [res] + + def add_text_to_video( + self, + video_path, + output_dir="applications/TableTennis/ActionRecognition/results", + text=None): + os.makedirs(output_dir, exist_ok=True) + if video_path.endswith('.pkl'): + try: + import cPickle as pickle + from cStringIO import StringIO + except ImportError: + import pickle + from io import BytesIO + from PIL import Image + data_loaded = pickle.load(open(video_path, 'rb'), encoding='bytes') + _, _, frames = data_loaded + frames_len = len(frames) + + else: + videoCapture = cv2.VideoCapture() + videoCapture.open(video_path) + + fps = videoCapture.get(cv2.CAP_PROP_FPS) + frame_width = int(videoCapture.get(cv2.CAP_PROP_FRAME_WIDTH)) + frame_height = int(videoCapture.get(cv2.CAP_PROP_FRAME_HEIGHT)) + + frames_len = videoCapture.get(cv2.CAP_PROP_FRAME_COUNT) + print("fps=", int(fps), "frames=", int(frames_len), "scale=", + f"{frame_height}x{frame_width}") + + frames_rgb_list = [] + for i in range(int(frames_len)): + if video_path.endswith('.pkl'): + frame = np.array( + Image.open(BytesIO(frames[i])).convert("RGB").resize( + (240, 135)))[:, :, ::-1].astype('uint8') + else: + _, frame = videoCapture.read() + frame = cv2.putText(frame, text, (30, 30), cv2.FONT_HERSHEY_COMPLEX, + 1.0, (0, 0, 255), 2) + frames_rgb_list.append(frame[:, :, ::-1]) # bgr to rgb + if not video_path.endswith('.pkl'): + videoCapture.release() + cv2.destroyAllWindows() + output_filename = os.path.basename(video_path) + output_filename = output_filename.split('.')[0] + '.gif' + imageio.mimsave(f'{output_dir}/{output_filename}', + frames_rgb_list, + 'GIF', + duration=0.00085) + + def postprocess(self, output, print_output=True, save_gif=True): + """ + output: list + """ + if not isinstance(self.input_file, list): + self.input_file = [ + self.input_file, + ] + output = output[0] # [B, num_cls] + N = len(self.input_file) + if output.shape[0] != N: + output = output.reshape([N] + [output.shape[0] // N] + + list(output.shape[1:])) # [N, T, C] + output = output.mean(axis=1) # [N, C] + for i in range(N): + classes = np.argpartition(output[i], -self.top_k)[-self.top_k:] + classes = classes[np.argsort(-output[i, classes])] + scores = output[i, classes] + if print_output: + print("Current video file: {0}".format(self.input_file[i])) + for j in range(self.top_k): + print("\ttop-{0} class: {1}".format(j + 1, classes[j])) + print("\ttop-{0} score: {1}".format(j + 1, scores[j])) + if save_gif: + self.add_text_to_video( + self.input_file[0], + text=f"{str(classes[0])} {float(scores[0]):.5f}") + + +@INFERENCE.register() +class SlowFast_Inference_helper(Base_Inference_helper): + def __init__(self, + num_frames=32, + sampling_rate=2, + target_size=256, + alpha=8, + top_k=1): + self.num_frames = num_frames + self.sampling_rate = sampling_rate + self.target_size = target_size + self.alpha = alpha + self.top_k = top_k + + def preprocess(self, input_file): + """ + input_file: str, file path + return: list + """ + assert os.path.isfile(input_file) is not None, "{0} not exists".format( + input_file) + results = { + 'filename': input_file, + 
'temporal_sample_index': 0, + 'spatial_sample_index': 0, + 'temporal_num_clips': 1, + 'spatial_num_clips': 1 + } + img_mean = [0.45, 0.45, 0.45] + img_std = [0.225, 0.225, 0.225] + ops = [ + DecodeSampler(self.num_frames, self.sampling_rate, test_mode=True), + JitterScale(self.target_size, self.target_size), + MultiCrop(self.target_size), + Image2Array(transpose=False), + Normalization(img_mean, img_std, tensor_shape=[1, 1, 1, 3]), + PackOutput(self.alpha), + ] + for op in ops: + results = op(results) + + res = [] + for item in results['imgs']: + res.append(np.expand_dims(item, axis=0).copy()) + return res + + def postprocess(self, output, print_output=True): + """ + output: list + """ + if not isinstance(self.input_file, list): + self.input_file = [ + self.input_file, + ] + output = output[0] # [B, num_cls] + + N = len(self.input_file) + if output.shape[0] != N: + output = output.reshape([N] + [output.shape[0] // N] + + list(output.shape[1:])) # [N, T, C] + output = output.mean(axis=1) # [N, C] + # output = F.softmax(paddle.to_tensor(output), axis=-1).numpy() # done in it's head + for i in range(N): + classes = np.argpartition(output[i], -self.top_k)[-self.top_k:] + classes = classes[np.argsort(-output[i, classes])] + scores = output[i, classes] + if print_output: + print("Current video file: {0}".format(self.input_file[i])) + for j in range(self.top_k): + print("\ttop-{0} class: {1}".format(j + 1, classes[j])) + print("\ttop-{0} score: {1}".format(j + 1, scores[j])) + + +@INFERENCE.register() +class STGCN_Inference_helper(Base_Inference_helper): + def __init__(self, + num_channels, + window_size, + vertex_nums, + person_nums, + top_k=1): + self.num_channels = num_channels + self.window_size = window_size + self.vertex_nums = vertex_nums + self.person_nums = person_nums + self.top_k = top_k + + def preprocess(self, input_file): + """ + input_file: str, file path + return: list + """ + assert os.path.isfile(input_file) is not None, "{0} not exists".format( + input_file) + data = np.load(input_file) + results = {'data': data} + ops = [AutoPadding(window_size=self.window_size), SkeletonNorm()] + for op in ops: + results = op(results) + + res = np.expand_dims(results['data'], axis=0).copy() + return [res] + + +@INFERENCE.register() +class CTRGCN_Inference_helper(Base_Inference_helper): + def __init__(self, + num_channels=3, + vertex_nums=25, + person_nums=2, + window_size=64, + p_interval=[0.95], + top_k=1): + self.window_size = window_size + self.p_interval = p_interval + self.num_channels = num_channels + self.vertex_nums = vertex_nums + self.person_nums = person_nums + self.top_k = top_k + + def preprocess(self, input_file): + """ + input_file: str, file path + return: list + """ + assert os.path.isfile(input_file) is not None, "{0} not exists".format( + input_file) + data = np.load(input_file) + results = {'data': data} + ops = [ + SketeonCropSample(window_size=self.window_size, + p_interval=self.p_interval) + ] + for op in ops: + results = op(results) + + res = np.expand_dims(results['data'], axis=0).copy() + return [res] + + +@INFERENCE.register() +class AGCN2s_Inference_helper(Base_Inference_helper): + def __init__(self, + window_size=300, + num_channels=3, + vertex_nums=25, + person_nums=2, + top_k=1): + self.window_size = window_size + self.num_channels = num_channels + self.vertex_nums = vertex_nums + self.person_nums = person_nums + self.top_k = top_k + + def preprocess(self, input_file): + """ + input_file: str, file path + return: list + """ + assert os.path.isfile(input_file) is 
not None, "{0} not exists".format( + input_file) + data = np.load(input_file) + results = {'data': data} + + res = np.expand_dims(results['data'], axis=0).copy() + return [res] + + +@INFERENCE.register() +class MSTCN_Inference_helper(Base_Inference_helper): + def __init__(self, num_channels, actions_map_file_path, feature_path=None): + self.num_channels = num_channels + file_ptr = open(actions_map_file_path, 'r') + actions = file_ptr.read().split('\n')[:-1] + file_ptr.close() + self.actions_dict = dict() + for a in actions: + self.actions_dict[a.split()[1]] = int(a.split()[0]) + + self.feature_path = feature_path + self.file_name_list = [] + + def get_process_file(self, input_file_txt): + with open(input_file_txt, 'r') as file_ptr: + info = file_ptr.read().split('\n')[:-1] + + files = [] + for video_name in info: + if self.feature_path is not None: + file_name = video_name.split('.')[0] + ".npy" + input_file = os.path.join(self.feature_path, file_name) + else: + input_file = video_name + + assert os.path.isfile( + input_file) is not None, "{0} not exists".format(input_file) + files.append(input_file) + + self.file_name_list.append(input_file.split('/')[-1].split('.')[0]) + return files + + def preprocess(self, input_file): + """ + input_file: str, feature file list txt path + return: list + """ + output_list = [] + + data = np.load(input_file) + results = {'video_feat': data, 'video_gt': None} + ops = [] + for op in ops: + results = op(results) + + res = np.expand_dims(results['video_feat'], axis=0).copy() + output_list.append(res) + return output_list + + def postprocess(self, output, print_output=True): + reslut_path = os.path.join("./inference/infer_results/") + if not os.path.isdir(reslut_path): + os.makedirs(reslut_path) + output = [output] + for outputs in output: + output_np = outputs[0] + recognition = [] + for i in range(output_np.shape[0]): + recognition = np.concatenate((recognition, [ + list(self.actions_dict.keys())[list( + self.actions_dict.values()).index(output_np[i])] + ])) + recog_content = list(recognition) + recog_content = [line + "\n" for line in recog_content] + + filename = self.file_name_list.pop(0) + + write_path = os.path.join(reslut_path, filename + ".txt") + f = open(write_path, "w") + f.writelines(recog_content) + f.close() + print("result write in : " + write_path) + + +@INFERENCE.register() +class ASRF_Inference_helper(Base_Inference_helper): + def __init__(self, + num_channels, + actions_map_file_path, + postprocessing_method, + boundary_threshold, + feature_path=None): + self.num_channels = num_channels + file_ptr = open(actions_map_file_path, 'r') + actions = file_ptr.read().split('\n')[:-1] + file_ptr.close() + self.actions_dict = dict() + for a in actions: + self.actions_dict[a.split()[1]] = int(a.split()[0]) + + self.postprocessing_method = postprocessing_method + self.boundary_threshold = boundary_threshold + self.feature_path = feature_path + self.file_name_list = [] + + def get_process_file(self, input_file_txt): + with open(input_file_txt, 'r') as file_ptr: + info = file_ptr.read().split('\n')[:-1] + + files = [] + for video_name in info: + if self.feature_path is not None: + file_name = video_name.split('.')[0] + ".npy" + input_file = os.path.join(self.feature_path, file_name) + else: + input_file = video_name + + assert os.path.isfile( + input_file) is not None, "{0} not exists".format(input_file) + files.append(input_file) + + self.file_name_list.append(input_file.split('/')[-1].split('.')[0]) + return files + + def preprocess(self, input_file): 
+ """ + input_file: str, feature file list txt path + return: list + """ + + output_list = [] + + data = np.load(input_file) + results = {'video_feat': data, 'video_gt': None} + ops = [] + for op in ops: + results = op(results) + + res = np.expand_dims(results['video_feat'], axis=0).copy() + output_list.append(res) + return output_list + + def postprocess(self, output, print_output=True): + reslut_path = os.path.join("./inference/infer_results/") + if not os.path.isdir(reslut_path): + os.makedirs(reslut_path) + output = [output] + for outputs in output: + outputs_cls_np = outputs[0] + outputs_boundary_np = outputs[1] + + output_np = ASRFPostProcessing( + outputs_cls_np, + outputs_boundary_np, + self.postprocessing_method, + boundary_threshold=self.boundary_threshold).numpy()[0, :] + + recognition = [] + for i in range(output_np.shape[0]): + recognition = np.concatenate((recognition, [ + list(self.actions_dict.keys())[list( + self.actions_dict.values()).index(output_np[i])] + ])) + recog_content = list(recognition) + recog_content = [line + "\n" for line in recog_content] + + filename = self.file_name_list.pop(0) + + write_path = os.path.join(reslut_path, filename + ".txt") + f = open(write_path, "w") + f.writelines(recog_content) + f.close() + print("result write in : " + write_path) + + +@INFERENCE.register() +class AttentionLSTM_Inference_helper(Base_Inference_helper): + def __init__( + self, + num_classes, #Optional, the number of classes to be classified. + feature_num, + feature_dims, + embedding_size, + lstm_size, + top_k=1): + self.num_classes = num_classes + self.feature_num = feature_num + self.feature_dims = feature_dims + self.embedding_size = embedding_size + self.lstm_size = lstm_size + self.top_k = top_k + + def preprocess(self, input_file): + """ + input_file: str, file path + return: list + """ + assert os.path.isfile(input_file) is not None, "{0} not exists".format( + input_file) + results = {'filename': input_file} + ops = [FeatureDecoder(num_classes=self.num_classes, has_label=False)] + for op in ops: + results = op(results) + + res = [] + for modality in ['rgb', 'audio']: + res.append( + np.expand_dims(results[f'{modality}_data'], axis=0).copy()) + res.append( + np.expand_dims(results[f'{modality}_len'], axis=0).copy()) + res.append( + np.expand_dims(results[f'{modality}_mask'], axis=0).copy()) + return res + + +@INFERENCE.register() +class TransNetV2_Inference_helper(): + def __init__(self, + num_frames, + height, + width, + num_channels, + threshold=0.5, + output_path=None, + visualize=True): + self._input_size = (height, width, num_channels) + self.output_path = output_path + self.len_frames = 0 + self.threshold = threshold + self.visualize = visualize + + def input_iterator(self, frames): + # return windows of size 100 where the first/last 25 frames are from the previous/next batch + # the first and last window must be padded by copies of the first and last frame of the video + no_padded_frames_start = 25 + no_padded_frames_end = 25 + 50 - ( + len(frames) % 50 if len(frames) % 50 != 0 else 50) # 25 - 74 + + start_frame = np.expand_dims(frames[0], 0) + end_frame = np.expand_dims(frames[-1], 0) + padded_inputs = np.concatenate([start_frame] * no_padded_frames_start + + [frames] + + [end_frame] * no_padded_frames_end, 0) + + ptr = 0 + while ptr + 100 <= len(padded_inputs): + out = padded_inputs[ptr:ptr + 100] + out = out.astype(np.float32) + ptr += 50 + yield out[np.newaxis] + + def preprocess(self, input_file): + """ + input_file: str, file path + return: iterator + 
""" + try: + import ffmpeg + except ImportError as e: + print( + f"Warning! {e}, [ffmpeg-python] package and it's dependencies is required for TransNetV2." + ) + assert os.path.isfile(input_file) is not None, "{0} not exists".format( + input_file) + self.input_file = input_file + self.filename = os.path.splitext(os.path.split(self.input_file)[1])[0] + video_stream, err = ffmpeg.input( + self.input_file).output("pipe:", + format="rawvideo", + pix_fmt="rgb24", + s="48x27").run(capture_stdout=True, + capture_stderr=True) + self.frames = np.frombuffer(video_stream, + np.uint8).reshape([-1, 27, 48, 3]) + self.len_frames = len(self.frames) + + return self.input_iterator(self.frames) + + def predictions_to_scenes(self, predictions): + predictions = (predictions > self.threshold).astype(np.uint8) + scenes = [] + t, t_prev, start = -1, 0, 0 + for i, t in enumerate(predictions): + if t_prev == 1 and t == 0: + start = i + if t_prev == 0 and t == 1 and i != 0: + scenes.append([start, i]) + t_prev = t + if t == 0: + scenes.append([start, i]) + + # just fix if all predictions are 1 + if len(scenes) == 0: + return np.array([[0, len(predictions) - 1]], dtype=np.int32) + + return np.array(scenes, dtype=np.int32) + + def visualize_predictions(self, frames, predictions): + from PIL import Image, ImageDraw + + if isinstance(predictions, np.ndarray): + predictions = [predictions] + + ih, iw, ic = frames.shape[1:] + width = 25 + + # pad frames so that length of the video is divisible by width + # pad frames also by len(predictions) pixels in width in order to show predictions + pad_with = width - len(frames) % width if len( + frames) % width != 0 else 0 + frames = np.pad(frames, [(0, pad_with), (0, 1), (0, len(predictions)), + (0, 0)]) + + predictions = [np.pad(x, (0, pad_with)) for x in predictions] + height = len(frames) // width + + img = frames.reshape([height, width, ih + 1, iw + len(predictions), ic]) + img = np.concatenate(np.split( + np.concatenate(np.split(img, height), axis=2)[0], width), + axis=2)[0, :-1] + + img = Image.fromarray(img) + draw = ImageDraw.Draw(img) + + # iterate over all frames + for i, pred in enumerate(zip(*predictions)): + x, y = i % width, i // width + x, y = x * (iw + len(predictions)) + iw, y * (ih + 1) + ih - 1 + + # we can visualize multiple predictions per single frame + for j, p in enumerate(pred): + color = [0, 0, 0] + color[(j + 1) % 3] = 255 + + value = round(p * (ih - 1)) + if value != 0: + draw.line((x + j, y, x + j, y - value), + fill=tuple(color), + width=1) + return img + + def postprocess(self, outputs, print_output=True): + """ + output: list + """ + predictions = [] + for output in outputs: + single_frame_logits, all_frames_logits = output + single_frame_pred = F.sigmoid(paddle.to_tensor(single_frame_logits)) + all_frames_pred = F.sigmoid(paddle.to_tensor(all_frames_logits)) + predictions.append((single_frame_pred.numpy()[0, 25:75, 0], + all_frames_pred.numpy()[0, 25:75, 0])) + single_frame_pred = np.concatenate( + [single_ for single_, all_ in predictions]) + all_frames_pred = np.concatenate( + [all_ for single_, all_ in predictions]) + single_frame_predictions, all_frame_predictions = single_frame_pred[: + self + . + len_frames], all_frames_pred[: + self + . 
+ len_frames] + + scenes = self.predictions_to_scenes(single_frame_predictions) + + if print_output: + print("Current video file: {0}".format(self.input_file)) + print("\tShot Boundarys: {0}".format(scenes)) + + if self.output_path: + if not os.path.exists(self.output_path): + os.makedirs(self.output_path) + predictions = np.stack( + [single_frame_predictions, all_frame_predictions], 1) + predictions_file = os.path.join(self.output_path, + self.filename + "_predictions.txt") + np.savetxt(predictions_file, predictions, fmt="%.6f") + scenes_file = os.path.join(self.output_path, + self.filename + "_scenes.txt") + np.savetxt(scenes_file, scenes, fmt="%d") + + if self.visualize: + pil_image = self.visualize_predictions( + self.frames, + predictions=(single_frame_predictions, + all_frame_predictions)) + image_file = os.path.join(self.output_path, + self.filename + "_vis.png") + pil_image.save(image_file) + + +@INFERENCE.register() +class ADDS_Inference_helper(Base_Inference_helper): + def __init__(self, + frame_idxs=[0], + num_scales=4, + side_map={ + "2": 2, + "3": 3, + "l": 2, + "r": 3 + }, + height=256, + width=512, + full_res_shape=None, + num_channels=None, + img_ext=".png", + K=None): + + self.frame_idxs = frame_idxs + self.num_scales = num_scales + self.side_map = side_map + self.full_res_shape = full_res_shape + self.img_ext = img_ext + self.height = height + self.width = width + self.K = K + + def preprocess(self, input_file): + """ + input_file: str, file path + return: list + """ + assert os.path.isfile(input_file) is not None, "{0} not exists".format( + input_file) + results = { + 'filename': input_file, + 'mode': 'infer', + 'day_or_night': 'day', + } + ops = [ + ImageDecoder( + backend='pil', + dataset='kitti', + frame_idxs=self.frame_idxs, + num_scales=self.num_scales, + side_map=self.side_map, + full_res_shape=self.full_res_shape, + img_ext=self.img_ext, + ), + GroupResize( + height=self.height, + width=self.width, + K=self.K, + scale=1, + mode='infer', + ), + ToArray(), + ] + for op in ops: + results = op(results) + res = results['imgs'][('color', 0, 0)] + res = np.expand_dims(res, axis=0).copy() + return [res] + + def postprocess(self, output, print_output, save_dir='data/'): + """ + output: list + """ + if not isinstance(self.input_file, list): + self.input_file = [ + self.input_file, + ] + print(len(output)) + N = len(self.input_file) + for i in range(N): + pred_depth = output[i] # [H, W] + if print_output: + print("Current input image: {0}".format(self.input_file[i])) + file_name = os.path.basename(self.input_file[i]).split('.')[0] + save_path = os.path.join(save_dir, + file_name + "_depth" + ".png") + pred_depth_color = self._convertPNG(pred_depth) + pred_depth_color.save(save_path) + print(f"pred depth image saved to: {save_path}") + + def _convertPNG(self, image_numpy): + disp_resized = cv2.resize(image_numpy, (1280, 640)) + disp_resized_np = disp_resized + vmax = np.percentile(disp_resized_np, 95) + normalizer = mpl.colors.Normalize(vmin=disp_resized_np.min(), vmax=vmax) + mapper = cm.ScalarMappable(norm=normalizer, cmap='magma') + colormapped_im = (mapper.to_rgba(disp_resized_np)[:, :, :3] * + 255).astype(np.uint8) + im = Image.fromarray(colormapped_im) + return im + + +@INFERENCE.register() +class AVA_SlowFast_FastRCNN_Inference_helper(Base_Inference_helper): + def __init__(self, + detection_model_name, + detection_model_weights, + config_file_path, + predict_stepsize=8, + output_stepsize=4, + output_fps=6, + out_filename='ava_det_demo.mp4', + num_frames=32, + alpha=4, 
+ target_size=256): + self.detection_model_name = detection_model_name + self.detection_model_weights = detection_model_weights + + self.config = get_config(config_file_path, + show=False) #parse config file + self.predict_stepsize = predict_stepsize + self.output_stepsize = output_stepsize + self.output_fps = output_fps + self.out_filename = out_filename + self.num_frames = num_frames + self.alpha = alpha + self.target_size = target_size + + def preprocess(self, input_file): + """ + input_file: str, file path + """ + + frame_dir = 'tmp_frames' + self.frame_paths, frames, FPS = frame_extraction(input_file, frame_dir) + num_frame = len(self.frame_paths) #视频秒数*FPS + assert num_frame != 0 + + # 帧图像高度和宽度 + h, w, _ = frames[0].shape + + # Get clip_len, frame_interval and calculate center index of each clip + data_process_pipeline = build_pipeline( + self.config.PIPELINE.test) #测试时输出处理流水配置 + + clip_len = self.config.PIPELINE.test.sample['clip_len'] + assert clip_len % 2 == 0, 'We would like to have an even clip_len' + frame_interval = self.config.PIPELINE.test.sample['frame_interval'] + + # 此处关键帧每秒取一个 + clip_len = self.config.PIPELINE.test.sample['clip_len'] + assert clip_len % 2 == 0, 'We would like to have an even clip_len' + frame_interval = self.config.PIPELINE.test.sample['frame_interval'] + window_size = clip_len * frame_interval + timestamps = np.arange(window_size // 2, + (num_frame + 1 - window_size // 2), + self.predict_stepsize) + + selected_frame_list = [] + for timestamp in timestamps: + selected_frame_list.append(self.frame_paths[timestamp - 1]) + + # Load label_map + label_map_path = self.config.DATASET.test['label_file'] + self.categories, self.class_whitelist = read_labelmap( + open(label_map_path)) + label_map = {} + for item in self.categories: + id = item['id'] + name = item['name'] + label_map[id] = name + + self.label_map = label_map + + detection_result_dir = 'tmp_detection' + detection_model_name = self.detection_model_name + detection_model_weights = self.detection_model_weights + detection_txt_list = detection_inference(selected_frame_list, + detection_result_dir, + detection_model_name, + detection_model_weights) + assert len(detection_txt_list) == len(timestamps) + + human_detections = [] + data_list = [] + person_num_list = [] + + for timestamp, detection_txt_path in zip(timestamps, + detection_txt_list): + proposals, scores = get_detection_result( + detection_txt_path, h, w, + (float)(self.config.DATASET.test['person_det_score_thr'])) + + if proposals.shape[0] == 0: + #person_num_list.append(0) + human_detections.append(None) + continue + + human_detections.append(proposals) + + result = get_timestep_result(frame_dir, + timestamp, + clip_len, + frame_interval, + FPS=FPS) + result["proposals"] = proposals + result["scores"] = scores + + new_result = data_process_pipeline(result) + proposals = new_result['proposals'] + + img_slow = new_result['imgs'][0] + img_slow = img_slow[np.newaxis, :] + img_fast = new_result['imgs'][1] + img_fast = img_fast[np.newaxis, :] + + proposals = proposals[np.newaxis, :] + + scores = scores[np.newaxis, :] + + img_shape = np.asarray(new_result['img_shape']) + img_shape = img_shape[np.newaxis, :] + + data = [ + paddle.to_tensor(img_slow, dtype='float32'), + paddle.to_tensor(img_fast, dtype='float32'), + paddle.to_tensor(proposals, dtype='float32'), + paddle.to_tensor(img_shape, dtype='int32') + ] + + person_num = proposals.shape[1] + person_num_list.append(person_num) + + data_list.append(data) + + self.human_detections = human_detections + 
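        # Worked example of the key-frame spacing computed above (numbers hypothetical):
        # with clip_len=32 and frame_interval=2, window_size=64, so for a 300-frame video
        # timestamps = np.arange(32, 269, predict_stepsize); with predict_stepsize=8 this
        # gives 32, 40, 48, ..., i.e. one action prediction window every 8 frames.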
self.person_num_list = person_num_list + self.timestamps = timestamps + self.frame_dir = frame_dir + self.detection_result_dir = detection_result_dir + + return data_list + + def postprocess(self, outputs, print_output=True): + """ + output: list + """ + predictions = [] + + assert len(self.person_num_list) == len(outputs) + + #print("*** self.human_detections",len( self.human_detections)) + #print("*** outputs",len( outputs)) + + index = 0 + for t_index in range(len(self.timestamps)): + if self.human_detections[t_index] is None: + predictions.append(None) + continue + + human_detection = self.human_detections[t_index] + + output = outputs[index] + result = output #长度为类别个数,不包含背景 + + person_num = self.person_num_list[index] + + index = index + 1 + + prediction = [] + + if human_detection is None: + predictions.append(None) + continue + + # N proposals + for i in range(person_num): + prediction.append([]) + + # Perform action score thr + for i in range(len(result)): # for class + if i + 1 not in self.class_whitelist: + continue + for j in range(person_num): + if result[i][j, 4] > self.config.MODEL.head['action_thr']: + prediction[j].append( + (self.label_map[i + 1], result[i][j, 4] + )) # label_map is a dict, label index start from 1 + predictions.append(prediction) + + results = [] + for human_detection, prediction in zip(self.human_detections, + predictions): + results.append(pack_result(human_detection, prediction)) + + def dense_timestamps(timestamps, n): + """Make it nx frames.""" + old_frame_interval = (timestamps[1] - timestamps[0]) + start = timestamps[0] - old_frame_interval / n * (n - 1) / 2 + new_frame_inds = np.arange( + len(timestamps) * n) * old_frame_interval / n + start + return new_frame_inds.astype(np.int) + + dense_n = int(self.predict_stepsize / self.output_stepsize) #30 + frames = [ + cv2.imread(self.frame_paths[i - 1]) + for i in dense_timestamps(self.timestamps, dense_n) + ] + + vis_frames = visualize(frames, results) + + try: + import moviepy.editor as mpy + except ImportError: + raise ImportError('Please install moviepy to enable output file') + + vid = mpy.ImageSequenceClip([x[:, :, ::-1] for x in vis_frames], + fps=self.output_fps) + vid.write_videofile(self.out_filename) + print("finish write !") + + # delete tmp files and dirs + shutil.rmtree(self.frame_dir) + shutil.rmtree(self.detection_result_dir) + + +@INFERENCE.register() +class PoseC3D_Inference_helper(Base_Inference_helper): + def __init__(self, top_k=1): + self.top_k = top_k + + def preprocess(self, input_file): + """ + input_file: str, file path + return: list + """ + assert os.path.isfile(input_file) is not None, "{0} not exists".format( + input_file) + with open(input_file, 'rb') as f: + data = pickle.load(f) + self.input_file = input_file + + left_kp = [1, 3, 5, 7, 9, 11, 13, 15] + right_kp = [2, 4, 6, 8, 10, 12, 14, 16] + ops = [ + UniformSampleFrames(clip_len=48, num_clips=10, test_mode=True), + PoseDecode(), + PoseCompact(hw_ratio=1., allow_imgpad=True), + Resize(scale=(-1, 56)), + CenterCrop_V2(crop_size=56), + GeneratePoseTarget(sigma=0.6, + use_score=True, + with_kp=True, + with_limb=False, + double=True, + left_kp=left_kp, + right_kp=right_kp), + FormatShape(input_format='NCTHW'), + Collect(keys=['imgs', 'label'], meta_keys=[]) + ] + + for op in ops: + results = op(data) + results = [results[0][np.newaxis, :, :, :, :, :]] + self.num_segs = results[0].shape[1] + return results + + def postprocess(self, outputs, print_output=True): + batch_size = outputs[0].shape[0] + cls_score = 
outputs[0].reshape( + [batch_size // self.num_segs, self.num_segs, outputs[0].shape[-1]]) + output = F.softmax(paddle.to_tensor(cls_score), + axis=2).mean(axis=1).numpy() + N = len(self.input_file) + for i in range(N): + classes = np.argpartition(output[i], -self.top_k)[-self.top_k:] + classes = classes[np.argsort(-output[i, classes])] + scores = output[i, classes] + if print_output: + print("Current video file: {0}".format(self.input_file[i])) + for j in range(self.top_k): + print("\ttop-{0} class: {1}".format(j + 1, classes[j])) + print("\ttop-{0} score: {1}".format(j + 1, scores[j])) + + +@INFERENCE.register() +class YOWO_Inference_helper(Base_Inference_helper): + + def __init__(self, + num_seg=16, + target_size=224, + nms_thresh=0.5, + conf_thresh_valid=0.5, + mean=[0.4345, 0.4051, 0.3775], + std=[0.2768, 0.2713, 0.2737]): + self.num_seg = num_seg + self.target_size = target_size + self.nms_thresh = nms_thresh + self.conf_thresh_valid = conf_thresh_valid + self.mean = mean + self.std = std + + def preprocess(self, input_file): + """ + input_file: str, file path + return: list + """ + assert os.path.isfile(input_file) is not None, "{0} not exists".format( + input_file) + cap = cv2.VideoCapture(input_file) + queue = [] + inputs = [] + frames = [] + while (cap.isOpened()): + ret, frame = cap.read() + if ret == False: + break + if len(queue) <= 0: # At initialization, populate queue with initial frame + for i in range(self.num_seg): + queue.append(frame) + + # Add the read frame to last and pop out the oldest one + queue.append(frame) + queue.pop(0) + + # Resize images + imgs = [cv2.resize(img, (self.target_size, self.target_size), interpolation=cv2.INTER_LINEAR) for img in + queue] + + # Convert image to CHW keeping BGR order. + imgs = [img.transpose([2, 0, 1]) for img in imgs] + + # Image [0, 255] -> [0, 1]. + imgs = [img / 255.0 for img in imgs] + + imgs = [ + np.ascontiguousarray( + img.reshape((3, imgs[0].shape[1], imgs[0].shape[2])) + ).astype(np.float32) + for img in imgs + ] + + # Concat list of images to single ndarray. 
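            # At this point `imgs` is a list of num_seg arrays shaped (3, target_size, target_size);
            # the concatenation below stacks them along a new time axis and the two expand_dims
            # calls that follow add batch dims, so each appended input has shape
            # (1, 1, 3, num_seg, target_size, target_size).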
+ imgs = np.concatenate( + [np.expand_dims(img, axis=1) for img in imgs], axis=1 + ) + + imgs = np.ascontiguousarray(imgs) + imgs = np.expand_dims(imgs, axis=0) + imgs = np.expand_dims(imgs, axis=0) + inputs.append(imgs) + frames.append(queue[-1]) + + return inputs, frames + + def postprocess(self, outputs, frame, filename, save_img=True): + """ + outputs: list + frames: list + """ + labels = [ + "Basketball", "BasketballDunk", "Biking", "CliffDiving", "CricketBowling", + "Diving", "Fencing", "FloorGymnastics", "GolfSwing", "HorseRiding", + "IceDancing", "LongJump", "PoleVault", "RopeClimbing", "SalsaSpin", + "SkateBoarding", "Skiing", "Skijet", "SoccerJuggling", "Surfing", + "TennisSwing", "TrampolineJumping", "VolleyballSpiking", "WalkingWithDog"] + nms_thresh = 0.5 + font = cv2.FONT_HERSHEY_SIMPLEX + for out in outputs: + out = paddle.to_tensor(out) + preds = [] + all_boxes = get_region_boxes(out) + for i in range(out.shape[0]): + boxes = all_boxes[i] + boxes = nms(boxes, nms_thresh) + + for box in boxes: + x1 = round(float(box[0] - box[2] / 2.0) * 320.0) + y1 = round(float(box[1] - box[3] / 2.0) * 240.0) + x2 = round(float(box[0] + box[2] / 2.0) * 320.0) + y2 = round(float(box[1] + box[3] / 2.0) * 240.0) + + det_conf = float(box[4]) + for j in range((len(box) - 5) // 2): + cls_conf = float(box[5 + 2 * j].item()) + prob = det_conf * cls_conf + preds.append([[x1, y1, x2, y2], prob, labels[int(box[6])]]) + + for _, dets in enumerate(preds): + if dets[1] < 0.4: + break + text = dets[2] + ' ' + '{:.2f}'.format(dets[1]) + cv2.rectangle(frame, (dets[0][0], dets[0][1]), (dets[0][2], dets[0][3]), (0, 255, 0), 2) + cv2.putText(frame, text, (dets[0][0] + 3, dets[0][1] - 5 - 10 * _), font, 0.5, (0, 255, 0), 2) + cv2.imwrite('{}.jpg'.format(filename), frame) \ No newline at end of file diff --git a/Bank_second_part/detect_process/tools/wheel.py b/Bank_second_part/detect_process/tools/wheel.py new file mode 100644 index 0000000..77281be --- /dev/null +++ b/Bank_second_part/detect_process/tools/wheel.py @@ -0,0 +1,354 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
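# A minimal usage sketch for the wheel interface defined in this module (the import path
# and video path are assumptions; model names come from the MODELS table below):
#
#     from tools.wheel import PaddleVideo
#     clas = PaddleVideo(model_name='ppTSM_v2', use_gpu=False, top_k=5)
#     clas.predict('data/example.avi')
#
# If model_file is not given, the matching inference model is downloaded once to
# ~/.paddlevideo_inference/inference_model/<model_name>/ and reused afterwards.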
+ +import os +import sys + +__dir__ = os.path.dirname(__file__) +sys.path.append(os.path.join(__dir__, '')) + +import numpy as np +import tarfile +import requests +from tqdm import tqdm +import shutil + +from paddle import inference +from paddle.inference import Config, create_predictor + +from tools.utils import ppTSM_Inference_helper + +__all__ = ['PaddleVideo'] + +# path of download model and data +BASE_DIR = os.path.expanduser("~/.paddlevideo_inference/") +BASE_INFERENCE_MODEL_DIR = os.path.join(BASE_DIR, 'inference_model') +BASE_VIDEOS_DIR = os.path.join(BASE_DIR, 'videos') + +# support Models +MODELS = { + 'ppTSM': + 'https://videotag.bj.bcebos.com/PaddleVideo/InferenceModel/ppTSM_infer.tar', + 'ppTSM_v2': + 'https://videotag.bj.bcebos.com/PaddleVideo/InferenceModel/ppTSM_v2_infer.tar' +} + +MODEL_NAMES = list(MODELS.keys()) + + +def parse_args(mMain=True, add_help=True): + """ + Args: + mMain: bool. True for command args, False for python interface + """ + import argparse + + def str2bool(v): + return v.lower() in ("true", "t", "1") + + if mMain == True: + + # general params + parser = argparse.ArgumentParser(add_help=add_help) + parser.add_argument("--model_name", type=str, default='') + parser.add_argument("-v", "--video_file", type=str, default='') + parser.add_argument("--use_gpu", type=str2bool, default=True) + + # params for decode and sample + parser.add_argument("--num_seg", type=int, default=16) + + # params for preprocess + parser.add_argument("--short_size", type=int, default=256) + parser.add_argument("--target_size", type=int, default=224) + + # params for predict + parser.add_argument("--model_file", type=str, default='') + parser.add_argument("--params_file", type=str) + parser.add_argument("-b", "--batch_size", type=int, default=1) + parser.add_argument("--use_fp16", type=str2bool, default=False) + parser.add_argument("--ir_optim", type=str2bool, default=True) + parser.add_argument("--use_tensorrt", type=str2bool, default=False) + parser.add_argument("--gpu_mem", type=int, default=8000) + parser.add_argument("--top_k", type=int, default=1) + parser.add_argument("--enable_mkldnn", type=bool, default=False) + parser.add_argument("--label_name_path", type=str, default='') + + return parser.parse_args() + + else: + return argparse.Namespace(model_name='', + video_file='', + use_gpu=True, + num_seg=16, + short_size=256, + target_size=224, + model_file='', + params_file='', + batch_size=1, + use_fp16=False, + ir_optim=True, + use_tensorrt=False, + gpu_mem=8000, + top_k=1, + enable_mkldnn=False, + label_name_path='') + + +def parse_file_paths(input_path: str) -> list: + if os.path.isfile(input_path): + files = [ + input_path, + ] + else: + files = os.listdir(input_path) + files = [ + file for file in files + if (file.endswith(".avi") or file.endswith(".mp4")) + ] + files = [os.path.join(input_path, file) for file in files] + return files + + +def download_with_progressbar(url, save_path): + response = requests.get(url, stream=True) + total_size_in_bytes = int(response.headers.get('content-length', 0)) + block_size = 1024 # 1 Kibibyte + progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True) + with open(save_path, 'wb') as file: + for data in response.iter_content(block_size): + progress_bar.update(len(data)) + file.write(data) + progress_bar.close() + if total_size_in_bytes == 0 or progress_bar.n != total_size_in_bytes: + raise Exception("Something went wrong while downloading models") + + +def download_inference_model(model_storage_directory, url): + # 
using custom model + tar_file_name_list = [ + 'inference.pdiparams', 'inference.pdiparams.info', 'inference.pdmodel' + ] + if not os.path.exists( + os.path.join(model_storage_directory, + 'inference.pdiparams')) or not os.path.exists( + os.path.join(model_storage_directory, + 'inference.pdmodel')): + tmp_path = os.path.join(model_storage_directory, url.split('/')[-1]) + print('download {} to {}'.format(url, tmp_path)) + os.makedirs(model_storage_directory, exist_ok=True) + download_with_progressbar(url, tmp_path) #download + + #save to directory + with tarfile.open(tmp_path, 'r') as tarObj: + for member in tarObj.getmembers(): + filename = None + for tar_file_name in tar_file_name_list: + if tar_file_name in member.name: + filename = tar_file_name + if filename is None: + continue + file = tarObj.extractfile(member) + with open(os.path.join(model_storage_directory, filename), + 'wb') as f: + f.write(file.read()) + os.remove(tmp_path) + + +def create_paddle_predictor(args): + config = Config(args.model_file, args.params_file) + + if args.use_gpu: + config.enable_use_gpu(args.gpu_mem, 0) + else: + config.disable_gpu() + if args.enable_mkldnn: + # cache 10 different shapes for mkldnn to avoid memory leak + config.set_mkldnn_cache_capacity(10) + config.enable_mkldnn() + + config.disable_glog_info() + config.switch_ir_optim(args.ir_optim) # default true + if args.use_tensorrt: + config.enable_tensorrt_engine( + precision_mode=Config.Precision.Half + if args.use_fp16 else Config.Precision.Float32, + max_batch_size=args.batch_size) + + config.enable_memory_optim() + # use zero copy + config.switch_use_feed_fetch_ops(False) + predictor = create_predictor(config) + + return predictor + + +def load_label_name_dict(path): + result = {} + if not os.path.exists(path): + print( + 'Warning: If want to use your own label_dict, please input legal path!\nOtherwise label_names will be empty!' + ) + else: + for line in open(path, 'r'): + partition = line.split('\n')[0].partition(' ') + try: + result[int(partition[0])] = str(partition[-1]) + except: + result = {} + break + return result + + +class PaddleVideo(object): + def __init__(self, **kwargs): + print( + '\nInference models that Paddle provides are listed as follows:\n{}' + .format(MODEL_NAMES), '\n') + process_params = parse_args(mMain=False, add_help=False) + process_params.__dict__.update(**kwargs) + + if not os.path.exists(process_params.model_file): + if process_params.model_name is None: + raise Exception('Please input model name that you want to use!') + if process_params.model_name in MODEL_NAMES: + url = MODELS[process_params.model_name] + download_path = os.path.join(BASE_INFERENCE_MODEL_DIR, + process_params.model_name) + if not os.path.exists(download_path): + os.makedirs(download_path) + + #create pretrained model download_path + download_inference_model(model_storage_directory=download_path, + url=url) + + process_params.model_file = os.path.join( + download_path, 'inference.pdmodel') + process_params.params_file = os.path.join( + download_path, 'inference.pdiparams') + process_params.label_name_path = os.path.join( + __dir__, '../data/k400/Kinetics-400_label_list.txt') + else: + raise Exception( + 'If you want to use your own model, Please input model_file as model path!' 
+ ) + else: + print('Using user-specified model and params!') + print("process params are as follows: \n{}".format(process_params)) + self.label_name_dict = load_label_name_dict( + process_params.label_name_path) + + self.args = process_params + self.predictor = create_paddle_predictor(process_params) + + def predict(self, video): + """ + predict label of video with paddlevideo + Args: + video:input video for clas, support single video , internet url, folder path containing series of videos + Returns: + list[dict:{videoname: "",class_ids: [], scores: [], label_names: []}],if label name path is None,label names will be empty + """ + video_list = [] + assert isinstance(video, (str)) + + # get input_tensor and output_tensor + input_names = self.predictor.get_input_names() + output_names = self.predictor.get_output_names() + input_tensor_list = [] + output_tensor_list = [] + for item in input_names: + input_tensor_list.append(self.predictor.get_input_handle(item)) + for item in output_names: + output_tensor_list.append(self.predictor.get_output_handle(item)) + + if isinstance(video, str): + # download internet video + if video.startswith('http'): + if not os.path.exists(BASE_VIDEOS_DIR): + os.makedirs(BASE_VIDEOS_DIR) + video_path = os.path.join(BASE_VIDEOS_DIR, 'tmp.mp4') + download_with_progressbar(video, video_path) + print("Current using video from Internet:{}, renamed as: {}". + format(video, video_path)) + video = video_path + files = parse_file_paths(video) + else: + print('Please input legal video!') + + # Inferencing process + InferenceHelper = ppTSM_Inference_helper( + num_seg=self.args.num_seg, + short_size=self.args.short_size, + target_size=self.args.target_size, + top_k=self.args.top_k) + batch_num = self.args.batch_size + for st_idx in range(0, len(files), batch_num): + ed_idx = min(st_idx + batch_num, len(files)) + + # Pre process batched input + batched_inputs = InferenceHelper.preprocess_batch( + files[st_idx:ed_idx]) + + # run inference + for i in range(len(input_tensor_list)): + input_tensor_list[i].copy_from_cpu(batched_inputs[i]) + self.predictor.run() + + batched_outputs = [] + for j in range(len(output_tensor_list)): + batched_outputs.append(output_tensor_list[j].copy_to_cpu()) + + results_list = InferenceHelper.postprocess(batched_outputs, + print_output=False, + return_result=True) + + for res in results_list: + classes = res["topk_class"] + label_names = [] + if len(self.label_name_dict) != 0: + label_names = [self.label_name_dict[c] for c in classes] + res["label_names"] = label_names + + print("Current video file: {0}".format(res["video_id"])) + print("\ttop-{0} classes: {1}".format(len(res["topk_class"]), + res["topk_class"])) + print("\ttop-{0} scores: {1}".format(len(res["topk_scores"]), + res["topk_scores"])) + print("\ttop-{0} label names: {1}".format( + len(res["label_names"]), res["label_names"])) + + +def main(): + # for cmd + args = parse_args(mMain=True) + clas_engine = PaddleVideo(**(args.__dict__)) + clas_engine.predict(args.video_file) + + +if __name__ == '__main__': + main() diff --git a/Bank_second_part/detect_process/tools_function.py b/Bank_second_part/detect_process/tools_function.py new file mode 100644 index 0000000..14eb9a3 --- /dev/null +++ b/Bank_second_part/detect_process/tools_function.py @@ -0,0 +1,353 @@ +import cv2 +import os + + + +# 图像文件夹 +def get_video_list(path): + video_ext = [".mp4", ".avi",".MP4"] + video_names = [] + for maindir, subdir, file_name_list in os.walk(path): + for filename in file_name_list: + apath = 
os.path.join(maindir, filename) + ext = os.path.splitext(apath)[1] + if ext in video_ext: + video_names.append(apath) + return video_names + + +# # 截取裁剪需要的视频帧 +# def save_seg_video(video_name, frameToStart, frametoStop, videoWriter, bbox): +# cap = video_name +# cap.set(cv2.CAP_PROP_POS_FRAMES, frameToStart) # 设置初始帧数 +# count = frameToStart + +# while True: +# success, frame = cap.read() + +# if not success or count > frametoStop: +# break + +# if count >= frameToStart: +# # 裁剪视频画面 +# frame_target = frame[int(bbox[1]):int(bbox[3]), int(bbox[0]):int(bbox[2])] +# videoWriter.write(frame_target) + +# count += 1 + +# cap.release() + + # 截取裁剪需要的视频帧 +def save_seg_video(video_name,frameToStart,frametoStop,videoWriter,bbox): + + cap = cv2.VideoCapture(video_name) + count = 0 + while True: + success,frame = cap.read() + if success: + count += 1 + if count <= frametoStop and count > frameToStart: # 选取起始帧 + # print('correct= ', count) + + #裁剪视频画面 + frame_target = frame[int(bbox[1]):int(bbox[3]), int(bbox[0]):int(bbox[2])] # (split_height, split_width) + frame_target = cv2.resize(frame_target,(200,200)) + + videoWriter.write(frame_target) + + if not success or count >= frametoStop: + break + + videoWriter.release() + cap.release() + + + + +# 获得字典中所有values值(这个值是列表) +def get_dict_values(lst): + """ + 获取列表中所有字典的 values 值(如果值是列表) + + 参数: + lst: 包含字典的列表 + + 返回值: + values: 包含所有字典的 values 值的列表(如果值是列表) + """ + return [value for dictionary in lst for value in dictionary.values() if isinstance(value, list)] + + + # 解析检测后的结果,为检测后的结果排序 +def analysis_sort_list(re_list): + + # print('result_dict:',result_dict) + + # 获得检测列表 + # re_list = result_dict['start_bbox'] + # print('re_list:',re_list) + + # 获得列表中所有字典的values值 + # re_bbox_list = Process_tools.get_dict_values(re_list) + + # 为检测出来的标注框排序 + sorted_lst = sorted(re_list, key=lambda x: x[0]) + + return sorted_lst + + + #对比重叠率高的两个部分,并结合标注框,保存最大的标注框 +def contrast_bbox(e_bbox,r_bbox): + + e_bbox_min = e_bbox[:2] + r_bbox_min = r_bbox[:2] + + bbox_min = [min(x, y) for x, y in zip(e_bbox_min, r_bbox_min)] + + e_bbox_max = e_bbox[-2:] + r_bbox_max = r_bbox[-2:] + + bbox_max = [max(x, y) for x, y in zip(e_bbox_max, r_bbox_max)] + + bbox = bbox_min + bbox_max + + return bbox + + + # 解析result_list列表 +def analysis_re01_list(example_list,result_list): + + ''' + example_dict:对比的参照 + result_dict: 需要与参照对比的结果 + + example_sorted_lst:返回值中,原先有现在没有部分 + re_dict_sorted_lst:返回值中,现在有原先没有部分 + + cut_list:原先有,现在也有的部分 + + ''' + # 第一次检测到目标的帧率和信息 + # example_dict_fps = list(example_dict.keys())[0] + # example_sorted_lst = Process_tools.analysis_sort_list(example_list) + + # 当前帧检测结果中所有的检测结果数值 + # re_dict_fps = list(result_dict.keys())[0] + # re_dict_sorted_lst = Process_tools.analysis_sort_list(result_list) + + # 保存前后帧率连续的范围、筛选出相同的部分 + cut_list = [] + example_temp = [] + re_temp = [] + + for i,ex_bbox_dict in enumerate(example_list): + + ex_bbox = ex_bbox_dict['result'] + + for j,re_bbox in enumerate(result_list): + + iou = calculate_iou(box1=ex_bbox, box2=re_bbox) + + # print(iou) + + if iou > 0.5: + + # bbox = Process_tools.contrast_bbox(e_bbox=ex_bbox,r_bbox=re_bbox) + + # cut_list.append({i:re_bbox}) + cut_list.append(re_bbox) + example_temp.append(ex_bbox) + re_temp.append(re_bbox) + + break + + # print('example_temp:',example_temp) + # print('re_temp:',re_temp) + example_sorted_lst = [item for item in example_list if item['result'] not in example_temp] + re_dict_sorted_lst = [item for item in result_list if item not in re_temp] + + return cut_list,example_sorted_lst,re_dict_sorted_lst + + 
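# Worked example for calculate_iou below (box values hypothetical):
# box1 = [100, 100, 200, 200] and box2 = [150, 150, 250, 250] give an intersection of
# 51 * 51 = 2601 (the +1 keeps pixel coordinates inclusive), areas of 101 * 101 = 10201 each,
# and a union of 17801, so IoU = 2601 / 17801, roughly 0.146. That is below the 0.5
# threshold used in analysis_re01_list, so these two boxes would not be matched as the same target.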
+# 计算前后帧率重叠范围 +def calculate_iou(box1, box2): + """ + 计算两个边界框之间的IoU值 + + 参数: + box1: 边界框1的坐标(x1, y1, x2, y2) + box2: 边界框2的坐标(x1, y1, x2, y2) + + 返回值: + iou: 两个边界框之间的IoU值 + """ + x1 = max(box1[0], box2[0]) + y1 = max(box1[1], box2[1]) + x2 = min(box1[2], box2[2]) + y2 = min(box1[3], box2[3]) + + # 计算交集区域面积 + intersection_area = max(0, x2 - x1 + 1) * max(0, y2 - y1 + 1) + + # 计算边界框1和边界框2的面积 + box1_area = (box1[2] - box1[0] + 1) * (box1[3] - box1[1] + 1) + box2_area = (box2[2] - box2[0] + 1) * (box2[3] - box2[1] + 1) + + # 计算并集区域面积 + union_area = box1_area + box2_area - intersection_area + + # 计算IoU值 + iou = intersection_area / union_area + + return iou + +# 修正坐标参数 +def para_correction(images_size,bbox,dertpara): + + ''' + 修正检测后标注框过小的情况,如果有修正参数则使用修正参数,如果没有就按照坐标值扩大两倍 + + ''' + + # if dertpara: + # pass + # else: + w = (bbox[2] - bbox[0]) / int(dertpara) + h = (bbox[3] - bbox[1]) / int(dertpara) + + bbox_extand_list_x = [bbox[0] - w,bbox[2] + w] + bbox_extand_list_y = [bbox[1] - h,bbox[3] + h] + + bbox_list_x = contrast(size=images_size[0],bbox_extand_list=bbox_extand_list_x) + bbox_list_y = contrast(size=images_size[1],bbox_extand_list=bbox_extand_list_y) + + bbox_list = [bbox_list_x[0],bbox_list_y[0],bbox_list_x[1],bbox_list_y[1]] + + return bbox_list + + +def para_list_correction(images_size,bbox_list,dertpara): + + updata_result_list = [] + + for bbox in bbox_list: + + updata_bbox = para_correction(images_size,bbox,dertpara) + + updata_result_list.append(updata_bbox) + + return updata_result_list + +# 对比数值是否在这个范围内 +def contrast(size,bbox_extand_list): + + ''' + 对比数值是否在这个范围内 + ''' + + # print('bbox_extand_list:',bbox_extand_list) + # print('size:',size) + bbox_list = [] + + for x in bbox_extand_list: + + # print('size:',size) + + if 0 <= int(x) <= int(size): + # print('in:',x,size) + bbox_list.append(x) + if int(x) > int(size): + # print('>:',x,size) + bbox_list.append(size) + if int(x) < 0: + # print('<:',x,size) + bbox_list.append(0) + + # print('bbox_list:',bbox_list) + + return bbox_list + + +def change_list_dict(fps1,re_list): + + ''' + 给列表的结果设置对应帧率 + ''' + + bbox_list_all = [] + + for bbox_list in re_list: + + bbox_dict = {'fps':fps1,'result':bbox_list} + bbox_list_all.append(bbox_dict) + + return bbox_list_all + + +def statistics_fps(fps_now,re_list,parameter): + + ''' + 统计时长,返回时间满足截取要求的目标坐标 + ''' + + time_out_list = [] + + for bbox_dict in re_list: + + con_fps = int(fps_now) - int(bbox_dict["fps"]) + + if con_fps > parameter: + + time_out_list.append(bbox_dict) + + return time_out_list + + +def change_dict_list(dict_list): + ''' + 从字典列表得到bbox列表 + ''' + + bbox_list = [] + + for dicts1 in dict_list: + + bbox_list.append(dicts1['result']) + + return bbox_list + + +def select_list(result_list): + + ''' + 筛选列表中的空列表 + ''' + if result_list: + result_only = [] + + for result in result_list: + + if result == None : + pass + else: + + # result_bbox = select_bbox(result) + result_only.append(result) + + return result_only + +def select_bbox(bbox_list): + + # bbox_list_return = [] + + # print('bbox:',bbox_list) + left_top = [min(bbox_list, key=lambda p: p[0])[0], min(bbox_list, key=lambda p: p[1])[1]] + right_bottom = [max(bbox_list, key=lambda p: p[0])[0], max(bbox_list, key=lambda p: p[1])[1]] + + bbox_list_return = left_top + right_bottom + + + # print('bbox_list:',bbox_list_return) + + return bbox_list_return + \ No newline at end of file diff --git a/Bank_second_part/detect_process/video_process.py b/Bank_second_part/detect_process/video_process.py index c928a68..5c6ced6 100644 --- 
diff --git a/Bank_second_part/detect_process/video_process.py b/Bank_second_part/detect_process/video_process.py
index c928a68..5c6ced6 100644
--- a/Bank_second_part/detect_process/video_process.py
+++ b/Bank_second_part/detect_process/video_process.py
@@ -1,4 +1,3 @@
-import numpy as np
 import cv2
 import os
 import time
@@ -11,8 +10,11 @@ import threading
 from config import Q_SZ
 from personDet import analysis_yolov8
-import tools
+import tools_function
 from holisticDet import MediapipeProcess
+import mediapipe_detection_image
+from PP_TSMv2_infer import PP_TSMv2_predict
+import shutil
@@ -31,24 +33,25 @@ class DealVideo():
         self.person_model = person_model
         self.mediapipe_model = mediapipe_model
-        self.pptsmv2_model = pptsmv2_model
+        self.predictor = pptsmv2_model[1]
+        self.infer = pptsmv2_model[0]
+        self.batch_size = 1
 
-        # 图片检测后队列
+        # Work queues
         self.videoQueue = queue.Queue(maxsize=Q_SZ)
-        self.frameQueue = queue.Queue(maxsize=0)
+        self.videoQueue2 = queue.Queue(maxsize=Q_SZ)
         self.cutbboxQueue = queue.Queue(maxsize=0)
-        self.videoframeQueue = queue.Queue(maxsize=0)
-        self.videohandsQueue = queue.Queue(maxsize=0)
-        self.videoheadQueue = queue.Queue(maxsize=0)
-        self.videopersonQueue = queue.Queue(maxsize=0)
+        self.videodetQueue = queue.Queue(maxsize=0)
+        self.videoQueue3 = queue.Queue(maxsize=0)
 
         #线程
         self.get_video_listThread = threading.Thread(target=self.get_video_list)
         self.get_video_frameThread = threading.Thread(target=self.get_video_frame)
-        self.person_detThread = threading.Thread(target=self.person_det)
         self.write_videoThread = threading.Thread(target=self.write_video)
-        self.select_video_frameThread = threading.Thread(target=self.select_video_frame)
         self.head_hands_detThread = threading.Thread(target=self.head_hands_det)
+        self.video_select_dectThread = threading.Thread(target=self.video_select_dect)
+        self.select_video_pathThread = threading.Thread(target=self.select_video_path)
+
 
     def get_video_list(self):
@@ -70,173 +73,268 @@ class DealVideo():
         else:
             self.videoQueue.put(self.video_file)
 
-    # def cut_video_seg(self):
-
-    #     pass
 
     def get_video_frame(self):
         '''
         对视频进行分帧、每一帧都保存队列
         '''
         while True:
-            if ~self.videoQueue.empty():
-
-                try:
-                    video_path = self.videoQueue.get()
-
-                    # video_basename = os.path.basename(video_path).split('.')[0]
+            if self.videoQueue.empty():
-                    cap = cv2.VideoCapture(video_path)
+                time.sleep(1)
+
+            else:
+
+                t1 = time.time()
+                video_path = self.videoQueue.get()
+
+                # video_basename = os.path.basename(video_path).split('.')[0]
-                    frame_list = []
-                    count_fps = 0
+                print('video_path:', video_path)
-                    while cap.isOpened():
-                        success, frame = cap.read()
-                        if not success:
-                            print(video_path,"Ignoring empty camera frame.")
-                            break
-                        count_fps += 1
-                        # print('count_fps_read_video=',count_fps)
+                cap = cv2.VideoCapture(video_path)
+                video_fps = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-                        frame_dict = {'fps':count_fps,'frame':frame}
-                        frame_list.append(frame_dict)
-
+                # frame_list = []
+                count_fps = 0
+                frame_result_contact = []
+                count_fps_del = 0
-                    video_dict = {'video_path':video_path,'frame_list':frame_list}
-
-                    self.frameQueue.put(video_dict)
-                    # time.sleep(30)
+                while cap.isOpened():
+                    success, frame = cap.read()
+                    if not success:
+                        print(video_path, "Ignoring empty camera frame.")
+                        print('video_fps:', video_fps, 'count_fps:', count_fps)
+                        break
+
+                    # print('count_fps_read_video=',count_fps)
+                    imgsize = frame.shape
-                except Exception as e:
-                    print(e)
+                    # Person detection on the current frame
+                    person_det = analysis_yolov8(frame=frame,
+                                                 model_coco=self.person_model,
+                                                 confidence_set=0.5)
+                    person_list = tools_function.get_dict_values(person_det)
-    def person_det(self):
+                    if frame_result_contact:
+                        start_fps = frame_result_contact[0]['fps']
+                    else:
+                        start_fps = count_fps
-        '''
-        从队列中获取视频帧frame,进行第一步人员的检测
-        '''
+                    # Is this the last frame of the video?
+                    if count_fps == (video_fps - 1):
-        while True:
+                        video_end = True
-            if ~self.videoframeQueue.empty():
+                    else:
-                video_frame_dict = self.videoframeQueue.get()
+                        video_end = False
-                frame_list = video_frame_dict['frame_list']
-                video_path = video_frame_dict['video_path']
+                    if person_list:
-                frame_result_contact = []
+                        count_fps_del_re,updata_result_contact = self.analysis_by_bbox(imgsize=imgsize,
+                                                                                       detect_result=person_list,
+                                                                                       dertpara=10,
+                                                                                       start_fps=start_fps,
+                                                                                       now_fps=count_fps,
+                                                                                       label_name='person',
+                                                                                       video_path=video_path,
+                                                                                       frame_result_contact=frame_result_contact,
+                                                                                       parameter_fps=200,
+                                                                                       count_fps_del=count_fps_del,
+                                                                                       video_end=video_end
+                                                                                       )
+                        count_fps_del = count_fps_del_re
+                        frame_result_contact = updata_result_contact
-                for i in range(len(frame_list)):
+                    count_fps += 1
-                    if frame_list[i]["fps"] == i + 1:
-                        person_det = analysis_yolov8(frame=frame_list[i]['frame'],
-                                                    model_coco=self.person_model,
-                                                    confidence_set=0.5)
-
-                        # 当前帧检测的结果列表,只包含bboxlist
-                        person_list = tools.get_dict_values(person_det)
-                        label_name = list(person_det[0].keys())[0]
+    def head_hands_det(self):
-                        update_frame_result_contact = self.get_cut_message(fps1=frame_list[i]["fps"],
-                                                                           label_name = label_name,
-                                                                           re_list=person_list,
-                                                                           video_path=video_path,
-                                                                           frame_result_contact=frame_result_contact)
-
-                        frame_result_contact = update_frame_result_contact
-                        # print('frame_result_contact:',frame_result_contact)
+        while True:
-    def head_hands_det(self):
+            if self.videoQueue3.empty():
-        while True:
+                time.sleep(1)
+            else:
-            if ~self.videopersonQueue.empty():
+                t0 = time.time()
+                video_path = self.videoQueue3.get()
-                person_frame_dict = self.videopersonQueue.get()
+                print('video_path_head_hands_det:', video_path)
-                person_frame_list = person_frame_dict['frame_list']
-                video_path = person_frame_dict['video_path']
+                cap = cv2.VideoCapture(video_path)
+                video_fps = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+
+                # frame_list = []
+                count_fps = 0
                 head_result_contact = []
                 hands_result_contact = []
+                count_fps_del_head = 0
+                count_fps_del_hand = 0
-                for i in range(len(person_frame_list)):
+                while cap.isOpened():
+                    success, frame = cap.read()
+                    if not success:
+                        print(video_path, "Ignoring empty camera frame.")
+                        print('count_fps:', count_fps, 'video_fps:', video_fps)
+                        break
-                    if person_frame_list[i]["fps"] == i + 1:
+                    # print('count_fps_read_video=',count_fps)
+                    imgsize = frame.shape
-                        image = person_frame_list[i]["frame"]
+                    # MediaPipe holistic inference
+                    hh_result = MediapipeProcess.mediapipe_det(image=frame,
+                                                               holistic=self.mediapipe_model)
+                    hh_result_dict = MediapipeProcess.get_analysis_result(image=frame,results=hh_result)
-                        imgsize = image.shape
+                    # Current head and hand boxes (None entries filtered out)
+                    head_result = hh_result_dict['face_bbox']
+                    head_result_1 = tools_function.select_list(head_result)
+                    hands_result = hh_result_dict['hand_bbox']
+                    hands_result_1 = tools_function.select_list(hands_result)
-                        # print(type(image))
+
+                    if count_fps == (video_fps - 1):
+
+                        print('count_fps:', count_fps, 'video_fps:', video_fps)
+
+                        video_end = True
+                    else:
+
+                        video_end = False
+
+                    # Pad/clamp the boxes, then analyse heads and hands separately
+                    if head_result_1:
+
+                        if head_result_contact:
+                            start_fps = head_result_contact[0]['fps']
+                        else:
+                            start_fps = count_fps
+                        count_fps_del_re,updata_result_contact = self.analysis_by_bbox(imgsize=imgsize,
+                                                                                       detect_result=head_result_1,
+                                                                                       dertpara=1,
+                                                                                       start_fps=start_fps,
+                                                                                       now_fps=count_fps,
+                                                                                       label_name='head',
+                                                                                       video_path=video_path,
+                                                                                       frame_result_contact=head_result_contact,
+                                                                                       parameter_fps=50,
+                                                                                       count_fps_del=count_fps_del_head,
+                                                                                       video_end=video_end
+                                                                                       )
+                        count_fps_del_head = count_fps_del_re
+                        head_result_contact = updata_result_contact
+
+                    if hands_result_1:
+
+                        if hands_result_contact:
+                            start_fps = hands_result_contact[0]['fps']
+                        else:
+                            start_fps = count_fps
+
+                        count_fps_del_re,updata_result_contact = self.analysis_by_bbox(imgsize=imgsize,
+                                                                                       detect_result=hands_result_1,
+                                                                                       dertpara=2,
+                                                                                       start_fps=start_fps,
+                                                                                       now_fps=count_fps,
+                                                                                       label_name='hands',
+                                                                                       video_path=video_path,
+                                                                                       frame_result_contact=hands_result_contact,
+                                                                                       parameter_fps=50,
+                                                                                       count_fps_del=count_fps_del_hand,
+                                                                                       video_end=video_end
+                                                                                       )
+                        count_fps_del_hand = count_fps_del_re
+                        hands_result_contact = updata_result_contact
+
+                    count_fps += 1
+
+
+    def video_select_dect(self):
-                        # 模型推理
-                        hh_result = MediapipeProcess.mediapipe_det(image=image,
-                                                                   holistic=self.mediapipe_model)
-                        hh_result_dict = MediapipeProcess.get_analysis_result(image=image,results=hh_result)
+        while True:
+            if self.videodetQueue.empty():
+                time.sleep(5)
+            else:
-
-                        # 获得当前坐标列表
-                        head_result = hh_result_dict['face_bbox']
-                        head_result_1 = tools.select_list(head_result)
-                        hands_result = hh_result_dict['hand_bbox']
-                        hands_result_1 = tools.select_list(hands_result)
-
-                        print('head_result_1:',head_result_1)
-                        print('head_result_1:',hands_result_1)
-
-                        # 统一修正坐标,分别对头和手进行分析
-                        if head_result_1:
-                            head_bbox_list = tools.para_list_correction(images_size=imgsize,
-                                                                        bbox_list=head_result_1,
-                                                                        dertpara=[])
-
-
-                            update_head_result_contact = self.get_cut_message(fps1=person_frame_list[i]["fps"],
-                                                                              label_name = 'head',
-                                                                              re_list=head_bbox_list,
-                                                                              video_path=video_path,
-                                                                              frame_result_contact=head_result_contact)
-                            head_result_contact = update_head_result_contact
-
-
-                        if hands_result_1:
-
-                            hands_bbox_list = tools.para_list_correction(images_size=imgsize,
-                                                                         bbox_list=hands_result_1,
-                                                                         dertpara=[])
-
-                            update_hands_result_contact = self.get_cut_message(fps1=person_frame_list[i]["fps"],
-                                                                               label_name = 'hands',
-                                                                               re_list=hands_bbox_list,
-                                                                               video_path=video_path,
-                                                                               frame_result_contact=hands_result_contact)
-
-                            hands_result_contact = update_hands_result_contact
+                video_path = self.videodetQueue.get()
-                        # print("head_result_contact:",head_result_contact)
-                        # print("hands_result_contact:",hands_result_contact)
+                try:
+                    # Behaviour classification of the cut clip with PP-TSMv2
+                    result_list = PP_TSMv2_predict().predict(input_f=video_path,
+                                                             batch_size=self.batch_size,
+                                                             predictor=self.predictor,
+                                                             InferenceHelper=self.infer)
+
+                    video_base_name = os.path.basename(video_path)
+                    video_save_select_path = self.video_save_file + '/' + 'video_select_dect/' + str(result_list['topk_class'])
+                    os.makedirs(video_save_select_path, exist_ok=True)
+                    video_save = os.path.join(video_save_select_path, video_base_name)
-    def get_cut_message(self,fps1,label_name,re_list,video_path,frame_result_contact):
+                    # Move the clip into the directory of its predicted class
+                    os.rename(video_path, video_save)
+
+                    print("result_list_video_select_dect:", result_list)
+
+                except Exception as e:
+                    print(e)
+
+    def analysis_by_bbox(self,imgsize,detect_result,dertpara,start_fps,now_fps,label_name,video_path,frame_result_contact,parameter_fps,count_fps_del,video_end):
+        '''
+        Pad/clamp the detected boxes, then update the tracking ("contact") list.
+
+        imgsize: image size (frame.shape)
+        detect_result: detections of the current frame, as a list of bboxes
+        dertpara: box-padding divisor (integer, e.g. 2 or 3); each side is padded by width/dertpara resp. height/dertpara
+        start_fps: first frame of the tracked target in the contact list
+        now_fps: current frame index
+        label_name: detection class being analysed
+        video_path: path of the source video
+        frame_result_contact: contact list of the targets currently tracked
+        parameter_fps: frame-count threshold after which a clip is cut
+        count_fps_del: number of consecutive frames a target has gone undetected
+        '''
+
+        bbox_list = tools_function.para_list_correction(images_size=imgsize,
+                                                        bbox_list=detect_result,
+                                                        dertpara=dertpara)
+
+        count_fps_del_re,update_frame_result_contact = self.get_cut_message(fps1=now_fps,
+                                                                            label_name = label_name,
+                                                                            re_list=bbox_list,
+                                                                            video_path=video_path,
+                                                                            frame_result_contact=frame_result_contact,
+                                                                            parameter_fps=parameter_fps,
+                                                                            count_fps_del=count_fps_del,
+                                                                            video_end=video_end)
+
+        # count_fps_del_re,updata_result_contact = self.get_continue_keys(count_fps_del=count_fps_del,
+        #                                                                 continue_para=continue_para,
+        #                                                                 start_fps=start_fps,
+        #                                                                 now_fps=now_fps,
+        #                                                                 frame_result_contact=frame_result_contact,
+        #                                                                 update_frame_result_contact=update_frame_result_contact)\
+
+        return count_fps_del_re,update_frame_result_contact
+
+
+    def get_cut_message(self,fps1,label_name,re_list,video_path,frame_result_contact,parameter_fps,count_fps_del,video_end):
+
+        # continue_para = False
         if not frame_result_contact:
-            bbox_list_all = tools.change_list_dict(fps1=fps1,re_list=re_list)
+            bbox_list_all = tools_function.change_list_dict(fps1=fps1,re_list=re_list)
             frame_result_contact = bbox_list_all
             # print("frame_result_contact:",frame_result_contact)
@@ -244,41 +342,45 @@ class DealVideo():
         else:
             example_dict_list = frame_result_contact
-            print('example_dict_list:',example_dict_list)
-            print('re_list:',re_list)
-            cut_list,example_lst,re_dict_lst = tools.analysis_re01_list(example_list=example_dict_list,
+            cut_list,example_lst,re_dict_lst = tools_function.analysis_re01_list(example_list=example_dict_list,
                                                                                  result_list=re_list)
-
-            # print('cut_list:',cut_list)
-            # print('example_sorted_lst:',example_lst)
-            # print('re_dict_sorted_lst:',re_dict_lst)
-
-            # 有目标减少情况
             if example_lst:
-
+                # A tracked target disappeared: prepare a clip for saving
+                # continue_para = True
                 cut_dict = {'video_path':video_path,'label_name':label_name,"stop_fps":fps1,'bbox_list':example_lst}
+
+                start_fps = example_lst[0]['fps']
-                # 添加到新的队列
-                self.cutbboxQueue.put(cut_dict)
+                # Tolerate a few missed detections before cutting
+                if count_fps_del <= 3:
+
+                    frame_result_contact = frame_result_contact
+                    count_fps_del = count_fps_del + 1
+
+                else:
-                frame_result_contact = [item for item in frame_result_contact if item not in example_lst]
+                    # Ignore segments shorter than 10 frames
+                    if (fps1 - start_fps) < 10:
+
+                        frame_result_contact = frame_result_contact
+                    else:
+
+                        frame_result_contact = [item for item in frame_result_contact if item not in example_lst]
+                        self.cutbboxQueue.put(cut_dict)
-            # 有新添加目标情况
+            # Newly appeared targets
            if re_dict_lst:
                 # 对比示例列表更新
-                update_list = tools.change_list_dict(fps1=fps1,re_list=re_dict_lst)
+                update_list = tools_function.change_list_dict(fps1=fps1,re_list=re_dict_lst)
                 frame_result_contact = frame_result_contact + update_list
             # 统计截止时间
-            time_out_list = tools.statistics_fps(fps_now=fps1,re_list=frame_result_contact,parameter=20)
+            time_out_list = tools_function.statistics_fps(fps_now=fps1,re_list=frame_result_contact,parameter=parameter_fps)
-
             if time_out_list:
 
                 # 裁剪保存视频
@@ -291,143 +393,183 @@ class DealVideo():
 
                 # 对比示例列表更新
                 frame_result_contact = [item for item in frame_result_contact if item not in time_out_list]
+
+            if video_end:
+
+                # End of the video: flush everything still being tracked
+                cut_dict = {'video_path':video_path,'label_name':label_name,"stop_fps":fps1,'bbox_list':frame_result_contact}
+
+                self.cutbboxQueue.put(cut_dict)
+
+                frame_result_contact.clear()
             # print('frame_result_contact:',frame_result_contact)
-
-        return frame_result_contact
+
+        return count_fps_del,frame_result_contact
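The cut-off in get_cut_message relies on tools_function.statistics_fps: a target is flushed once the current frame is more than parameter_fps frames past the frame at which it was first recorded (200 for 'person', 50 for 'head'/'hands'). A minimal sketch with made-up numbers, not part of the patch:

    contact = [{'fps': 10, 'result': [0, 0, 100, 100]}]          # target first seen at frame 10
    timed_out = statistics_fps(fps_now=215, re_list=contact, parameter=200)
    # 215 - 10 = 205 > 200, so the entry is returned and a clip will be cut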
+
+    # def get_continue_keys(self,count_fps_del,continue_para,start_fps,now_fps,frame_result_contact,update_frame_result_contact):
+
+    #     # Check whether the target was only momentarily undetected
+    #     if continue_para:
+
+    #         dert_fps = now_fps - start_fps
+
+    #         print('dert_fps:',dert_fps)
+
+    #         if dert_fps <= 20:
+
+    #             count_fps_del = count_fps_del + 1
+
+    #             if count_fps_del <= 3:
+
+    #                 frame_result_contact = frame_result_contact
+
+    #             else:
+
+    #                 frame_result_contact = update_frame_result_contact
+    #                 count_fps_del = 0
+
+    #         else:
+    #             count_fps_del = 0
+
+    #     else:
+
+    #         frame_result_contact = update_frame_result_contact
+
+    #     return count_fps_del,frame_result_contact
+
     def write_video(self):
         '''
         保存成视频
         '''
         while True:
-
-            if ~self.cutbboxQueue.empty():
-
+            if self.cutbboxQueue.empty():
+                time.sleep(2)
+            else:
                 video_frame_dict = self.cutbboxQueue.get()
-
-                # print('video_frame_dict:',video_frame_dict)
-
-                # 视频路径
                 video_path = video_frame_dict['video_path']
                 video_basename = os.path.basename(video_path).split('.')[0]
                 file_name = video_frame_dict['label_name']
-                # video_name_save = os.path.join(self.video_save_file, video_basename)
-
-                # 原视频帧率和尺寸
                 cap = cv2.VideoCapture(video_path)
                 fps = cap.get(cv2.CAP_PROP_FPS)
+                video_fps = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-                # 获得起始帧
+                print(video_path, 'fps:', fps, 'video_fps:', video_fps)
+                # Stop frame for this cut
                 stop_fps = video_frame_dict['stop_fps']
-                # 裁剪信息
                 result_list = video_frame_dict['bbox_list']
+                if cap.isOpened():
-                for i,bbox_dict in enumerate(result_list):
-
-                    start_fps = bbox_dict['fps']
-                    bbox_list = bbox_dict['result']
-
-                    w = int(bbox_list[2]) - int(bbox_list[0])
-                    h = int(bbox_list[3]) - int(bbox_list[1])
+                    for i,bbox_dict in enumerate(result_list):
+                        start_fps = bbox_dict['fps']
-                    size = (w,h)
+                        if start_fps >= stop_fps:
-                    # 根据标签保存不同视频分类
-                    video_name_save = video_basename + '_' + str(start_fps) + '_' + str(stop_fps) + '_' + str(i) + '.avi'
-                    video_save_file = self.video_save_file + '/' + file_name
-                    os.makedirs(video_save_file, exist_ok=True)
-                    video_save_path = os.path.join(video_save_file, video_name_save)
-
-                    videoWriter =cv2.VideoWriter(video_save_path,cv2.VideoWriter_fourcc('X','V','I','D'),fps,size)
+                            print('start_fps:', start_fps, 'stop_fps:', stop_fps)
+                            break
-                    tools.save_seg_video(video_name=video_path,
-                                         frameToStart=start_fps,
-                                         frametoStop=stop_fps,
-                                         videoWriter=videoWriter,
-                                         bbox=bbox_list)
-
-                    videoWriter.release()
+                        else:
+                            bbox_list = bbox_dict['result']
+                            # w = int(bbox_list[2]) - int(bbox_list[0])
+                            # h = int(bbox_list[3]) - int(bbox_list[1])
+                            size = (200,200)
+                            # Save clips into one directory per label
+                            video_name_save = video_basename + '_' + str(start_fps) + '_' + str(stop_fps) + '_' + str(i) + '.avi'
+                            video_save_file = self.video_save_file + '/' + file_name
+                            os.makedirs(video_save_file, exist_ok=True)
+                            video_save_path = os.path.join(video_save_file, video_name_save)
+                            videoWriter = cv2.VideoWriter(video_save_path,cv2.VideoWriter_fourcc('X','V','I','D'),fps,size)
+
+                            tools_function.save_seg_video(video_name=video_path,
+                                                          frameToStart=start_fps,
+                                                          frametoStop=stop_fps,
+                                                          videoWriter=videoWriter,
+                                                          bbox=bbox_list)
+                            videoWriter.release()
+                            self.videoQueue2.put(video_save_path)
+
+                    cap.release()
+
+                else:
+                    print(video_path)
+                    break
-                self.videoQueue.put(video_save_path)
-                cap.release()
 
-    def select_video_frame(self):
+    def select_video_path(self):
         while True:
-            if ~self.frameQueue.empty():
-
-                video_dict = self.frameQueue.get()
-                video_path = video_dict["video_path"]
+            if self.videoQueue2.empty():
+                time.sleep(5)
+            else:
+                video_path = self.videoQueue2.get()
                 directory = os.path.dirname(video_path)
                 labels = directory.split('/')[-1]
-                print('labels:',labels)
-
-                if labels == 'person':
+                print('select_video_path:', video_path)
-                    self.videopersonQueue.put(video_dict)
+                # print(labels)
-                if labels == 'head':
-
-                    # print('youshou')
+                if labels == 'person':
-                    self.videoheadQueue.put(video_dict)
-
-                if labels == 'hands':
+                    # Person clips go back through head/hand detection
+                    self.videoQueue3.put(video_path)
-                    # print('youshou')
+                if labels == 'head' or labels == 'hands':
-                    self.videohandsQueue.put(video_dict)
+                    # Head/hand clips go to PP-TSMv2 behaviour classification
+                    self.videodetQueue.put(video_path)
                 else:
-
-                    self.videoframeQueue.put(video_dict)
-
+                    pass
 
     def run(self):
         self.get_video_listThread.start()
         self.get_video_frameThread.start()
-        self.person_detThread.start()
         self.write_videoThread.start()
-        self.select_video_frameThread.start()
+        # self.write_videoThread.join()
         self.head_hands_detThread.start()
+        self.video_select_dectThread.start()
+        self.select_video_pathThread.start()
 
 
 if __name__ == '__main__':
-
-
-
-    video = "E:/Bank_files/Bank_02/dataset/video_test/test03_3.avi"
+    t1 = time.time()
+    video = "E:/Bank_files/Bank_02/dataset/video_test/1min/0711-7_4.avi"
     video_save = 'test_video'
-    person_model = YOLO("model_file/yolov8x_person.pt")
+    # Initialise the person detector
+    person_model = YOLO("model_file/yolov8n.pt")
+
+    # Initialise the PP-TSMv2 inference model
+    config = 'model_file/inference/pptsm_lcnet_k400_16frames_uniform.yaml'  # config file path
+    model_file = 'model_file/inference/ppTSMv2.pdmodel'  # inference model path
+    params_file = 'model_file/inference/ppTSMv2.pdiparams'
+    # batch_size= 1
+    infer,predictor = PP_TSMv2_predict().create_inference_model(config,model_file,params_file)
+    # PP_TSMv2_predict().predict(config,input_file,batch_size,predictor,infer)
+
     # 初始化mediapipe
     mp_holistic = mp.solutions.holistic
     holistic = mp_holistic.Holistic(
         min_detection_confidence=0.5,
         min_tracking_confidence=0.5)
 
     # get_seg_video(video_file=video,video_save_path=video_save,dertTime=dertTime)
-    deal = DealVideo(video_file=video,video_save_file=video_save,person_model=person_model,mediapipe_model=holistic,pptsmv2_model='model_file/yolov8x_person.pt')
-    deal.run()
-
-
+    deal = DealVideo(video_file=video,video_save_file=video_save,person_model=person_model,mediapipe_model=holistic,pptsmv2_model=[infer,predictor])
+    deal.run()
+    t2 = time.time()
-
-
-
+    # print('total time:', t2 - t1)
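The same predictor pair can also be exercised on a single clip outside the threaded pipeline, which is handy when checking the classifier in isolation. A minimal sketch, assuming infer and predictor were built as in the __main__ block above; the clip path is a hypothetical write_video output:

    clip = 'test_video/person/0711-7_4_0_120_0.avi'   # hypothetical clip produced by write_video
    result = PP_TSMv2_predict().predict(input_f=clip,
                                        batch_size=1,
                                        predictor=predictor,
                                        InferenceHelper=infer)
    print(result['topk_class'])   # predicted class index used for routing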